cobweb 1.0.4 → 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +1 -1
- data/lib/cobweb.rb +7 -7
- data/lib/cobweb_version.rb +1 -1
- data/spec/cobweb/cobweb_crawl_spec.rb +2 -2
- data/spec/cobweb/cobweb_job_spec.rb +2 -0
- data/spec/cobweb/cobweb_spec.rb +11 -24
- data/spec/spec_helper.rb +6 -5
- metadata +80 -25
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -139,21 +139,20 @@ class Cobweb
|
|
139
139
|
@http.read_timeout = @options[:timeout].to_i
|
140
140
|
@http.open_timeout = @options[:timeout].to_i
|
141
141
|
begin
|
142
|
-
|
142
|
+
puts "Retrieving #{url }... " unless @options[:quiet]
|
143
143
|
request_options={}
|
144
144
|
request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
|
145
145
|
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
146
146
|
|
147
147
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
148
|
-
|
149
148
|
response = @http.request request
|
150
|
-
|
149
|
+
|
151
150
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
152
151
|
puts "redirected... " unless @options[:quiet]
|
153
|
-
|
152
|
+
|
154
153
|
# get location to redirect to
|
155
154
|
uri = UriHelper.join_no_fragment(uri, response['location'])
|
156
|
-
|
155
|
+
|
157
156
|
# decrement redirect limit
|
158
157
|
redirect_limit = redirect_limit - 1
|
159
158
|
|
@@ -162,9 +161,10 @@ class Cobweb
|
|
162
161
|
|
163
162
|
# get the content from redirect location
|
164
163
|
content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
165
|
-
|
166
|
-
content[:redirect_through] = [] if content[:redirect_through].nil?
|
164
|
+
|
165
|
+
content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
|
167
166
|
content[:redirect_through].insert(0, url)
|
167
|
+
content[:url] = content[:redirect_through].last
|
168
168
|
|
169
169
|
content[:response_time] = Time.now.to_f - request_time
|
170
170
|
else
|
data/lib/cobweb_version.rb
CHANGED
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
3
3
|
describe CobwebModule::Crawl, :local_only => true do
|
4
4
|
|
5
5
|
before(:each) do
|
6
|
-
@local_redis = {:host => "
|
7
|
-
@remote_redis = {:host => "
|
6
|
+
@local_redis = {:host => "localhost", :port => 6379}
|
7
|
+
@remote_redis = {:host => "remote-redis", :port => 6379}
|
8
8
|
|
9
9
|
@request = {:crawl_id => "test_crawl_id"}
|
10
10
|
end
|
@@ -9,6 +9,8 @@ describe Cobweb, :local_only => true, :disabled => true do
|
|
9
9
|
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
10
10
|
puts "Starting Workers... Please Wait..."
|
11
11
|
`mkdir log`
|
12
|
+
`mkdir tmp`
|
13
|
+
`mkdir tmp/pids`
|
12
14
|
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
|
13
15
|
puts "Workers Started."
|
14
16
|
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -90,34 +90,21 @@ describe Cobweb do
|
|
90
90
|
before(:each) do
|
91
91
|
@base_url = "http://redirect-me.com/redirect.html"
|
92
92
|
@cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
93
|
-
|
94
|
-
@mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
|
95
|
-
@mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
|
96
|
-
@mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
|
97
|
-
|
98
93
|
end
|
99
94
|
|
100
|
-
it "should
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
#content[:url].should == "http://redirect-me.com/redirect.html"
|
109
|
-
#content[:redirect_through].length.should == 2
|
110
|
-
#content[:mime_type].should == "text/html"
|
111
|
-
#content[:body].should == "asdf"
|
95
|
+
it "should return final page from redirects" do
|
96
|
+
content = @cobweb.get(@base_url)
|
97
|
+
content.should be_an_instance_of Hash
|
98
|
+
content[:url].should == "http://redirected-to.com/redirected.html"
|
99
|
+
content[:mime_type].should == "text/html"
|
100
|
+
content[:body].should == "asdf"
|
101
|
+
end
|
102
|
+
it "should return the path followed" do
|
112
103
|
|
113
|
-
|
114
|
-
|
115
|
-
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
116
|
-
#
|
117
|
-
#content = @cobweb.get(@base_url)
|
118
|
-
#content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
104
|
+
content = @cobweb.get(@base_url)
|
105
|
+
content[:redirect_through].should == ["http://redirect-me.com/redirect.html", "http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
119
106
|
|
120
|
-
|
107
|
+
end
|
121
108
|
it "should not follow with redirect disabled" do
|
122
109
|
@cobweb = Cobweb.new(:follow_redirects => false, :cache => 3)
|
123
110
|
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
data/spec/spec_helper.rb
CHANGED
@@ -9,7 +9,7 @@ APP_ROOT = File.expand_path(File.dirname(__FILE__) + '/../')
|
|
9
9
|
|
10
10
|
RSpec.configure do |config|
|
11
11
|
|
12
|
-
|
12
|
+
if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
|
13
13
|
config.filter_run_excluding :local_only => true
|
14
14
|
end
|
15
15
|
|
@@ -63,9 +63,10 @@ RSpec.configure do |config|
|
|
63
63
|
|
64
64
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
65
65
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
66
|
-
Net::HTTP::Get.stub!(:new).with("/redirect.html",
|
67
|
-
Net::HTTP::Get.stub!(:new).with("/robots.txt",
|
68
|
-
Net::HTTP::Get.stub!(:new).with("/redirect2.html",
|
66
|
+
Net::HTTP::Get.stub!(:new).with("/redirect.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request)
|
67
|
+
Net::HTTP::Get.stub!(:new).with("/robots.txt", an_instance_of(Hash)).and_return(@mock_http_robot_request)
|
68
|
+
Net::HTTP::Get.stub!(:new).with("/redirect2.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request2)
|
69
|
+
Net::HTTP::Get.stub!(:new).with("/redirected.html", an_instance_of(Hash)).and_return(@mock_http_request)
|
69
70
|
|
70
71
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
71
72
|
|
@@ -77,7 +78,7 @@ RSpec.configure do |config|
|
|
77
78
|
@mock_http_client.stub!(:open_timeout=).and_return(nil)
|
78
79
|
@mock_http_client.stub!(:start).and_return(@mock_http_response)
|
79
80
|
@mock_http_client.stub!(:address).and_return("www.baseurl.com")
|
80
|
-
@mock_http_client.stub!(:port).and_return("80
|
81
|
+
@mock_http_client.stub!(:port).and_return("80")
|
81
82
|
|
82
83
|
@mock_http_robot_response.stub!(:code).and_return(200)
|
83
84
|
@mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: redis
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :runtime
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: nokogiri
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ! '>='
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: '0'
|
44
54
|
type: :runtime
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: addressable
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ! '>='
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: '0'
|
55
70
|
type: :runtime
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: rspec
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ! '>='
|
@@ -65,10 +85,15 @@ dependencies:
|
|
65
85
|
version: '0'
|
66
86
|
type: :runtime
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: awesome_print
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
72
97
|
none: false
|
73
98
|
requirements:
|
74
99
|
- - ! '>='
|
@@ -76,10 +101,15 @@ dependencies:
|
|
76
101
|
version: '0'
|
77
102
|
type: :runtime
|
78
103
|
prerelease: false
|
79
|
-
version_requirements:
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
80
110
|
- !ruby/object:Gem::Dependency
|
81
111
|
name: sinatra
|
82
|
-
requirement:
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
83
113
|
none: false
|
84
114
|
requirements:
|
85
115
|
- - ! '>='
|
@@ -87,10 +117,15 @@ dependencies:
|
|
87
117
|
version: '0'
|
88
118
|
type: :runtime
|
89
119
|
prerelease: false
|
90
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
91
126
|
- !ruby/object:Gem::Dependency
|
92
127
|
name: thin
|
93
|
-
requirement:
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
94
129
|
none: false
|
95
130
|
requirements:
|
96
131
|
- - ! '>='
|
@@ -98,10 +133,15 @@ dependencies:
|
|
98
133
|
version: '0'
|
99
134
|
type: :runtime
|
100
135
|
prerelease: false
|
101
|
-
version_requirements:
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
102
142
|
- !ruby/object:Gem::Dependency
|
103
143
|
name: haml
|
104
|
-
requirement:
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
105
145
|
none: false
|
106
146
|
requirements:
|
107
147
|
- - ! '>='
|
@@ -109,10 +149,15 @@ dependencies:
|
|
109
149
|
version: '0'
|
110
150
|
type: :runtime
|
111
151
|
prerelease: false
|
112
|
-
version_requirements:
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
113
158
|
- !ruby/object:Gem::Dependency
|
114
159
|
name: namespaced_redis
|
115
|
-
requirement:
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
116
161
|
none: false
|
117
162
|
requirements:
|
118
163
|
- - ! '>='
|
@@ -120,10 +165,15 @@ dependencies:
|
|
120
165
|
version: '0'
|
121
166
|
type: :runtime
|
122
167
|
prerelease: false
|
123
|
-
version_requirements:
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
124
174
|
- !ruby/object:Gem::Dependency
|
125
175
|
name: json
|
126
|
-
requirement:
|
176
|
+
requirement: !ruby/object:Gem::Requirement
|
127
177
|
none: false
|
128
178
|
requirements:
|
129
179
|
- - ! '>='
|
@@ -131,7 +181,12 @@ dependencies:
|
|
131
181
|
version: '0'
|
132
182
|
type: :runtime
|
133
183
|
prerelease: false
|
134
|
-
version_requirements:
|
184
|
+
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
186
|
+
requirements:
|
187
|
+
- - ! '>='
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: '0'
|
135
190
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
136
191
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
137
192
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -514,7 +569,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
514
569
|
version: '0'
|
515
570
|
requirements: []
|
516
571
|
rubyforge_project:
|
517
|
-
rubygems_version: 1.8.
|
572
|
+
rubygems_version: 1.8.24
|
518
573
|
signing_key:
|
519
574
|
specification_version: 3
|
520
575
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|