cobweb 1.0.4 → 1.0.5
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +7 -7
- data/lib/cobweb_version.rb +1 -1
- data/spec/cobweb/cobweb_crawl_spec.rb +2 -2
- data/spec/cobweb/cobweb_job_spec.rb +2 -0
- data/spec/cobweb/cobweb_spec.rb +11 -24
- data/spec/spec_helper.rb +6 -5
- metadata +80 -25
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -139,21 +139,20 @@ class Cobweb
|
|
139
139
|
@http.read_timeout = @options[:timeout].to_i
|
140
140
|
@http.open_timeout = @options[:timeout].to_i
|
141
141
|
begin
|
142
|
-
|
142
|
+
puts "Retrieving #{url }... " unless @options[:quiet]
|
143
143
|
request_options={}
|
144
144
|
request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
|
145
145
|
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
146
146
|
|
147
147
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
148
|
-
|
149
148
|
response = @http.request request
|
150
|
-
|
149
|
+
|
151
150
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
152
151
|
puts "redirected... " unless @options[:quiet]
|
153
|
-
|
152
|
+
|
154
153
|
# get location to redirect to
|
155
154
|
uri = UriHelper.join_no_fragment(uri, response['location'])
|
156
|
-
|
155
|
+
|
157
156
|
# decrement redirect limit
|
158
157
|
redirect_limit = redirect_limit - 1
|
159
158
|
|
@@ -162,9 +161,10 @@ class Cobweb
|
|
162
161
|
|
163
162
|
# get the content from redirect location
|
164
163
|
content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
165
|
-
|
166
|
-
content[:redirect_through] = [] if content[:redirect_through].nil?
|
164
|
+
|
165
|
+
content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
|
167
166
|
content[:redirect_through].insert(0, url)
|
167
|
+
content[:url] = content[:redirect_through].last
|
168
168
|
|
169
169
|
content[:response_time] = Time.now.to_f - request_time
|
170
170
|
else
|
data/lib/cobweb_version.rb
CHANGED
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
3
3
|
describe CobwebModule::Crawl, :local_only => true do
|
4
4
|
|
5
5
|
before(:each) do
|
6
|
-
@local_redis = {:host => "
|
7
|
-
@remote_redis = {:host => "
|
6
|
+
@local_redis = {:host => "localhost", :port => 6379}
|
7
|
+
@remote_redis = {:host => "remote-redis", :port => 6379}
|
8
8
|
|
9
9
|
@request = {:crawl_id => "test_crawl_id"}
|
10
10
|
end
|
@@ -9,6 +9,8 @@ describe Cobweb, :local_only => true, :disabled => true do
|
|
9
9
|
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
10
10
|
puts "Starting Workers... Please Wait..."
|
11
11
|
`mkdir log`
|
12
|
+
`mkdir tmp`
|
13
|
+
`mkdir tmp/pids`
|
12
14
|
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
|
13
15
|
puts "Workers Started."
|
14
16
|
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -90,34 +90,21 @@ describe Cobweb do
|
|
90
90
|
before(:each) do
|
91
91
|
@base_url = "http://redirect-me.com/redirect.html"
|
92
92
|
@cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
93
|
-
|
94
|
-
@mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
|
95
|
-
@mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
|
96
|
-
@mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
|
97
|
-
|
98
93
|
end
|
99
94
|
|
100
|
-
it "should
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
#content[:url].should == "http://redirect-me.com/redirect.html"
|
109
|
-
#content[:redirect_through].length.should == 2
|
110
|
-
#content[:mime_type].should == "text/html"
|
111
|
-
#content[:body].should == "asdf"
|
95
|
+
it "should return final page from redirects" do
|
96
|
+
content = @cobweb.get(@base_url)
|
97
|
+
content.should be_an_instance_of Hash
|
98
|
+
content[:url].should == "http://redirected-to.com/redirected.html"
|
99
|
+
content[:mime_type].should == "text/html"
|
100
|
+
content[:body].should == "asdf"
|
101
|
+
end
|
102
|
+
it "should return the path followed" do
|
112
103
|
|
113
|
-
|
114
|
-
|
115
|
-
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
116
|
-
#
|
117
|
-
#content = @cobweb.get(@base_url)
|
118
|
-
#content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
104
|
+
content = @cobweb.get(@base_url)
|
105
|
+
content[:redirect_through].should == ["http://redirect-me.com/redirect.html", "http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
119
106
|
|
120
|
-
|
107
|
+
end
|
121
108
|
it "should not follow with redirect disabled" do
|
122
109
|
@cobweb = Cobweb.new(:follow_redirects => false, :cache => 3)
|
123
110
|
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
data/spec/spec_helper.rb
CHANGED
@@ -9,7 +9,7 @@ APP_ROOT = File.expand_path(File.dirname(__FILE__) + '/../')
|
|
9
9
|
|
10
10
|
RSpec.configure do |config|
|
11
11
|
|
12
|
-
|
12
|
+
if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
|
13
13
|
config.filter_run_excluding :local_only => true
|
14
14
|
end
|
15
15
|
|
@@ -63,9 +63,10 @@ RSpec.configure do |config|
|
|
63
63
|
|
64
64
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
65
65
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
66
|
-
Net::HTTP::Get.stub!(:new).with("/redirect.html",
|
67
|
-
Net::HTTP::Get.stub!(:new).with("/robots.txt",
|
68
|
-
Net::HTTP::Get.stub!(:new).with("/redirect2.html",
|
66
|
+
Net::HTTP::Get.stub!(:new).with("/redirect.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request)
|
67
|
+
Net::HTTP::Get.stub!(:new).with("/robots.txt", an_instance_of(Hash)).and_return(@mock_http_robot_request)
|
68
|
+
Net::HTTP::Get.stub!(:new).with("/redirect2.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request2)
|
69
|
+
Net::HTTP::Get.stub!(:new).with("/redirected.html", an_instance_of(Hash)).and_return(@mock_http_request)
|
69
70
|
|
70
71
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
71
72
|
|
@@ -77,7 +78,7 @@ RSpec.configure do |config|
|
|
77
78
|
@mock_http_client.stub!(:open_timeout=).and_return(nil)
|
78
79
|
@mock_http_client.stub!(:start).and_return(@mock_http_response)
|
79
80
|
@mock_http_client.stub!(:address).and_return("www.baseurl.com")
|
80
|
-
@mock_http_client.stub!(:port).and_return("80
|
81
|
+
@mock_http_client.stub!(:port).and_return("80")
|
81
82
|
|
82
83
|
@mock_http_robot_response.stub!(:code).and_return(200)
|
83
84
|
@mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: redis
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :runtime
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: nokogiri
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ! '>='
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: '0'
|
44
54
|
type: :runtime
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: addressable
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ! '>='
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: '0'
|
55
70
|
type: :runtime
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: rspec
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ! '>='
|
@@ -65,10 +85,15 @@ dependencies:
|
|
65
85
|
version: '0'
|
66
86
|
type: :runtime
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: awesome_print
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
72
97
|
none: false
|
73
98
|
requirements:
|
74
99
|
- - ! '>='
|
@@ -76,10 +101,15 @@ dependencies:
|
|
76
101
|
version: '0'
|
77
102
|
type: :runtime
|
78
103
|
prerelease: false
|
79
|
-
version_requirements:
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
80
110
|
- !ruby/object:Gem::Dependency
|
81
111
|
name: sinatra
|
82
|
-
requirement:
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
83
113
|
none: false
|
84
114
|
requirements:
|
85
115
|
- - ! '>='
|
@@ -87,10 +117,15 @@ dependencies:
|
|
87
117
|
version: '0'
|
88
118
|
type: :runtime
|
89
119
|
prerelease: false
|
90
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
91
126
|
- !ruby/object:Gem::Dependency
|
92
127
|
name: thin
|
93
|
-
requirement:
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
94
129
|
none: false
|
95
130
|
requirements:
|
96
131
|
- - ! '>='
|
@@ -98,10 +133,15 @@ dependencies:
|
|
98
133
|
version: '0'
|
99
134
|
type: :runtime
|
100
135
|
prerelease: false
|
101
|
-
version_requirements:
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
102
142
|
- !ruby/object:Gem::Dependency
|
103
143
|
name: haml
|
104
|
-
requirement:
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
105
145
|
none: false
|
106
146
|
requirements:
|
107
147
|
- - ! '>='
|
@@ -109,10 +149,15 @@ dependencies:
|
|
109
149
|
version: '0'
|
110
150
|
type: :runtime
|
111
151
|
prerelease: false
|
112
|
-
version_requirements:
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
113
158
|
- !ruby/object:Gem::Dependency
|
114
159
|
name: namespaced_redis
|
115
|
-
requirement:
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
116
161
|
none: false
|
117
162
|
requirements:
|
118
163
|
- - ! '>='
|
@@ -120,10 +165,15 @@ dependencies:
|
|
120
165
|
version: '0'
|
121
166
|
type: :runtime
|
122
167
|
prerelease: false
|
123
|
-
version_requirements:
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
124
174
|
- !ruby/object:Gem::Dependency
|
125
175
|
name: json
|
126
|
-
requirement:
|
176
|
+
requirement: !ruby/object:Gem::Requirement
|
127
177
|
none: false
|
128
178
|
requirements:
|
129
179
|
- - ! '>='
|
@@ -131,7 +181,12 @@ dependencies:
|
|
131
181
|
version: '0'
|
132
182
|
type: :runtime
|
133
183
|
prerelease: false
|
134
|
-
version_requirements:
|
184
|
+
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
186
|
+
requirements:
|
187
|
+
- - ! '>='
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: '0'
|
135
190
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
136
191
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
137
192
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -514,7 +569,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
514
569
|
version: '0'
|
515
570
|
requirements: []
|
516
571
|
rubyforge_project:
|
517
|
-
rubygems_version: 1.8.
|
572
|
+
rubygems_version: 1.8.24
|
518
573
|
signing_key:
|
519
574
|
specification_version: 3
|
520
575
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|