cobweb 1.2.0 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.textile +2 -0
- data/lib/cobweb.rb +26 -6
- data/lib/cobweb_links.rb +3 -3
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +15 -18
- data/lib/crawl_worker.rb +7 -6
- data/spec/cobweb/cobweb_crawl_spec.rb +6 -6
- data/spec/cobweb/cobweb_links_spec.rb +36 -36
- data/spec/cobweb/robots_spec.rb +13 -12
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2fa16c36a203417d0f90f884634267227698d83839580c59982d15b01bb1f495
|
4
|
+
data.tar.gz: a262f311869ccd9696fd377c1bd2de04ab32f19b1709df1d3321b80e9d4e8517
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b32dce38f7563f75dce48d4e87dfa083c918b1f3702a2f95159939fb09a30742ea27be5dd3e1551451f268df86cbbf9614603f030cfc114cc421fe17cda7950
|
7
|
+
data.tar.gz: dc8dfcdde6e157b73d4ff785d95afbb2d71ff3180169685762683a2b6aeeb37d1b47743608f76e3d2b944f37b12a3cab0cfd0a4bc1bb05c902d447b0f648709e
|
data/README.textile
CHANGED
@@ -189,6 +189,8 @@ bc. crawl = CobwebCrawlHelper.new(options)
|
|
189
189
|
|
190
190
|
h2. Contributing/Testing
|
191
191
|
|
192
|
+
<p>Firstly, you could <a href="https://www.buymeacoffee.com/Z2yRGl3CX" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: auto !important;width: auto !important;" ></a></p>
|
193
|
+
|
192
194
|
Feel free to contribute small or large bits of code; just please make sure that there are RSpec tests for the features you're submitting. We also test on Travis at http://travis-ci.org/#!/stewartmckee/cobweb if you want to see the state of the project.
|
193
195
|
|
194
196
|
Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
|
data/lib/cobweb.rb
CHANGED
@@ -77,8 +77,17 @@ class Cobweb
|
|
77
77
|
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
78
78
|
uri = Addressable::URI.parse(base_url)
|
79
79
|
@options[:internal_urls] = []
|
80
|
-
|
81
|
-
@options[:
|
80
|
+
|
81
|
+
if @options[:treat_https_as_http_to]
|
82
|
+
@options[:internal_urls] << ["http://", uri.host, "/*"].join
|
83
|
+
@options[:internal_urls] << ["http://", uri.host, ":", uri.inferred_port, "/*"].join
|
84
|
+
@options[:internal_urls] << ["https://", uri.host, "/*"].join
|
85
|
+
@options[:internal_urls] << ["https://", uri.host, ":", uri.inferred_port, "/*"].join
|
86
|
+
else
|
87
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
|
88
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
|
89
|
+
end
|
90
|
+
|
82
91
|
end
|
83
92
|
|
84
93
|
request.merge!(@options)
|
@@ -86,7 +95,10 @@ class Cobweb
|
|
86
95
|
@redis.set("original_base_url", base_url)
|
87
96
|
@redis.hset "statistics", "queued_at", DateTime.now
|
88
97
|
@redis.set("crawl-counter", 0)
|
89
|
-
@
|
98
|
+
queue_counter = @options[:seed_urls].count + 1
|
99
|
+
puts "queue_counter being init to #{queue_counter}"
|
100
|
+
@redis.set("queue-counter", queue_counter)
|
101
|
+
|
90
102
|
|
91
103
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
92
104
|
|
@@ -99,6 +111,12 @@ class Cobweb
|
|
99
111
|
Resque.enqueue(CrawlJob, request)
|
100
112
|
elsif @options[:queue_system] == :sidekiq
|
101
113
|
CrawlWorker.perform_async(request)
|
114
|
+
@options[:seed_urls].map{|url|
|
115
|
+
new_request = request.clone
|
116
|
+
new_request[:url] = url
|
117
|
+
CrawlWorker.perform_async(new_request)
|
118
|
+
}
|
119
|
+
|
102
120
|
else
|
103
121
|
raise "Unknown queue system: #{content_request[:queue_system]}"
|
104
122
|
end
|
@@ -170,7 +188,7 @@ class Cobweb
|
|
170
188
|
begin
|
171
189
|
puts "Retrieving #{uri}... " unless @options[:quiet]
|
172
190
|
request_options={}
|
173
|
-
request_options['Cookie']= options[:cookies] if options[:cookies]
|
191
|
+
request_options['Cookie']= options[:cookies].map{|k,v| [k,v].join("=") }.join("&") if options[:cookies]
|
174
192
|
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
175
193
|
|
176
194
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
@@ -185,6 +203,8 @@ class Cobweb
|
|
185
203
|
|
186
204
|
response = @http.request request
|
187
205
|
|
206
|
+
cookies = Hash[get_cookies(response).to_s.split("; ").map{|s| [CGI.unescape(s.split("=")[0]), s.split("=")[1]]}].merge(options[:cookies] || {})
|
207
|
+
|
188
208
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
189
209
|
|
190
210
|
# get location to redirect to
|
@@ -195,8 +215,7 @@ class Cobweb
|
|
195
215
|
redirect_limit = redirect_limit - 1
|
196
216
|
|
197
217
|
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
198
|
-
|
199
|
-
|
218
|
+
|
200
219
|
# get the content from redirect location
|
201
220
|
content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
202
221
|
|
@@ -213,6 +232,7 @@ class Cobweb
|
|
213
232
|
# create the content container
|
214
233
|
content[:url] = uri.to_s
|
215
234
|
content[:status_code] = response.code.to_i
|
235
|
+
content[:cookies] = cookies
|
216
236
|
content[:mime_type] = ""
|
217
237
|
content[:mime_type] = response.content_type.split(";")[0].strip unless response.content_type.nil?
|
218
238
|
if !response["Content-Type"].nil? && response["Content-Type"].include?(";")
|
data/lib/cobweb_links.rb
CHANGED
@@ -28,16 +28,16 @@ class CobwebLinks
|
|
28
28
|
|
29
29
|
# Returns true if the link is matched to an internal_url and not matched to an external_url
|
30
30
|
def internal?(link)
|
31
|
-
|
31
|
+
@internal_patterns.any?{|pattern| link.match(pattern)} && !@external_patterns.any?{|pattern| link.match(pattern)}
|
32
32
|
end
|
33
33
|
|
34
34
|
# Returns true if the link is matched to an external_url or not matched to an internal_url
|
35
35
|
def external?(link)
|
36
|
-
|
36
|
+
!@internal_patterns.any?{|pattern| link.match(pattern)} || @external_patterns.any?{|pattern| link.match(pattern)}
|
37
37
|
end
|
38
38
|
|
39
39
|
def matches_external?(link)
|
40
|
-
|
40
|
+
@external_patterns.any?{|pattern| link.match(pattern)}
|
41
41
|
end
|
42
42
|
|
43
43
|
end
|
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -70,6 +70,8 @@ module CobwebModule
|
|
70
70
|
@redis.sadd("currently_running", @options[:url])
|
71
71
|
@stats.update_status("Retrieving #{@options[:url]}...")
|
72
72
|
@content = Cobweb.new(@options).get(@options[:url], @options)
|
73
|
+
@cookies = @content[:cookies]
|
74
|
+
|
73
75
|
update_counters
|
74
76
|
|
75
77
|
if @options[:url] == @redis.get("original_base_url")
|
@@ -117,7 +119,6 @@ module CobwebModule
|
|
117
119
|
document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
|
118
120
|
#get rid of duplicate links in the same page.
|
119
121
|
document_links.uniq!
|
120
|
-
|
121
122
|
# select the link if its internal
|
122
123
|
internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
|
123
124
|
|
@@ -125,24 +126,16 @@ module CobwebModule
|
|
125
126
|
if @options[:treat_https_as_http]
|
126
127
|
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
127
128
|
end
|
128
|
-
|
129
129
|
# reject the link if we've crawled it or queued it
|
130
130
|
internal_links.reject! { |link| already_handled?(link)}
|
131
131
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
unless link.nil?
|
138
|
-
@redis.sadd "queued", link
|
139
|
-
increment_queue_counter
|
140
|
-
end
|
141
|
-
else
|
142
|
-
debug_puts "Cannot enqueue new content as crawl has been cancelled."
|
143
|
-
end
|
144
|
-
end
|
132
|
+
if status != CobwebCrawlHelper::CANCELLED && internal_links.present?
|
133
|
+
lock("internal-links") do
|
134
|
+
@redis.sadd "queued", internal_links
|
135
|
+
@redis.incrby "queue-counter", internal_links.count
|
136
|
+
|
145
137
|
end
|
138
|
+
internal_links.map{|link| yield(link) if block_given? }
|
146
139
|
end
|
147
140
|
|
148
141
|
if @options[:store_inbound_links]
|
@@ -154,6 +147,10 @@ module CobwebModule
|
|
154
147
|
end
|
155
148
|
end
|
156
149
|
|
150
|
+
def cookies
|
151
|
+
@cookies
|
152
|
+
end
|
153
|
+
|
157
154
|
def content
|
158
155
|
raise "Content is not available" if @content.nil?
|
159
156
|
CobwebModule::CrawlObject.new(@content, @options)
|
@@ -182,7 +179,7 @@ module CobwebModule
|
|
182
179
|
end
|
183
180
|
|
184
181
|
def to_be_processed?
|
185
|
-
!finished? && within_process_limits?
|
182
|
+
!finished? && within_process_limits?
|
186
183
|
end
|
187
184
|
|
188
185
|
def process(&block)
|
@@ -275,7 +272,7 @@ module CobwebModule
|
|
275
272
|
debug_puts "LOCK:#{key}:#{set_nx}"
|
276
273
|
while !set_nx
|
277
274
|
debug_puts "===== WAITING FOR LOCK [#{key}] ====="
|
278
|
-
sleep 0.
|
275
|
+
sleep 0.5
|
279
276
|
set_nx = @redis.setnx("#{key}_lock", "locked")
|
280
277
|
end
|
281
278
|
|
@@ -285,7 +282,7 @@ module CobwebModule
|
|
285
282
|
result = yield
|
286
283
|
ensure
|
287
284
|
@redis.del("#{key}_lock")
|
288
|
-
|
285
|
+
debug_puts "LOCK RELEASED [#{key}]"
|
289
286
|
end
|
290
287
|
result
|
291
288
|
end
|
data/lib/crawl_worker.rb
CHANGED
@@ -20,6 +20,8 @@ class CrawlWorker
|
|
20
20
|
# setup the crawl class to manage the crawl of this object
|
21
21
|
@crawl = CobwebModule::Crawl.new(content_request)
|
22
22
|
|
23
|
+
content_request.merge!(:cookies => @crawl.cookies)
|
24
|
+
|
23
25
|
# update the counters and then perform the get, returns false if we are outwith limits
|
24
26
|
if @crawl.retrieve
|
25
27
|
|
@@ -28,13 +30,12 @@ class CrawlWorker
|
|
28
30
|
|
29
31
|
# extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
|
30
32
|
@crawl.process_links do |link|
|
31
|
-
|
32
|
-
|
33
|
+
puts "Looking to add #{link}.."
|
34
|
+
# @crawl.lock("queue_links") do
|
33
35
|
# enqueue the links to sidekiq
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
end
|
36
|
+
@crawl.debug_puts "QUEUED LINK: #{link}"
|
37
|
+
enqueue_content(content_request, link)
|
38
|
+
# end
|
38
39
|
end
|
39
40
|
|
40
41
|
if @crawl.to_be_processed?
|
@@ -28,22 +28,22 @@ describe CobwebModule::Crawl, :local_only => true do
|
|
28
28
|
end
|
29
29
|
it "should connect to the local redis" do
|
30
30
|
if @remote
|
31
|
-
@local.redis.exists("test_redis").should
|
31
|
+
@local.redis.exists("test_redis").should be_falsey
|
32
32
|
@local.redis.set("test_redis", 1)
|
33
|
-
@local.redis.exists("test_redis").should
|
33
|
+
@local.redis.exists("test_redis").should be_truthy
|
34
34
|
|
35
|
-
@remote.redis.exists("test_redis").should
|
35
|
+
@remote.redis.exists("test_redis").should be_falsey
|
36
36
|
else
|
37
37
|
puts "WARNING: can't connect to remote redis"
|
38
38
|
end
|
39
39
|
end
|
40
40
|
it "should connect to the remote redis" do
|
41
41
|
if @remote
|
42
|
-
@remote.redis.exists("test_redis").should
|
42
|
+
@remote.redis.exists("test_redis").should be_falsey
|
43
43
|
@remote.redis.set("test_redis", 1)
|
44
|
-
@remote.redis.exists("test_redis").should
|
44
|
+
@remote.redis.exists("test_redis").should be_truthy
|
45
45
|
|
46
|
-
@local.redis.exists("test_redis").should
|
46
|
+
@local.redis.exists("test_redis").should be_falsey
|
47
47
|
else
|
48
48
|
puts "WARNING: can't connect to remote redis"
|
49
49
|
end
|
@@ -41,69 +41,69 @@ describe CobwebLinks do
|
|
41
41
|
describe "internal and external links" do
|
42
42
|
it "should only return internal links" do
|
43
43
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
|
44
|
-
cobweb_links.internal?("http://domain_one.com/pageone.html").should
|
45
|
-
cobweb_links.internal?("http://domain_one.com/pagetwo.html").should
|
44
|
+
cobweb_links.internal?("http://domain_one.com/pageone.html").should be_truthy
|
45
|
+
cobweb_links.internal?("http://domain_one.com/pagetwo.html").should be_truthy
|
46
46
|
end
|
47
47
|
it "should not return external links" do
|
48
48
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
|
49
|
-
cobweb_links.external?("http://domain_one.com/pageone.html").should
|
50
|
-
cobweb_links.external?("http://domain_two.com/pageone.html").should
|
51
|
-
cobweb_links.external?("http://external.com/pageone.html").should
|
49
|
+
cobweb_links.external?("http://domain_one.com/pageone.html").should be_falsey
|
50
|
+
cobweb_links.external?("http://domain_two.com/pageone.html").should be_truthy
|
51
|
+
cobweb_links.external?("http://external.com/pageone.html").should be_truthy
|
52
52
|
end
|
53
53
|
it "should override internal links with external links" do
|
54
54
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_one.com/blog"])
|
55
|
-
cobweb_links.internal?("http://domain_one.com/pageone.html").should
|
56
|
-
cobweb_links.external?("http://domain_one.com/pageone.html").should
|
57
|
-
cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should
|
58
|
-
cobweb_links.external?("http://domain_one.com/blog/pageone.html").should
|
59
|
-
cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should
|
60
|
-
cobweb_links.external?("http://domain_two.com/blog/pageone.html").should
|
55
|
+
cobweb_links.internal?("http://domain_one.com/pageone.html").should be_truthy
|
56
|
+
cobweb_links.external?("http://domain_one.com/pageone.html").should be_falsey
|
57
|
+
cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should be_falsey
|
58
|
+
cobweb_links.external?("http://domain_one.com/blog/pageone.html").should be_truthy
|
59
|
+
cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should be_falsey
|
60
|
+
cobweb_links.external?("http://domain_two.com/blog/pageone.html").should be_truthy
|
61
61
|
end
|
62
62
|
end
|
63
63
|
it "should only match from beginning of url" do
|
64
64
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://www.domain_two.com/"])
|
65
|
-
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should
|
66
|
-
cobweb_links.internal?("http://www.domain_two.com/pageone.html").should
|
67
|
-
cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should
|
68
|
-
cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should
|
65
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_truthy
|
66
|
+
cobweb_links.internal?("http://www.domain_two.com/pageone.html").should be_falsey
|
67
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should be_truthy
|
68
|
+
cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should be_falsey
|
69
69
|
end
|
70
70
|
|
71
71
|
describe "using wildcards" do
|
72
72
|
it "should match internal links with wildcards" do
|
73
73
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.com/"], :external_urls => ["http://blog.domain_one.com/"])
|
74
|
-
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should
|
75
|
-
cobweb_links.internal?("http://images.domain_one.com/logo.png").should
|
76
|
-
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should
|
74
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_truthy
|
75
|
+
cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_truthy
|
76
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_falsey
|
77
77
|
end
|
78
78
|
it "should match external links with wildcards" do
|
79
79
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://*.domain_one.com/"])
|
80
|
-
cobweb_links.external?("http://www.domain_one.com/pageone.html").should
|
81
|
-
cobweb_links.external?("http://images.domain_one.com/logo.png").should
|
82
|
-
cobweb_links.external?("http://blog.domain_one.com/pageone.html").should
|
80
|
+
cobweb_links.external?("http://www.domain_one.com/pageone.html").should be_truthy
|
81
|
+
cobweb_links.external?("http://images.domain_one.com/logo.png").should be_truthy
|
82
|
+
cobweb_links.external?("http://blog.domain_one.com/pageone.html").should be_truthy
|
83
83
|
end
|
84
84
|
it "should match external links with querystring parameters" do
|
85
85
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.ford.com/"], :external_urls => ["http://*.ford.com/*?*view=print"])
|
86
|
-
cobweb_links.external?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should
|
87
|
-
cobweb_links.internal?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should
|
88
|
-
cobweb_links.external?("http://corporate.ford.com/news-center/view=print/pr-doug-scott2658-marketing-manager-31039").should
|
86
|
+
cobweb_links.external?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_truthy
|
87
|
+
cobweb_links.internal?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_falsey
|
88
|
+
cobweb_links.external?("http://corporate.ford.com/news-center/view=print/pr-doug-scott2658-marketing-manager-31039").should be_truthy
|
89
89
|
end
|
90
90
|
it "should allow multiple wildcards" do
|
91
91
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.*.domain_one.com/"])
|
92
|
-
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should
|
93
|
-
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should
|
94
|
-
cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should
|
95
|
-
cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should
|
92
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_falsey
|
93
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_falsey
|
94
|
+
cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should be_truthy
|
95
|
+
cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should be_truthy
|
96
96
|
end
|
97
97
|
it "should allow multiple country tlds with wildcards" do
|
98
98
|
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.*/", "http://*.domain_one.*.*/"])
|
99
|
-
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should
|
100
|
-
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should
|
101
|
-
cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should
|
102
|
-
cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should
|
103
|
-
cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should
|
104
|
-
cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should
|
105
|
-
cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should
|
106
|
-
cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should
|
99
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_truthy
|
100
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_truthy
|
101
|
+
cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should be_truthy
|
102
|
+
cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should be_truthy
|
103
|
+
cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should be_truthy
|
104
|
+
cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should be_truthy
|
105
|
+
cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should be_truthy
|
106
|
+
cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should be_truthy
|
107
107
|
end
|
108
108
|
end
|
109
109
|
|
data/spec/cobweb/robots_spec.rb
CHANGED
@@ -19,16 +19,17 @@ describe Robots do
|
|
19
19
|
|
20
20
|
it "should allow urls marked as allow" do
|
21
21
|
robot = Robots.new(@options)
|
22
|
-
robot.allowed?("http://localhost/globalmarketfinder/asdf.html")
|
22
|
+
puts robot.allowed?("http://localhost/globalmarketfinder/asdf.html")
|
23
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_truthy
|
23
24
|
end
|
24
25
|
it "should disallow urls specified as disallow" do
|
25
26
|
robot = Robots.new(@options)
|
26
|
-
robot.allowed?("http://localhost/globalmarketfinder/").should
|
27
|
-
robot.allowed?("http://localhost/globalmarketfinder/asdf").should
|
27
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_falsey
|
28
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_falsey
|
28
29
|
end
|
29
30
|
it "should allow urls not listed" do
|
30
31
|
robot = Robots.new(@options)
|
31
|
-
robot.allowed?("http://localhost/notlistedinrobotsfile").should
|
32
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_truthy
|
32
33
|
end
|
33
34
|
|
34
35
|
end
|
@@ -43,10 +44,10 @@ describe Robots do
|
|
43
44
|
|
44
45
|
it "should disallow all urls" do
|
45
46
|
robot = Robots.new(@options)
|
46
|
-
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should
|
47
|
-
robot.allowed?("http://localhost/globalmarketfinder/").should
|
48
|
-
robot.allowed?("http://localhost/globalmarketfinder/asdf").should
|
49
|
-
robot.allowed?("http://localhost/notlistedinrobotsfile").should
|
47
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_falsey
|
48
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_falsey
|
49
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_falsey
|
50
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_falsey
|
50
51
|
end
|
51
52
|
|
52
53
|
end
|
@@ -61,10 +62,10 @@ describe Robots do
|
|
61
62
|
|
62
63
|
it "should disallow all urls" do
|
63
64
|
robot = Robots.new(@options)
|
64
|
-
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should
|
65
|
-
robot.allowed?("http://localhost/globalmarketfinder/").should
|
66
|
-
robot.allowed?("http://localhost/globalmarketfinder/asdf").should
|
67
|
-
robot.allowed?("http://localhost/notlistedinrobotsfile").should
|
65
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_truthy
|
66
|
+
robot.allowed?("http://localhost/globalmarketfinder/").should be_truthy
|
67
|
+
robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_truthy
|
68
|
+
robot.allowed?("http://localhost/notlistedinrobotsfile").should be_truthy
|
68
69
|
end
|
69
70
|
|
70
71
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -100,14 +100,14 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - ">="
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 1.
|
103
|
+
version: 1.6.0
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 1.
|
110
|
+
version: 1.6.0
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: json
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -614,7 +614,7 @@ homepage: http://github.com/stewartmckee/cobweb
|
|
614
614
|
licenses:
|
615
615
|
- MIT
|
616
616
|
metadata: {}
|
617
|
-
post_install_message:
|
617
|
+
post_install_message:
|
618
618
|
rdoc_options: []
|
619
619
|
require_paths:
|
620
620
|
- lib
|
@@ -629,9 +629,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
629
629
|
- !ruby/object:Gem::Version
|
630
630
|
version: '0'
|
631
631
|
requirements: []
|
632
|
-
rubyforge_project:
|
632
|
+
rubyforge_project:
|
633
633
|
rubygems_version: 2.7.7
|
634
|
-
signing_key:
|
634
|
+
signing_key:
|
635
635
|
specification_version: 4
|
636
636
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
637
637
|
crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
|