cobweb 1.2.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3b21efd1b03e9f4515045bdd69439fc8a7a4bf3f5aa896edebdbccd12d421d51
4
- data.tar.gz: 4744738dfb7eaca5c5b6c1cabdebea8899e0f18cadb5532f1bb43870e98ff4e2
3
+ metadata.gz: 2fa16c36a203417d0f90f884634267227698d83839580c59982d15b01bb1f495
4
+ data.tar.gz: a262f311869ccd9696fd377c1bd2de04ab32f19b1709df1d3321b80e9d4e8517
5
5
  SHA512:
6
- metadata.gz: 55b7f9878fcf6c3d97bd935fa59e61c54d745dd37cc75405e9af0e787d56e6543d79a2dedfca43912ed791fd59b51193b12c58722be9ce8c5d65428b6ea0af48
7
- data.tar.gz: 8f767e79d76787cd84edbecd967e3136615a32723210a1abe149f8c126e4c8e1c6b3608f631f4ea7d3ad4fe66aa39702b18f73e28c0091ab02b2defdb28868d2
6
+ metadata.gz: 1b32dce38f7563f75dce48d4e87dfa083c918b1f3702a2f95159939fb09a30742ea27be5dd3e1551451f268df86cbbf9614603f030cfc114cc421fe17cda7950
7
+ data.tar.gz: dc8dfcdde6e157b73d4ff785d95afbb2d71ff3180169685762683a2b6aeeb37d1b47743608f76e3d2b944f37b12a3cab0cfd0a4bc1bb05c902d447b0f648709e
@@ -189,6 +189,8 @@ bc. crawl = CobwebCrawlHelper.new(options)
189
189
 
190
190
  h2. Contributing/Testing
191
191
 
192
+ <p>Firstly, you could <a href="https://www.buymeacoffee.com/Z2yRGl3CX" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: auto !important;width: auto !important;" ></a></p>
193
+
192
194
  Feel free to contribute small or large bits of code; just please make sure that there are rspec tests for the features you're submitting. We also test on travis at http://travis-ci.org/#!/stewartmckee/cobweb if you want to see the state of the project.
193
195
 
194
196
  Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
@@ -77,8 +77,17 @@ class Cobweb
77
77
  if @options[:internal_urls].nil? || @options[:internal_urls].empty?
78
78
  uri = Addressable::URI.parse(base_url)
79
79
  @options[:internal_urls] = []
80
- @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
81
- @options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
80
+
81
+ if @options[:treat_https_as_http_to]
82
+ @options[:internal_urls] << ["http://", uri.host, "/*"].join
83
+ @options[:internal_urls] << ["http://", uri.host, ":", uri.inferred_port, "/*"].join
84
+ @options[:internal_urls] << ["https://", uri.host, "/*"].join
85
+ @options[:internal_urls] << ["https://", uri.host, ":", uri.inferred_port, "/*"].join
86
+ else
87
+ @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
88
+ @options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
89
+ end
90
+
82
91
  end
83
92
 
84
93
  request.merge!(@options)
@@ -86,7 +95,10 @@ class Cobweb
86
95
  @redis.set("original_base_url", base_url)
87
96
  @redis.hset "statistics", "queued_at", DateTime.now
88
97
  @redis.set("crawl-counter", 0)
89
- @redis.set("queue-counter", 1)
98
+ queue_counter = @options[:seed_urls].count + 1
99
+ puts "queue_counter being init to #{queue_counter}"
100
+ @redis.set("queue-counter", queue_counter)
101
+
90
102
 
91
103
  @options[:seed_urls].map{|link| @redis.sadd "queued", link }
92
104
 
@@ -99,6 +111,12 @@ class Cobweb
99
111
  Resque.enqueue(CrawlJob, request)
100
112
  elsif @options[:queue_system] == :sidekiq
101
113
  CrawlWorker.perform_async(request)
114
+ @options[:seed_urls].map{|url|
115
+ new_request = request.clone
116
+ new_request[:url] = url
117
+ CrawlWorker.perform_async(new_request)
118
+ }
119
+
102
120
  else
103
121
  raise "Unknown queue system: #{content_request[:queue_system]}"
104
122
  end
@@ -170,7 +188,7 @@ class Cobweb
170
188
  begin
171
189
  puts "Retrieving #{uri}... " unless @options[:quiet]
172
190
  request_options={}
173
- request_options['Cookie']= options[:cookies] if options[:cookies]
191
+ request_options['Cookie']= options[:cookies].map{|k,v| [k,v].join("=") }.join("&") if options[:cookies]
174
192
  request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
175
193
 
176
194
  request = Net::HTTP::Get.new uri.request_uri, request_options
@@ -185,6 +203,8 @@ class Cobweb
185
203
 
186
204
  response = @http.request request
187
205
 
206
+ cookies = Hash[get_cookies(response).to_s.split("; ").map{|s| [CGI.unescape(s.split("=")[0]), s.split("=")[1]]}].merge(options[:cookies] || {})
207
+
188
208
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
189
209
 
190
210
  # get location to redirect to
@@ -195,8 +215,7 @@ class Cobweb
195
215
  redirect_limit = redirect_limit - 1
196
216
 
197
217
  raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
198
- cookies = get_cookies(response)
199
-
218
+
200
219
  # get the content from redirect location
201
220
  content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
202
221
 
@@ -213,6 +232,7 @@ class Cobweb
213
232
  # create the content container
214
233
  content[:url] = uri.to_s
215
234
  content[:status_code] = response.code.to_i
235
+ content[:cookies] = cookies
216
236
  content[:mime_type] = ""
217
237
  content[:mime_type] = response.content_type.split(";")[0].strip unless response.content_type.nil?
218
238
  if !response["Content-Type"].nil? && response["Content-Type"].include?(";")
@@ -28,16 +28,16 @@ class CobwebLinks
28
28
 
29
29
  # Returns true if the link is matched to an internal_url and not matched to an external_url
30
30
  def internal?(link)
31
- !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
31
+ @internal_patterns.any?{|pattern| link.match(pattern)} && !@external_patterns.any?{|pattern| link.match(pattern)}
32
32
  end
33
33
 
34
34
  # Returns true if the link is matched to an external_url or not matched to an internal_url
35
35
  def external?(link)
36
- @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
36
+ !@internal_patterns.any?{|pattern| link.match(pattern)} || @external_patterns.any?{|pattern| link.match(pattern)}
37
37
  end
38
38
 
39
39
  def matches_external?(link)
40
- !@external_patterns.select{|pattern| link.match(pattern)}.empty?
40
+ @external_patterns.any?{|pattern| link.match(pattern)}
41
41
  end
42
42
 
43
43
  end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.2.0"
6
+ "1.2.1"
7
7
  end
8
8
 
9
9
  end
@@ -70,6 +70,8 @@ module CobwebModule
70
70
  @redis.sadd("currently_running", @options[:url])
71
71
  @stats.update_status("Retrieving #{@options[:url]}...")
72
72
  @content = Cobweb.new(@options).get(@options[:url], @options)
73
+ @cookies = @content[:cookies]
74
+
73
75
  update_counters
74
76
 
75
77
  if @options[:url] == @redis.get("original_base_url")
@@ -117,7 +119,6 @@ module CobwebModule
117
119
  document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
118
120
  #get rid of duplicate links in the same page.
119
121
  document_links.uniq!
120
-
121
122
  # select the link if its internal
122
123
  internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
123
124
 
@@ -125,24 +126,16 @@ module CobwebModule
125
126
  if @options[:treat_https_as_http]
126
127
  internal_links.map!{|link| link.gsub(/^https/, "http")}
127
128
  end
128
-
129
129
  # reject the link if we've crawled it or queued it
130
130
  internal_links.reject! { |link| already_handled?(link)}
131
131
 
132
- lock("internal-links") do
133
- internal_links.each do |link|
134
- if within_queue_limits? && !already_handled?(link)
135
- if status != CobwebCrawlHelper::CANCELLED
136
- yield link if block_given?
137
- unless link.nil?
138
- @redis.sadd "queued", link
139
- increment_queue_counter
140
- end
141
- else
142
- debug_puts "Cannot enqueue new content as crawl has been cancelled."
143
- end
144
- end
132
+ if status != CobwebCrawlHelper::CANCELLED && internal_links.present?
133
+ lock("internal-links") do
134
+ @redis.sadd "queued", internal_links
135
+ @redis.incrby "queue-counter", internal_links.count
136
+
145
137
  end
138
+ internal_links.map{|link| yield(link) if block_given? }
146
139
  end
147
140
 
148
141
  if @options[:store_inbound_links]
@@ -154,6 +147,10 @@ module CobwebModule
154
147
  end
155
148
  end
156
149
 
150
+ def cookies
151
+ @cookies
152
+ end
153
+
157
154
  def content
158
155
  raise "Content is not available" if @content.nil?
159
156
  CobwebModule::CrawlObject.new(@content, @options)
@@ -182,7 +179,7 @@ module CobwebModule
182
179
  end
183
180
 
184
181
  def to_be_processed?
185
- !finished? && within_process_limits? && !@redis.sismember("queued", @options[:url])
182
+ !finished? && within_process_limits?
186
183
  end
187
184
 
188
185
  def process(&block)
@@ -275,7 +272,7 @@ module CobwebModule
275
272
  debug_puts "LOCK:#{key}:#{set_nx}"
276
273
  while !set_nx
277
274
  debug_puts "===== WAITING FOR LOCK [#{key}] ====="
278
- sleep 0.01
275
+ sleep 0.5
279
276
  set_nx = @redis.setnx("#{key}_lock", "locked")
280
277
  end
281
278
 
@@ -285,7 +282,7 @@ module CobwebModule
285
282
  result = yield
286
283
  ensure
287
284
  @redis.del("#{key}_lock")
288
- #debug_puts "LOCK RELEASED [#{key}]"
285
+ debug_puts "LOCK RELEASED [#{key}]"
289
286
  end
290
287
  result
291
288
  end
@@ -20,6 +20,8 @@ class CrawlWorker
20
20
  # setup the crawl class to manage the crawl of this object
21
21
  @crawl = CobwebModule::Crawl.new(content_request)
22
22
 
23
+ content_request.merge!(:cookies => @crawl.cookies)
24
+
23
25
  # update the counters and then perform the get, returns false if we are outwith limits
24
26
  if @crawl.retrieve
25
27
 
@@ -28,13 +30,12 @@ class CrawlWorker
28
30
 
29
31
  # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
30
32
  @crawl.process_links do |link|
31
- @crawl.lock("queue_links") do
32
- if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
33
+ puts "Looking to add #{link}.."
34
+ # @crawl.lock("queue_links") do
33
35
  # enqueue the links to sidekiq
34
- @crawl.debug_puts "QUEUED LINK: #{link}"
35
- enqueue_content(content_request, link)
36
- end
37
- end
36
+ @crawl.debug_puts "QUEUED LINK: #{link}"
37
+ enqueue_content(content_request, link)
38
+ # end
38
39
  end
39
40
 
40
41
  if @crawl.to_be_processed?
@@ -28,22 +28,22 @@ describe CobwebModule::Crawl, :local_only => true do
28
28
  end
29
29
  it "should connect to the local redis" do
30
30
  if @remote
31
- @local.redis.exists("test_redis").should be_false
31
+ @local.redis.exists("test_redis").should be_falsey
32
32
  @local.redis.set("test_redis", 1)
33
- @local.redis.exists("test_redis").should be_true
33
+ @local.redis.exists("test_redis").should be_truthy
34
34
 
35
- @remote.redis.exists("test_redis").should be_false
35
+ @remote.redis.exists("test_redis").should be_falsey
36
36
  else
37
37
  puts "WARNING: can't connect to remote redis"
38
38
  end
39
39
  end
40
40
  it "should connect to the remote redis" do
41
41
  if @remote
42
- @remote.redis.exists("test_redis").should be_false
42
+ @remote.redis.exists("test_redis").should be_falsey
43
43
  @remote.redis.set("test_redis", 1)
44
- @remote.redis.exists("test_redis").should be_true
44
+ @remote.redis.exists("test_redis").should be_truthy
45
45
 
46
- @local.redis.exists("test_redis").should be_false
46
+ @local.redis.exists("test_redis").should be_falsey
47
47
  else
48
48
  puts "WARNING: can't connect to remote redis"
49
49
  end
@@ -41,69 +41,69 @@ describe CobwebLinks do
41
41
  describe "internal and external links" do
42
42
  it "should only return internal links" do
43
43
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
44
- cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
45
- cobweb_links.internal?("http://domain_one.com/pagetwo.html").should be_true
44
+ cobweb_links.internal?("http://domain_one.com/pageone.html").should be_truthy
45
+ cobweb_links.internal?("http://domain_one.com/pagetwo.html").should be_truthy
46
46
  end
47
47
  it "should not return external links" do
48
48
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
49
- cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
50
- cobweb_links.external?("http://domain_two.com/pageone.html").should be_true
51
- cobweb_links.external?("http://external.com/pageone.html").should be_true
49
+ cobweb_links.external?("http://domain_one.com/pageone.html").should be_falsey
50
+ cobweb_links.external?("http://domain_two.com/pageone.html").should be_truthy
51
+ cobweb_links.external?("http://external.com/pageone.html").should be_truthy
52
52
  end
53
53
  it "should override internal links with external links" do
54
54
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_one.com/blog"])
55
- cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
56
- cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
57
- cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should be_false
58
- cobweb_links.external?("http://domain_one.com/blog/pageone.html").should be_true
59
- cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should be_false
60
- cobweb_links.external?("http://domain_two.com/blog/pageone.html").should be_true
55
+ cobweb_links.internal?("http://domain_one.com/pageone.html").should be_truthy
56
+ cobweb_links.external?("http://domain_one.com/pageone.html").should be_falsey
57
+ cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should be_falsey
58
+ cobweb_links.external?("http://domain_one.com/blog/pageone.html").should be_truthy
59
+ cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should be_falsey
60
+ cobweb_links.external?("http://domain_two.com/blog/pageone.html").should be_truthy
61
61
  end
62
62
  end
63
63
  it "should only match from beginning of url" do
64
64
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://www.domain_two.com/"])
65
- cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
66
- cobweb_links.internal?("http://www.domain_two.com/pageone.html").should be_false
67
- cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should be_true
68
- cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should be_false
65
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_truthy
66
+ cobweb_links.internal?("http://www.domain_two.com/pageone.html").should be_falsey
67
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should be_truthy
68
+ cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should be_falsey
69
69
  end
70
70
 
71
71
  describe "using wildcards" do
72
72
  it "should match internal links with wildcards" do
73
73
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.com/"], :external_urls => ["http://blog.domain_one.com/"])
74
- cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
75
- cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_true
76
- cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
74
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_truthy
75
+ cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_truthy
76
+ cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_falsey
77
77
  end
78
78
  it "should match external links with wildcards" do
79
79
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://*.domain_one.com/"])
80
- cobweb_links.external?("http://www.domain_one.com/pageone.html").should be_true
81
- cobweb_links.external?("http://images.domain_one.com/logo.png").should be_true
82
- cobweb_links.external?("http://blog.domain_one.com/pageone.html").should be_true
80
+ cobweb_links.external?("http://www.domain_one.com/pageone.html").should be_truthy
81
+ cobweb_links.external?("http://images.domain_one.com/logo.png").should be_truthy
82
+ cobweb_links.external?("http://blog.domain_one.com/pageone.html").should be_truthy
83
83
  end
84
84
  it "should match external links with querystring parameters" do
85
85
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.ford.com/"], :external_urls => ["http://*.ford.com/*?*view=print"])
86
- cobweb_links.external?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_true
87
- cobweb_links.internal?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_false
88
- cobweb_links.external?("http://corporate.ford.com/news-center/view=print/pr-doug-scott2658-marketing-manager-31039").should be_true
86
+ cobweb_links.external?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_truthy
87
+ cobweb_links.internal?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_falsey
88
+ cobweb_links.external?("http://corporate.ford.com/news-center/view=print/pr-doug-scott2658-marketing-manager-31039").should be_truthy
89
89
  end
90
90
  it "should allow multiple wildcards" do
91
91
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.*.domain_one.com/"])
92
- cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
93
- cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
94
- cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should be_true
95
- cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should be_true
92
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_falsey
93
+ cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_falsey
94
+ cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should be_truthy
95
+ cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should be_truthy
96
96
  end
97
97
  it "should allow multiple country tlds with wildcards" do
98
98
  cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.*/", "http://*.domain_one.*.*/"])
99
- cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
100
- cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_true
101
- cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should be_true
102
- cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should be_true
103
- cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should be_true
104
- cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should be_true
105
- cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should be_true
106
- cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should be_true
99
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_truthy
100
+ cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_truthy
101
+ cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should be_truthy
102
+ cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should be_truthy
103
+ cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should be_truthy
104
+ cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should be_truthy
105
+ cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should be_truthy
106
+ cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should be_truthy
107
107
  end
108
108
  end
109
109
 
@@ -19,16 +19,17 @@ describe Robots do
19
19
 
20
20
  it "should allow urls marked as allow" do
21
21
  robot = Robots.new(@options)
22
- robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
22
+ puts robot.allowed?("http://localhost/globalmarketfinder/asdf.html")
23
+ robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_truthy
23
24
  end
24
25
  it "should disallow urls specified as disallow" do
25
26
  robot = Robots.new(@options)
26
- robot.allowed?("http://localhost/globalmarketfinder/").should be_false
27
- robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
27
+ robot.allowed?("http://localhost/globalmarketfinder/").should be_falsey
28
+ robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_falsey
28
29
  end
29
30
  it "should allow urls not listed" do
30
31
  robot = Robots.new(@options)
31
- robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
32
+ robot.allowed?("http://localhost/notlistedinrobotsfile").should be_truthy
32
33
  end
33
34
 
34
35
  end
@@ -43,10 +44,10 @@ describe Robots do
43
44
 
44
45
  it "should disallow all urls" do
45
46
  robot = Robots.new(@options)
46
- robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_false
47
- robot.allowed?("http://localhost/globalmarketfinder/").should be_false
48
- robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_false
49
- robot.allowed?("http://localhost/notlistedinrobotsfile").should be_false
47
+ robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_falsey
48
+ robot.allowed?("http://localhost/globalmarketfinder/").should be_falsey
49
+ robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_falsey
50
+ robot.allowed?("http://localhost/notlistedinrobotsfile").should be_falsey
50
51
  end
51
52
 
52
53
  end
@@ -61,10 +62,10 @@ describe Robots do
61
62
 
62
63
  it "should disallow all urls" do
63
64
  robot = Robots.new(@options)
64
- robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_true
65
- robot.allowed?("http://localhost/globalmarketfinder/").should be_true
66
- robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_true
67
- robot.allowed?("http://localhost/notlistedinrobotsfile").should be_true
65
+ robot.allowed?("http://localhost/globalmarketfinder/asdf.html").should be_truthy
66
+ robot.allowed?("http://localhost/globalmarketfinder/").should be_truthy
67
+ robot.allowed?("http://localhost/globalmarketfinder/asdf").should be_truthy
68
+ robot.allowed?("http://localhost/notlistedinrobotsfile").should be_truthy
68
69
  end
69
70
 
70
71
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-25 00:00:00.000000000 Z
11
+ date: 2021-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -100,14 +100,14 @@ dependencies:
100
100
  requirements:
101
101
  - - ">="
102
102
  - !ruby/object:Gem::Version
103
- version: 1.5.2
103
+ version: 1.6.0
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
- version: 1.5.2
110
+ version: 1.6.0
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: json
113
113
  requirement: !ruby/object:Gem::Requirement
@@ -614,7 +614,7 @@ homepage: http://github.com/stewartmckee/cobweb
614
614
  licenses:
615
615
  - MIT
616
616
  metadata: {}
617
- post_install_message:
617
+ post_install_message:
618
618
  rdoc_options: []
619
619
  require_paths:
620
620
  - lib
@@ -629,9 +629,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
629
629
  - !ruby/object:Gem::Version
630
630
  version: '0'
631
631
  requirements: []
632
- rubyforge_project:
632
+ rubyforge_project:
633
633
  rubygems_version: 2.7.7
634
- signing_key:
634
+ signing_key:
635
635
  specification_version: 4
636
636
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
637
637
  crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone