webrobots 0.0.13 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/.travis.yml +15 -0
- data/Gemfile +2 -15
- data/LICENSE.txt +1 -1
- data/README.rdoc +1 -1
- data/Rakefile +10 -43
- data/lib/webrobots.rb +38 -2
- data/lib/webrobots/robotstxt.rb +36 -25
- data/lib/webrobots/robotstxt.ry +19 -8
- data/lib/webrobots/version.rb +3 -0
- data/test/test_webrobots.rb +63 -0
- data/webrobots.gemspec +23 -58
- metadata +111 -121
- data/Gemfile.lock +0 -24
- data/VERSION +0 -1
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -1,17 +1,4 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
|
-
# Add dependencies required to use your gem here.
|
3
|
-
# Example:
|
4
|
-
# gem "activesupport", ">= 2.3.5"
|
5
2
|
|
6
|
-
#
|
7
|
-
|
8
|
-
group :development do
|
9
|
-
gem "racc", ">= 0"
|
10
|
-
gem "shoulda", ">= 0"
|
11
|
-
gem "bundler", ">= 1.0.0"
|
12
|
-
gem "jeweler", "~> 1.6.4"
|
13
|
-
gem "rcov", "~> 0.9.11"
|
14
|
-
|
15
|
-
# To test the webrobots/nokogiri module.
|
16
|
-
gem "nokogiri", ">= 1.4.4"
|
17
|
-
end
|
3
|
+
# Specify your gem's dependencies in webrobots.gemspec
|
4
|
+
gemspec
|
data/LICENSE.txt
CHANGED
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -1,45 +1,22 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
-
exit e.status_code
|
11
|
-
end
|
12
|
-
require 'rake'
|
3
|
+
require 'bundler/gem_tasks'
|
13
4
|
|
14
|
-
|
15
|
-
Jeweler::Tasks.new do |gem|
|
16
|
-
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
-
gem.name = "webrobots"
|
18
|
-
gem.homepage = "https://github.com/knu/webrobots"
|
19
|
-
gem.license = "2-clause BSDL"
|
20
|
-
gem.summary = %Q{A Ruby library to help write robots.txt compliant web robots}
|
21
|
-
gem.description = <<-'EOS'
|
22
|
-
This library helps write robots.txt compliant web robots in Ruby.
|
23
|
-
EOS
|
24
|
-
gem.email = "knu@idaemons.org"
|
25
|
-
gem.authors = ["Akinori MUSHA"]
|
26
|
-
# dependencies defined in Gemfile
|
27
|
-
end
|
28
|
-
Jeweler::RubygemsDotOrgTasks.new
|
5
|
+
gemspec = Bundler::GemHelper.gemspec
|
29
6
|
|
30
7
|
require 'rake/testtask'
|
31
8
|
Rake::TestTask.new(:test) do |test|
|
32
|
-
test.libs << '
|
33
|
-
test.
|
9
|
+
test.libs << 'test'
|
10
|
+
test.test_files = gemspec.test_files
|
34
11
|
test.verbose = true
|
35
12
|
end
|
36
13
|
|
37
|
-
require '
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
14
|
+
require 'rdoc/task'
|
15
|
+
Rake::RDocTask.new do |rdoc|
|
16
|
+
rdoc.rdoc_dir = 'rdoc'
|
17
|
+
rdoc.title = "#{gemspec.name} #{gemspec.version}"
|
18
|
+
rdoc.rdoc_files.include(gemspec.extra_rdoc_files)
|
19
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
43
20
|
end
|
44
21
|
|
45
22
|
task :default => :test
|
@@ -49,13 +26,3 @@ task :test => 'lib/webrobots/robotstxt.rb'
|
|
49
26
|
file 'lib/webrobots/robotstxt.rb' => 'lib/webrobots/robotstxt.ry' do
|
50
27
|
sh 'racc', '-o', 'lib/webrobots/robotstxt.rb', 'lib/webrobots/robotstxt.ry'
|
51
28
|
end
|
52
|
-
|
53
|
-
require 'rake/rdoctask'
|
54
|
-
Rake::RDocTask.new do |rdoc|
|
55
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
56
|
-
|
57
|
-
rdoc.rdoc_dir = 'rdoc'
|
58
|
-
rdoc.title = "webrobots #{version}"
|
59
|
-
rdoc.rdoc_files.include('README*')
|
60
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
61
|
-
end
|
data/lib/webrobots.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'webrobots/version'
|
1
2
|
require 'webrobots/robotstxt'
|
2
3
|
require 'uri'
|
3
4
|
require 'net/https'
|
@@ -17,13 +18,34 @@ class WebRobots
|
|
17
18
|
# the response body if successful, return an empty string if the
|
18
19
|
# resource is not found, and return nil or raise any error on
|
19
20
|
# failure. Redirects should be handled within this proc.
|
21
|
+
#
|
22
|
+
# * :crawl_delay => determines how to react to Crawl-delay
|
23
|
+
# directives. If +:sleep+ is given, WebRobots sleeps as demanded
|
24
|
+
# when allowed?(url)/disallowed?(url) is called. This is the
|
25
|
+
# default behavior. If +:ignore+ is given, WebRobots does
|
26
|
+
# nothing. If a custom method, proc, or anything that responds to
|
27
|
+
# .call(delay, last_checked_at) is given, it is called.
|
20
28
|
def initialize(user_agent, options = nil)
|
21
29
|
@user_agent = user_agent
|
22
|
-
@parser = RobotsTxt::Parser.new(user_agent)
|
23
|
-
@parser_mutex = Mutex.new
|
24
30
|
|
25
31
|
options ||= {}
|
26
32
|
@http_get = options[:http_get] || method(:http_get)
|
33
|
+
crawl_delay_handler =
|
34
|
+
case value = options[:crawl_delay] || :sleep
|
35
|
+
when :ignore
|
36
|
+
nil
|
37
|
+
when :sleep
|
38
|
+
method(:crawl_delay_handler)
|
39
|
+
else
|
40
|
+
if value.respond_to?(:call)
|
41
|
+
value
|
42
|
+
else
|
43
|
+
raise ArgumentError, "invalid Crawl-delay handler: #{value.inspect}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
@parser = RobotsTxt::Parser.new(user_agent, crawl_delay_handler)
|
48
|
+
@parser_mutex = Mutex.new
|
27
49
|
|
28
50
|
@robotstxt = create_cache()
|
29
51
|
end
|
@@ -57,6 +79,13 @@ class WebRobots
|
|
57
79
|
!allowed?(url)
|
58
80
|
end
|
59
81
|
|
82
|
+
# Returns the number of seconds that the configured agent should wait
|
83
|
+
# between successive requests to the site identified by +url+ according
|
84
|
+
# to the site's robots.txt +Crawl-delay+ directive.
|
85
|
+
def crawl_delay(url)
|
86
|
+
robots_txt_for(url).crawl_delay()
|
87
|
+
end
|
88
|
+
|
60
89
|
# Returns extended option values for a resource at +url+ in a hash
|
61
90
|
# with each field name lower-cased. See allowed?() for a list of
|
62
91
|
# errors that may be raised.
|
@@ -169,4 +198,11 @@ class WebRobots
|
|
169
198
|
}
|
170
199
|
raise 'too many HTTP redirects'
|
171
200
|
end
|
201
|
+
|
202
|
+
def crawl_delay_handler(delay, last_checked_at)
|
203
|
+
if last_checked_at
|
204
|
+
delay -= Time.now - last_checked_at
|
205
|
+
sleep delay if delay > 0
|
206
|
+
end
|
207
|
+
end
|
172
208
|
end
|
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#
|
2
2
|
# DO NOT MODIFY!!!!
|
3
|
-
# This file is automatically generated by Racc 1.4.
|
3
|
+
# This file is automatically generated by Racc 1.4.9
|
4
4
|
# from Racc grammer file "".
|
5
5
|
#
|
6
6
|
|
@@ -30,17 +30,21 @@ class WebRobots
|
|
30
30
|
class RobotsTxt
|
31
31
|
class Parser < Racc::Parser
|
32
32
|
|
33
|
-
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry',
|
33
|
+
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 171)
|
34
34
|
|
35
|
-
def initialize(target = nil)
|
35
|
+
def initialize(target, crawl_delay_handler = nil)
|
36
36
|
super()
|
37
37
|
@target = target
|
38
|
+
@crawl_delay_handler = crawl_delay_handler
|
38
39
|
end
|
39
40
|
|
40
41
|
def parse!(input, site)
|
41
42
|
parse(input, site)
|
42
43
|
rescue Error => e
|
43
|
-
RobotsTxt.new(site, nil,
|
44
|
+
RobotsTxt.new(site, nil,
|
45
|
+
:error => e,
|
46
|
+
:target => @target,
|
47
|
+
:crawl_delay_handler => @crawl_delay_handler)
|
44
48
|
end
|
45
49
|
|
46
50
|
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
|
@@ -334,7 +338,9 @@ module_eval(<<'.,.,', 'robotstxt.ry', 11)
|
|
334
338
|
def _reduce_2(val, _values, result)
|
335
339
|
body = val[2]
|
336
340
|
result = RobotsTxt.new(@site, body,
|
337
|
-
:target => @target,
|
341
|
+
:target => @target,
|
342
|
+
:sitemaps => @sitemaps,
|
343
|
+
:crawl_delay_handler => @crawl_delay_handler)
|
338
344
|
|
339
345
|
result
|
340
346
|
end
|
@@ -368,7 +374,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 11)
|
|
368
374
|
|
369
375
|
# reduce 16 omitted
|
370
376
|
|
371
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
377
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 44)
|
372
378
|
def _reduce_17(val, _values, result)
|
373
379
|
@sitemaps << val[3]
|
374
380
|
|
@@ -376,7 +382,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 42)
|
|
376
382
|
end
|
377
383
|
.,.,
|
378
384
|
|
379
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
385
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 49)
|
380
386
|
def _reduce_18(val, _values, result)
|
381
387
|
result = []
|
382
388
|
result << val[0]
|
@@ -385,7 +391,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 47)
|
|
385
391
|
end
|
386
392
|
.,.,
|
387
393
|
|
388
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
394
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 54)
|
389
395
|
def _reduce_19(val, _values, result)
|
390
396
|
result = []
|
391
397
|
|
@@ -393,7 +399,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 52)
|
|
393
399
|
end
|
394
400
|
.,.,
|
395
401
|
|
396
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
402
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 60)
|
397
403
|
def _reduce_20(val, _values, result)
|
398
404
|
result << val[2]
|
399
405
|
|
@@ -401,7 +407,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 58)
|
|
401
407
|
end
|
402
408
|
.,.,
|
403
409
|
|
404
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
410
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 66)
|
405
411
|
def _reduce_21(val, _values, result)
|
406
412
|
val[2].each_with_index { |line, i|
|
407
413
|
warn "%s line %d: %s: orphan rule line" %
|
@@ -416,7 +422,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 64)
|
|
416
422
|
|
417
423
|
# reduce 23 omitted
|
418
424
|
|
419
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
425
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 81)
|
420
426
|
def _reduce_24(val, _values, result)
|
421
427
|
result = Record.new(val[1], val[2])
|
422
428
|
|
@@ -424,7 +430,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 79)
|
|
424
430
|
end
|
425
431
|
.,.,
|
426
432
|
|
427
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
433
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 86)
|
428
434
|
def _reduce_25(val, _values, result)
|
429
435
|
result = [val[0]]
|
430
436
|
|
@@ -432,7 +438,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 84)
|
|
432
438
|
end
|
433
439
|
.,.,
|
434
440
|
|
435
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
441
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 91)
|
436
442
|
def _reduce_26(val, _values, result)
|
437
443
|
result << val[1]
|
438
444
|
|
@@ -442,7 +448,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 89)
|
|
442
448
|
|
443
449
|
# reduce 27 omitted
|
444
450
|
|
445
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
451
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 98)
|
446
452
|
def _reduce_28(val, _values, result)
|
447
453
|
result = AgentLine.new(val[0], val[3])
|
448
454
|
|
@@ -454,7 +460,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 96)
|
|
454
460
|
|
455
461
|
# reduce 30 omitted
|
456
462
|
|
457
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
463
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 106)
|
458
464
|
def _reduce_31(val, _values, result)
|
459
465
|
result = [result]
|
460
466
|
@rulelinenos = []
|
@@ -463,7 +469,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 104)
|
|
463
469
|
end
|
464
470
|
.,.,
|
465
471
|
|
466
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
472
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 112)
|
467
473
|
def _reduce_32(val, _values, result)
|
468
474
|
result << val[1]
|
469
475
|
@rulelinenos << @lineno
|
@@ -482,7 +488,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 110)
|
|
482
488
|
|
483
489
|
# reduce 37 omitted
|
484
490
|
|
485
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
491
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 125)
|
486
492
|
def _reduce_38(val, _values, result)
|
487
493
|
result = AllowLine.new(val[0], val[3])
|
488
494
|
|
@@ -490,7 +496,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 123)
|
|
490
496
|
end
|
491
497
|
.,.,
|
492
498
|
|
493
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
499
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 130)
|
494
500
|
def _reduce_39(val, _values, result)
|
495
501
|
result = DisallowLine.new(val[0], val[3])
|
496
502
|
|
@@ -498,7 +504,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 128)
|
|
498
504
|
end
|
499
505
|
.,.,
|
500
506
|
|
501
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
507
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 135)
|
502
508
|
def _reduce_40(val, _values, result)
|
503
509
|
result = CrawlDelayLine.new(val[0], val[3])
|
504
510
|
|
@@ -506,7 +512,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 133)
|
|
506
512
|
end
|
507
513
|
.,.,
|
508
514
|
|
509
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
515
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 140)
|
510
516
|
def _reduce_41(val, _values, result)
|
511
517
|
result = ExtentionLine.new(val[0], val[3])
|
512
518
|
|
@@ -528,11 +534,12 @@ end # class Parser
|
|
528
534
|
@timestamp = Time.now
|
529
535
|
@site = site
|
530
536
|
@options = options || {}
|
531
|
-
@
|
537
|
+
@last_checked_at = nil
|
532
538
|
|
533
539
|
@error = @options[:error]
|
534
540
|
@target = @options[:target]
|
535
541
|
@sitemaps = @options[:sitemaps] || []
|
542
|
+
@crawl_delay_handler = @options[:crawl_delay_handler]
|
536
543
|
|
537
544
|
if records && !records.empty?
|
538
545
|
@records, defaults = [], []
|
@@ -578,14 +585,18 @@ end # class Parser
|
|
578
585
|
def allow?(request_uri, user_agent = nil)
|
579
586
|
record = find_record(user_agent) or return true
|
580
587
|
allow = record.allow?(request_uri)
|
581
|
-
if
|
582
|
-
delay
|
583
|
-
sleep delay if delay > 0
|
588
|
+
if delay = record.delay and @crawl_delay_handler
|
589
|
+
@crawl_delay_handler.call(delay, @last_checked_at)
|
584
590
|
end
|
585
|
-
@
|
591
|
+
@last_checked_at = Time.now
|
586
592
|
return allow
|
587
593
|
end
|
588
594
|
|
595
|
+
def crawl_delay(user_agent = nil)
|
596
|
+
record = find_record(user_agent) or return 0
|
597
|
+
record.delay or return 0
|
598
|
+
end
|
599
|
+
|
589
600
|
def options(user_agent = nil)
|
590
601
|
record = find_record(user_agent) or return {}
|
591
602
|
record.options
|
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -11,7 +11,9 @@ rule
|
|
11
11
|
{
|
12
12
|
body = val[2]
|
13
13
|
result = RobotsTxt.new(@site, body,
|
14
|
-
:target => @target,
|
14
|
+
:target => @target,
|
15
|
+
:sitemaps => @sitemaps,
|
16
|
+
:crawl_delay_handler => @crawl_delay_handler)
|
15
17
|
}
|
16
18
|
|
17
19
|
body :
|
@@ -167,15 +169,19 @@ class WebRobots
|
|
167
169
|
class RobotsTxt
|
168
170
|
---- inner
|
169
171
|
|
170
|
-
def initialize(target = nil)
|
172
|
+
def initialize(target, crawl_delay_handler = nil)
|
171
173
|
super()
|
172
174
|
@target = target
|
175
|
+
@crawl_delay_handler = crawl_delay_handler
|
173
176
|
end
|
174
177
|
|
175
178
|
def parse!(input, site)
|
176
179
|
parse(input, site)
|
177
180
|
rescue Error => e
|
178
|
-
RobotsTxt.new(site, nil,
|
181
|
+
RobotsTxt.new(site, nil,
|
182
|
+
:error => e,
|
183
|
+
:target => @target,
|
184
|
+
:crawl_delay_handler => @crawl_delay_handler)
|
179
185
|
end
|
180
186
|
|
181
187
|
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
|
@@ -263,11 +269,12 @@ class WebRobots
|
|
263
269
|
@timestamp = Time.now
|
264
270
|
@site = site
|
265
271
|
@options = options || {}
|
266
|
-
@
|
272
|
+
@last_checked_at = nil
|
267
273
|
|
268
274
|
@error = @options[:error]
|
269
275
|
@target = @options[:target]
|
270
276
|
@sitemaps = @options[:sitemaps] || []
|
277
|
+
@crawl_delay_handler = @options[:crawl_delay_handler]
|
271
278
|
|
272
279
|
if records && !records.empty?
|
273
280
|
@records, defaults = [], []
|
@@ -313,14 +320,18 @@ class WebRobots
|
|
313
320
|
def allow?(request_uri, user_agent = nil)
|
314
321
|
record = find_record(user_agent) or return true
|
315
322
|
allow = record.allow?(request_uri)
|
316
|
-
if
|
317
|
-
delay
|
318
|
-
sleep delay if delay > 0
|
323
|
+
if delay = record.delay and @crawl_delay_handler
|
324
|
+
@crawl_delay_handler.call(delay, @last_checked_at)
|
319
325
|
end
|
320
|
-
@
|
326
|
+
@last_checked_at = Time.now
|
321
327
|
return allow
|
322
328
|
end
|
323
329
|
|
330
|
+
def crawl_delay(user_agent = nil)
|
331
|
+
record = find_record(user_agent) or return 0
|
332
|
+
record.delay or return 0
|
333
|
+
end
|
334
|
+
|
324
335
|
def options(user_agent = nil)
|
325
336
|
record = find_record(user_agent) or return {}
|
326
337
|
record.options
|
data/test/test_webrobots.rb
CHANGED
@@ -384,6 +384,12 @@ Option1: Foo
|
|
384
384
|
Option2: Hello
|
385
385
|
Crawl-Delay: 1.5
|
386
386
|
|
387
|
+
User-Agent: HerBot
|
388
|
+
Disallow: /2heavy/
|
389
|
+
Allow: /2heavy/*.html
|
390
|
+
Option1: Baz
|
391
|
+
Option2: Qux
|
392
|
+
|
387
393
|
User-Agent: *
|
388
394
|
Disallow: /2heavy/
|
389
395
|
Allow: /2heavy/*.html
|
@@ -400,6 +406,9 @@ Option3: Hi
|
|
400
406
|
}
|
401
407
|
|
402
408
|
@robots_mybot = WebRobots.new('MyBot', :http_get => http_get)
|
409
|
+
@robots_mybot_ignore = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => :ignore)
|
410
|
+
@robots_mybot_custom = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => proc { |*args| @delay_args = args })
|
411
|
+
@robots_herbot = WebRobots.new('HerBot', :http_get => http_get)
|
403
412
|
@robots_hisbot = WebRobots.new('HisBot', :http_get => http_get)
|
404
413
|
end
|
405
414
|
|
@@ -411,6 +420,27 @@ Option3: Hi
|
|
411
420
|
assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2')
|
412
421
|
assert_equal 'Hello', options['option2']
|
413
422
|
|
423
|
+
options = @robots_mybot_ignore.options('http://www.example.org/')
|
424
|
+
assert_equal 2, options.size
|
425
|
+
assert_equal 'Foo', @robots_mybot_ignore.option('http://www.example.org/', 'Option1')
|
426
|
+
assert_equal 'Foo', options['option1']
|
427
|
+
assert_equal 'Hello', @robots_mybot_ignore.option('http://www.example.org/', 'Option2')
|
428
|
+
assert_equal 'Hello', options['option2']
|
429
|
+
|
430
|
+
options = @robots_mybot_custom.options('http://www.example.org/')
|
431
|
+
assert_equal 2, options.size
|
432
|
+
assert_equal 'Foo', @robots_mybot_custom.option('http://www.example.org/', 'Option1')
|
433
|
+
assert_equal 'Foo', options['option1']
|
434
|
+
assert_equal 'Hello', @robots_mybot_custom.option('http://www.example.org/', 'Option2')
|
435
|
+
assert_equal 'Hello', options['option2']
|
436
|
+
|
437
|
+
options = @robots_herbot.options('http://www.example.org/')
|
438
|
+
assert_equal 2, options.size
|
439
|
+
assert_equal 'Baz', @robots_herbot.option('http://www.example.org/', 'Option1')
|
440
|
+
assert_equal 'Baz', options['option1']
|
441
|
+
assert_equal 'Qux', @robots_herbot.option('http://www.example.org/', 'Option2')
|
442
|
+
assert_equal 'Qux', options['option2']
|
443
|
+
|
414
444
|
options = @robots_hisbot.options('http://www.example.org/')
|
415
445
|
assert_equal 2, options.size
|
416
446
|
assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1')
|
@@ -422,11 +452,25 @@ Option3: Hi
|
|
422
452
|
http://www.example.org/sitemap-host1.xml
|
423
453
|
http://www.example.org/sitemap-host2.xml
|
424
454
|
], @robots_mybot.sitemaps('http://www.example.org/')
|
455
|
+
assert_equal %w[
|
456
|
+
http://www.example.org/sitemap-host1.xml
|
457
|
+
http://www.example.org/sitemap-host2.xml
|
458
|
+
], @robots_mybot_ignore.sitemaps('http://www.example.org/')
|
459
|
+
assert_equal %w[
|
460
|
+
http://www.example.org/sitemap-host1.xml
|
461
|
+
http://www.example.org/sitemap-host2.xml
|
462
|
+
], @robots_herbot.sitemaps('http://www.example.org/')
|
425
463
|
assert_equal %w[
|
426
464
|
http://www.example.org/sitemap-host1.xml
|
427
465
|
http://www.example.org/sitemap-host2.xml
|
428
466
|
], @robots_hisbot.sitemaps('http://www.example.org/')
|
429
467
|
|
468
|
+
assert_equal 1.5, @robots_mybot.crawl_delay('http://www.example.org/')
|
469
|
+
assert_equal 1.5, @robots_mybot_ignore.crawl_delay('http://www.example.org/')
|
470
|
+
assert_equal 1.5, @robots_mybot_custom.crawl_delay('http://www.example.org/')
|
471
|
+
assert_equal 0, @robots_herbot.crawl_delay('http://www.example.org/')
|
472
|
+
assert_equal 0, @robots_hisbot.crawl_delay('http://www.example.org/')
|
473
|
+
|
430
474
|
t1 = Time.now
|
431
475
|
@robots_mybot.allowed?('http://www.example.org/')
|
432
476
|
@robots_mybot.allowed?('http://www.example.org/article1.html')
|
@@ -435,6 +479,25 @@ Option3: Hi
|
|
435
479
|
@robots_mybot.allowed?('http://www.example.org/article2.html')
|
436
480
|
t3 = Time.now
|
437
481
|
assert_in_delta 1.5, t3 - t2, 0.1
|
482
|
+
|
483
|
+
t1 = Time.now
|
484
|
+
@robots_mybot_ignore.allowed?('http://www.example.org/')
|
485
|
+
@robots_mybot_ignore.allowed?('http://www.example.org/article1.html')
|
486
|
+
t2 = Time.now
|
487
|
+
assert_in_delta 0, t2 - t1, 0.1
|
488
|
+
@robots_mybot_ignore.allowed?('http://www.example.org/article2.html')
|
489
|
+
t3 = Time.now
|
490
|
+
assert_in_delta 0, t3 - t2, 0.1
|
491
|
+
|
492
|
+
t1 = Time.now
|
493
|
+
@robots_mybot_custom.allowed?('http://www.example.org/')
|
494
|
+
@robots_mybot_custom.allowed?('http://www.example.org/article1.html')
|
495
|
+
t2 = Time.now
|
496
|
+
assert_in_delta 0, t2 - t1, 0.1
|
497
|
+
assert_instance_of Array, @delay_args
|
498
|
+
assert_equal 2, @delay_args.size
|
499
|
+
assert_equal 1.5, @delay_args[0]
|
500
|
+
assert_instance_of Time, @delay_args[1]
|
438
501
|
end
|
439
502
|
end
|
440
503
|
|
data/webrobots.gemspec
CHANGED
@@ -1,68 +1,33 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "webrobots/version"
|
5
4
|
|
6
5
|
Gem::Specification.new do |s|
|
7
|
-
s.name
|
8
|
-
s.version
|
6
|
+
s.name = "webrobots"
|
7
|
+
s.version = Webrobots::VERSION
|
8
|
+
s.authors = ["Akinori MUSHA"]
|
9
|
+
s.email = ["knu@idaemons.org"]
|
10
|
+
s.homepage = %q{https://github.com/knu/webrobots}
|
11
|
+
s.licenses = [%q{2-clause BSDL}]
|
12
|
+
s.summary = %q{A Ruby library to help write robots.txt compliant web robots}
|
13
|
+
s.description = <<-'EOS'
|
14
|
+
This library helps write robots.txt compliant web robots in Ruby.
|
15
|
+
EOS
|
16
|
+
|
17
|
+
s.files = `git ls-files`.split("\n")
|
18
|
+
s.test_files = s.files.grep(%r{/test_[^/]+\.rb$})
|
19
|
+
s.executables = s.files.grep(%r{^bin/[^.]}).map{ |f| File.basename(f) }
|
20
|
+
s.require_paths = ["lib"]
|
9
21
|
|
10
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = "2012-01-24"
|
13
|
-
s.description = "This library helps write robots.txt compliant web robots in Ruby.\n"
|
14
|
-
s.email = "knu@idaemons.org"
|
15
22
|
s.extra_rdoc_files = [
|
16
23
|
"LICENSE.txt",
|
17
24
|
"README.rdoc"
|
18
25
|
]
|
19
|
-
s.files = [
|
20
|
-
".document",
|
21
|
-
"Gemfile",
|
22
|
-
"Gemfile.lock",
|
23
|
-
"LICENSE.txt",
|
24
|
-
"README.rdoc",
|
25
|
-
"Rakefile",
|
26
|
-
"VERSION",
|
27
|
-
"lib/webrobots.rb",
|
28
|
-
"lib/webrobots/nokogiri.rb",
|
29
|
-
"lib/webrobots/robotstxt.rb",
|
30
|
-
"lib/webrobots/robotstxt.ry",
|
31
|
-
"test/helper.rb",
|
32
|
-
"test/test_webrobots.rb",
|
33
|
-
"webrobots.gemspec"
|
34
|
-
]
|
35
|
-
s.homepage = "https://github.com/knu/webrobots"
|
36
|
-
s.licenses = ["2-clause BSDL"]
|
37
|
-
s.require_paths = ["lib"]
|
38
|
-
s.rubygems_version = "1.8.15"
|
39
|
-
s.summary = "A Ruby library to help write robots.txt compliant web robots"
|
40
|
-
|
41
|
-
if s.respond_to? :specification_version then
|
42
|
-
s.specification_version = 3
|
43
26
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
s.add_development_dependency(%q<nokogiri>, [">= 1.4.4"])
|
51
|
-
else
|
52
|
-
s.add_dependency(%q<racc>, [">= 0"])
|
53
|
-
s.add_dependency(%q<shoulda>, [">= 0"])
|
54
|
-
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
55
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
56
|
-
s.add_dependency(%q<rcov>, ["~> 0.9.11"])
|
57
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
58
|
-
end
|
59
|
-
else
|
60
|
-
s.add_dependency(%q<racc>, [">= 0"])
|
61
|
-
s.add_dependency(%q<shoulda>, [">= 0"])
|
62
|
-
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
63
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
64
|
-
s.add_dependency(%q<rcov>, ["~> 0.9.11"])
|
65
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
66
|
-
end
|
27
|
+
s.add_development_dependency("rake", [">= 0.9.2.2"])
|
28
|
+
s.add_development_dependency("racc", [">= 0"]) unless RUBY_PLATFORM == "java"
|
29
|
+
s.add_development_dependency("shoulda", [">= 0"])
|
30
|
+
s.add_development_dependency("rdoc", ["> 2.4.2"])
|
31
|
+
s.add_development_dependency("bundler", [">= 1.2"])
|
32
|
+
s.add_development_dependency("nokogiri", [">= 1.4.4"])
|
67
33
|
end
|
68
|
-
|
metadata
CHANGED
@@ -1,172 +1,162 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 13
|
10
|
-
version: 0.0.13
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Akinori MUSHA
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
requirement:
|
12
|
+
date: 2013-02-15 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
22
17
|
none: false
|
23
|
-
requirements:
|
24
|
-
- -
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
|
27
|
-
|
28
|
-
- 0
|
29
|
-
version: "0"
|
30
|
-
version_requirements: *id001
|
31
|
-
name: racc
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.9.2.2
|
22
|
+
type: :development
|
32
23
|
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.9.2.2
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: racc
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
33
38
|
type: :development
|
34
|
-
|
35
|
-
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
36
41
|
none: false
|
37
|
-
requirements:
|
38
|
-
- -
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
|
41
|
-
|
42
|
-
- 0
|
43
|
-
version: "0"
|
44
|
-
version_requirements: *id002
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
45
47
|
name: shoulda
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
46
55
|
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rdoc
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>'
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 2.4.2
|
47
70
|
type: :development
|
48
|
-
|
49
|
-
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
73
|
none: false
|
51
|
-
requirements:
|
52
|
-
- -
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
|
55
|
-
|
56
|
-
- 1
|
57
|
-
- 0
|
58
|
-
- 0
|
59
|
-
version: 1.0.0
|
60
|
-
version_requirements: *id003
|
74
|
+
requirements:
|
75
|
+
- - ! '>'
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 2.4.2
|
78
|
+
- !ruby/object:Gem::Dependency
|
61
79
|
name: bundler
|
62
|
-
|
63
|
-
type: :development
|
64
|
-
- !ruby/object:Gem::Dependency
|
65
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
66
81
|
none: false
|
67
|
-
requirements:
|
68
|
-
- -
|
69
|
-
- !ruby/object:Gem::Version
|
70
|
-
|
71
|
-
segments:
|
72
|
-
- 1
|
73
|
-
- 6
|
74
|
-
- 4
|
75
|
-
version: 1.6.4
|
76
|
-
version_requirements: *id004
|
77
|
-
name: jeweler
|
78
|
-
prerelease: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '1.2'
|
79
86
|
type: :development
|
80
|
-
- !ruby/object:Gem::Dependency
|
81
|
-
requirement: &id005 !ruby/object:Gem::Requirement
|
82
|
-
none: false
|
83
|
-
requirements:
|
84
|
-
- - ~>
|
85
|
-
- !ruby/object:Gem::Version
|
86
|
-
hash: 45
|
87
|
-
segments:
|
88
|
-
- 0
|
89
|
-
- 9
|
90
|
-
- 11
|
91
|
-
version: 0.9.11
|
92
|
-
version_requirements: *id005
|
93
|
-
name: rcov
|
94
87
|
prerelease: false
|
95
|
-
|
96
|
-
- !ruby/object:Gem::Dependency
|
97
|
-
requirement: &id006 !ruby/object:Gem::Requirement
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
98
89
|
none: false
|
99
|
-
requirements:
|
100
|
-
- -
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
|
103
|
-
|
104
|
-
- 1
|
105
|
-
- 4
|
106
|
-
- 4
|
107
|
-
version: 1.4.4
|
108
|
-
version_requirements: *id006
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '1.2'
|
94
|
+
- !ruby/object:Gem::Dependency
|
109
95
|
name: nokogiri
|
110
|
-
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.4.4
|
111
102
|
type: :development
|
112
|
-
|
113
|
-
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.4.4
|
110
|
+
description: ! 'This library helps write robots.txt compliant web robots in Ruby.
|
114
111
|
|
115
|
-
|
112
|
+
'
|
113
|
+
email:
|
114
|
+
- knu@idaemons.org
|
116
115
|
executables: []
|
117
|
-
|
118
116
|
extensions: []
|
119
|
-
|
120
|
-
extra_rdoc_files:
|
117
|
+
extra_rdoc_files:
|
121
118
|
- LICENSE.txt
|
122
119
|
- README.rdoc
|
123
|
-
files:
|
120
|
+
files:
|
124
121
|
- .document
|
122
|
+
- .gitignore
|
123
|
+
- .travis.yml
|
125
124
|
- Gemfile
|
126
|
-
- Gemfile.lock
|
127
125
|
- LICENSE.txt
|
128
126
|
- README.rdoc
|
129
127
|
- Rakefile
|
130
|
-
- VERSION
|
131
128
|
- lib/webrobots.rb
|
132
129
|
- lib/webrobots/nokogiri.rb
|
133
130
|
- lib/webrobots/robotstxt.rb
|
134
131
|
- lib/webrobots/robotstxt.ry
|
132
|
+
- lib/webrobots/version.rb
|
135
133
|
- test/helper.rb
|
136
134
|
- test/test_webrobots.rb
|
137
135
|
- webrobots.gemspec
|
138
136
|
homepage: https://github.com/knu/webrobots
|
139
|
-
licenses:
|
137
|
+
licenses:
|
140
138
|
- 2-clause BSDL
|
141
139
|
post_install_message:
|
142
140
|
rdoc_options: []
|
143
|
-
|
144
|
-
require_paths:
|
141
|
+
require_paths:
|
145
142
|
- lib
|
146
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
143
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
147
144
|
none: false
|
148
|
-
requirements:
|
149
|
-
- -
|
150
|
-
- !ruby/object:Gem::Version
|
151
|
-
|
152
|
-
|
153
|
-
- 0
|
154
|
-
version: "0"
|
155
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
145
|
+
requirements:
|
146
|
+
- - ! '>='
|
147
|
+
- !ruby/object:Gem::Version
|
148
|
+
version: '0'
|
149
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
156
150
|
none: false
|
157
|
-
requirements:
|
158
|
-
- -
|
159
|
-
- !ruby/object:Gem::Version
|
160
|
-
|
161
|
-
segments:
|
162
|
-
- 0
|
163
|
-
version: "0"
|
151
|
+
requirements:
|
152
|
+
- - ! '>='
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
version: '0'
|
164
155
|
requirements: []
|
165
|
-
|
166
156
|
rubyforge_project:
|
167
|
-
rubygems_version: 1.8.
|
157
|
+
rubygems_version: 1.8.24
|
168
158
|
signing_key:
|
169
159
|
specification_version: 3
|
170
160
|
summary: A Ruby library to help write robots.txt compliant web robots
|
171
|
-
test_files:
|
172
|
-
|
161
|
+
test_files:
|
162
|
+
- test/test_webrobots.rb
|
data/Gemfile.lock
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
GEM
|
2
|
-
remote: http://rubygems.org/
|
3
|
-
specs:
|
4
|
-
git (1.2.5)
|
5
|
-
jeweler (1.6.4)
|
6
|
-
bundler (~> 1.0)
|
7
|
-
git (>= 1.2.5)
|
8
|
-
rake
|
9
|
-
nokogiri (1.5.0)
|
10
|
-
racc (1.4.7)
|
11
|
-
rake (0.9.2.2)
|
12
|
-
rcov (0.9.11)
|
13
|
-
shoulda (2.11.3)
|
14
|
-
|
15
|
-
PLATFORMS
|
16
|
-
ruby
|
17
|
-
|
18
|
-
DEPENDENCIES
|
19
|
-
bundler (>= 1.0.0)
|
20
|
-
jeweler (~> 1.6.4)
|
21
|
-
nokogiri (>= 1.4.4)
|
22
|
-
racc
|
23
|
-
rcov (~> 0.9.11)
|
24
|
-
shoulda
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.0.13
|