webrobots 0.0.13 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/.travis.yml +15 -0
- data/Gemfile +2 -15
- data/LICENSE.txt +1 -1
- data/README.rdoc +1 -1
- data/Rakefile +10 -43
- data/lib/webrobots.rb +38 -2
- data/lib/webrobots/robotstxt.rb +36 -25
- data/lib/webrobots/robotstxt.ry +19 -8
- data/lib/webrobots/version.rb +3 -0
- data/test/test_webrobots.rb +63 -0
- data/webrobots.gemspec +23 -58
- metadata +111 -121
- data/Gemfile.lock +0 -24
- data/VERSION +0 -1
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -1,17 +1,4 @@
 source "http://rubygems.org"
-# Add dependencies required to use your gem here.
-# Example:
-# gem "activesupport", ">= 2.3.5"
 
-#
-
-group :development do
-  gem "racc", ">= 0"
-  gem "shoulda", ">= 0"
-  gem "bundler", ">= 1.0.0"
-  gem "jeweler", "~> 1.6.4"
-  gem "rcov", "~> 0.9.11"
-
-  # To test the webrobots/nokogiri module.
-  gem "nokogiri", ">= 1.4.4"
-end
+# Specify your gem's dependencies in webrobots.gemspec
+gemspec
data/LICENSE.txt
CHANGED
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -1,45 +1,22 @@
 # encoding: utf-8
 
-require '
-require 'bundler'
-begin
-  Bundler.setup(:default, :development)
-rescue Bundler::BundlerError => e
-  $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
-end
-require 'rake'
+require 'bundler/gem_tasks'
 
-
-Jeweler::Tasks.new do |gem|
-  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
-  gem.name = "webrobots"
-  gem.homepage = "https://github.com/knu/webrobots"
-  gem.license = "2-clause BSDL"
-  gem.summary = %Q{A Ruby library to help write robots.txt compliant web robots}
-  gem.description = <<-'EOS'
-This library helps write robots.txt compliant web robots in Ruby.
-EOS
-  gem.email = "knu@idaemons.org"
-  gem.authors = ["Akinori MUSHA"]
-  # dependencies defined in Gemfile
-end
-Jeweler::RubygemsDotOrgTasks.new
+gemspec = Bundler::GemHelper.gemspec
 
 require 'rake/testtask'
 Rake::TestTask.new(:test) do |test|
-  test.libs << '
-  test.
+  test.libs << 'test'
+  test.test_files = gemspec.test_files
   test.verbose = true
 end
 
-require '
-
-
-
-
-
+require 'rdoc/task'
+Rake::RDocTask.new do |rdoc|
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "#{gemspec.name} #{gemspec.version}"
+  rdoc.rdoc_files.include(gemspec.extra_rdoc_files)
+  rdoc.rdoc_files.include('lib/**/*.rb')
 end
 
 task :default => :test
@@ -49,13 +26,3 @@ task :test => 'lib/webrobots/robotstxt.rb'
 file 'lib/webrobots/robotstxt.rb' => 'lib/webrobots/robotstxt.ry' do
   sh 'racc', '-o', 'lib/webrobots/robotstxt.rb', 'lib/webrobots/robotstxt.ry'
 end
-
-require 'rake/rdoctask'
-Rake::RDocTask.new do |rdoc|
-  version = File.exist?('VERSION') ? File.read('VERSION') : ""
-
-  rdoc.rdoc_dir = 'rdoc'
-  rdoc.title = "webrobots #{version}"
-  rdoc.rdoc_files.include('README*')
-  rdoc.rdoc_files.include('lib/**/*.rb')
-end
data/lib/webrobots.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'webrobots/version'
 require 'webrobots/robotstxt'
 require 'uri'
 require 'net/https'
@@ -17,13 +18,34 @@ class WebRobots
   #   the response body if successful, return an empty string if the
   #   resource is not found, and return nil or raise any error on
   #   failure. Redirects should be handled within this proc.
+  #
+  # * :crawl_delay => determines how to react to Crawl-delay
+  #   directives. If +:sleep+ is given, WebRobots sleeps as demanded
+  #   when allowed?(url)/disallowed?(url) is called. This is the
+  #   default behavior. If +:ignore+ is given, WebRobots does
+  #   nothing. If a custom method, proc, or anything that responds to
+  #   .call(delay, last_checked_at), it is called.
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
-    @parser = RobotsTxt::Parser.new(user_agent)
-    @parser_mutex = Mutex.new
 
     options ||= {}
     @http_get = options[:http_get] || method(:http_get)
+    crawl_delay_handler =
+      case value = options[:crawl_delay] || :sleep
+      when :ignore
+        nil
+      when :sleep
+        method(:crawl_delay_handler)
+      else
+        if value.respond_to?(:call)
+          value
+        else
+          raise ArgumentError, "invalid Crawl-delay handler: #{value.inspect}"
+        end
+      end
+
+    @parser = RobotsTxt::Parser.new(user_agent, crawl_delay_handler)
+    @parser_mutex = Mutex.new
 
     @robotstxt = create_cache()
   end
@@ -57,6 +79,13 @@ class WebRobots
     !allowed?(url)
   end
 
+  # Returns the number of seconds that the configured agent should wait
+  # between successive requests to the site identified by +url+ according
+  # to the site's robots.txt +Crawl-delay+ directive.
+  def crawl_delay(url)
+    robots_txt_for(url).crawl_delay()
+  end
+
   # Returns extended option values for a resource at +url+ in a hash
   # with each field name lower-cased. See allowed?() for a list of
   # errors that may be raised.
@@ -169,4 +198,11 @@ class WebRobots
     }
     raise 'too many HTTP redirects'
   end
+
+  def crawl_delay_handler(delay, last_checked_at)
+    if last_checked_at
+      delay -= Time.now - last_checked_at
+      sleep delay if delay > 0
+    end
+  end
 end
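The hunks above introduce a :crawl_delay option on WebRobots.new and a public crawl_delay(url) reader. A minimal usage sketch based on the added code; the user-agent string and URLs are illustrative only:

    require 'webrobots'

    # Default behavior (:sleep): allowed? honors the site's Crawl-delay by
    # sleeping between successive checks against the same site.
    robots = WebRobots.new('MyBot/1.0')
    robots.allowed?('http://www.example.org/index.html')

    # :ignore disables the waiting entirely.
    fast = WebRobots.new('MyBot/1.0', :crawl_delay => :ignore)

    # Either way, the declared delay (in seconds) can still be queried.
    fast.crawl_delay('http://www.example.org/')  # => 1.5 if declared, else 0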
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -1,6 +1,6 @@
 #
 # DO NOT MODIFY!!!!
-# This file is automatically generated by Racc 1.4.
+# This file is automatically generated by Racc 1.4.9
 # from Racc grammer file "".
 #
 
@@ -30,17 +30,21 @@ class WebRobots
   class RobotsTxt
     class Parser < Racc::Parser
 
-module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry',
+module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 171)
 
-  def initialize(target = nil)
+  def initialize(target, crawl_delay_handler = nil)
     super()
     @target = target
+    @crawl_delay_handler = crawl_delay_handler
   end
 
   def parse!(input, site)
     parse(input, site)
   rescue Error => e
-    RobotsTxt.new(site, nil,
+    RobotsTxt.new(site, nil,
+                  :error => e,
+                  :target => @target,
+                  :crawl_delay_handler => @crawl_delay_handler)
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -334,7 +338,9 @@ module_eval(<<'.,.,', 'robotstxt.ry', 11)
   def _reduce_2(val, _values, result)
     body = val[2]
     result = RobotsTxt.new(@site, body,
-                           :target => @target,
+                           :target => @target,
+                           :sitemaps => @sitemaps,
+                           :crawl_delay_handler => @crawl_delay_handler)
 
     result
   end
@@ -368,7 +374,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 11)
 
 # reduce 16 omitted
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 44)
   def _reduce_17(val, _values, result)
     @sitemaps << val[3]
 
@@ -376,7 +382,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 42)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 49)
   def _reduce_18(val, _values, result)
     result = []
     result << val[0]
@@ -385,7 +391,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 47)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 54)
   def _reduce_19(val, _values, result)
     result = []
 
@@ -393,7 +399,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 52)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 60)
   def _reduce_20(val, _values, result)
     result << val[2]
 
@@ -401,7 +407,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 58)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 66)
   def _reduce_21(val, _values, result)
     val[2].each_with_index { |line, i|
       warn "%s line %d: %s: orphan rule line" %
@@ -416,7 +422,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 64)
 
 # reduce 23 omitted
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 81)
   def _reduce_24(val, _values, result)
     result = Record.new(val[1], val[2])
 
@@ -424,7 +430,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 79)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 86)
   def _reduce_25(val, _values, result)
     result = [val[0]]
 
@@ -432,7 +438,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 84)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 91)
   def _reduce_26(val, _values, result)
     result << val[1]
 
@@ -442,7 +448,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 89)
 
 # reduce 27 omitted
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 98)
   def _reduce_28(val, _values, result)
     result = AgentLine.new(val[0], val[3])
 
@@ -454,7 +460,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 96)
 
 # reduce 30 omitted
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 106)
   def _reduce_31(val, _values, result)
     result = [result]
     @rulelinenos = []
@@ -463,7 +469,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 104)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 112)
   def _reduce_32(val, _values, result)
     result << val[1]
     @rulelinenos << @lineno
@@ -482,7 +488,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 110)
 
 # reduce 37 omitted
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 125)
   def _reduce_38(val, _values, result)
     result = AllowLine.new(val[0], val[3])
 
@@ -490,7 +496,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 123)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 130)
   def _reduce_39(val, _values, result)
     result = DisallowLine.new(val[0], val[3])
 
@@ -498,7 +504,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 128)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 135)
   def _reduce_40(val, _values, result)
     result = CrawlDelayLine.new(val[0], val[3])
 
@@ -506,7 +512,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 133)
   end
 .,.,
 
-module_eval(<<'.,.,', 'robotstxt.ry',
+module_eval(<<'.,.,', 'robotstxt.ry', 140)
   def _reduce_41(val, _values, result)
     result = ExtentionLine.new(val[0], val[3])
 
@@ -528,11 +534,12 @@ end # class Parser
     @timestamp = Time.now
     @site = site
     @options = options || {}
-    @
+    @last_checked_at = nil
 
     @error = @options[:error]
     @target = @options[:target]
     @sitemaps = @options[:sitemaps] || []
+    @crawl_delay_handler = @options[:crawl_delay_handler]
 
     if records && !records.empty?
       @records, defaults = [], []
@@ -578,14 +585,18 @@ end # class Parser
     def allow?(request_uri, user_agent = nil)
       record = find_record(user_agent) or return true
       allow = record.allow?(request_uri)
-      if
-        delay
-        sleep delay if delay > 0
+      if delay = record.delay and @crawl_delay_handler
+        @crawl_delay_handler.call(delay, @last_checked_at)
       end
-      @
+      @last_checked_at = Time.now
       return allow
     end
 
+    def crawl_delay(user_agent = nil)
+      record = find_record(user_agent) or return 0
+      record.delay or return 0
+    end
+
     def options(user_agent = nil)
      record = find_record(user_agent) or return {}
      record.options
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -11,7 +11,9 @@ rule
           {
             body = val[2]
             result = RobotsTxt.new(@site, body,
-                                   :target => @target,
+                                   :target => @target,
+                                   :sitemaps => @sitemaps,
+                                   :crawl_delay_handler => @crawl_delay_handler)
           }
 
   body    :
@@ -167,15 +169,19 @@ class WebRobots
   class RobotsTxt
 ---- inner
 
-  def initialize(target = nil)
+  def initialize(target, crawl_delay_handler = nil)
     super()
     @target = target
+    @crawl_delay_handler = crawl_delay_handler
   end
 
   def parse!(input, site)
     parse(input, site)
   rescue Error => e
-    RobotsTxt.new(site, nil,
+    RobotsTxt.new(site, nil,
+                  :error => e,
+                  :target => @target,
+                  :crawl_delay_handler => @crawl_delay_handler)
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
@@ -263,11 +269,12 @@ class WebRobots
     @timestamp = Time.now
     @site = site
     @options = options || {}
-    @
+    @last_checked_at = nil
 
     @error = @options[:error]
     @target = @options[:target]
     @sitemaps = @options[:sitemaps] || []
+    @crawl_delay_handler = @options[:crawl_delay_handler]
 
     if records && !records.empty?
       @records, defaults = [], []
@@ -313,14 +320,18 @@ class WebRobots
     def allow?(request_uri, user_agent = nil)
       record = find_record(user_agent) or return true
       allow = record.allow?(request_uri)
-      if
-        delay
-        sleep delay if delay > 0
+      if delay = record.delay and @crawl_delay_handler
+        @crawl_delay_handler.call(delay, @last_checked_at)
      end
-      @
+      @last_checked_at = Time.now
       return allow
     end
 
+    def crawl_delay(user_agent = nil)
+      record = find_record(user_agent) or return 0
+      record.delay or return 0
+    end
+
     def options(user_agent = nil)
       record = find_record(user_agent) or return {}
       record.options
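As documented in the new option description in lib/webrobots.rb, a custom Crawl-delay handler is anything that responds to call(delay, last_checked_at), where delay is the directive value in seconds and last_checked_at is the time of the previous check for that site (nil on the first request). A sketch of one possible handler, mirroring the built-in :sleep behavior; the variable names are illustrative:

    # Waits out whatever portion of the declared delay has not already
    # elapsed since the previous request to the same site.
    throttle = proc do |delay, last_checked_at|
      if last_checked_at
        remaining = delay - (Time.now - last_checked_at)
        sleep remaining if remaining > 0
      end
    end

    robots = WebRobots.new('MyBot/1.0', :crawl_delay => throttle)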
data/test/test_webrobots.rb
CHANGED
@@ -384,6 +384,12 @@ Option1: Foo
 Option2: Hello
 Crawl-Delay: 1.5
 
+User-Agent: HerBot
+Disallow: /2heavy/
+Allow: /2heavy/*.html
+Option1: Baz
+Option2: Qux
+
 User-Agent: *
 Disallow: /2heavy/
 Allow: /2heavy/*.html
@@ -400,6 +406,9 @@ Option3: Hi
       }
 
       @robots_mybot = WebRobots.new('MyBot', :http_get => http_get)
+      @robots_mybot_ignore = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => :ignore)
+      @robots_mybot_custom = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => proc { |*args| @delay_args = args })
+      @robots_herbot = WebRobots.new('HerBot', :http_get => http_get)
       @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get)
     end
 
@@ -411,6 +420,27 @@ Option3: Hi
       assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2')
       assert_equal 'Hello', options['option2']
 
+      options = @robots_mybot_ignore.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Foo', @robots_mybot_ignore.option('http://www.example.org/', 'Option1')
+      assert_equal 'Foo', options['option1']
+      assert_equal 'Hello', @robots_mybot_ignore.option('http://www.example.org/', 'Option2')
+      assert_equal 'Hello', options['option2']
+
+      options = @robots_mybot_custom.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Foo', @robots_mybot_custom.option('http://www.example.org/', 'Option1')
+      assert_equal 'Foo', options['option1']
+      assert_equal 'Hello', @robots_mybot_custom.option('http://www.example.org/', 'Option2')
+      assert_equal 'Hello', options['option2']
+
+      options = @robots_herbot.options('http://www.example.org/')
+      assert_equal 2, options.size
+      assert_equal 'Baz', @robots_herbot.option('http://www.example.org/', 'Option1')
+      assert_equal 'Baz', options['option1']
+      assert_equal 'Qux', @robots_herbot.option('http://www.example.org/', 'Option2')
+      assert_equal 'Qux', options['option2']
+
       options = @robots_hisbot.options('http://www.example.org/')
       assert_equal 2, options.size
       assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1')
@@ -422,11 +452,25 @@ Option3: Hi
         http://www.example.org/sitemap-host1.xml
         http://www.example.org/sitemap-host2.xml
       ], @robots_mybot.sitemaps('http://www.example.org/')
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_mybot_ignore.sitemaps('http://www.example.org/')
+      assert_equal %w[
+        http://www.example.org/sitemap-host1.xml
+        http://www.example.org/sitemap-host2.xml
+      ], @robots_herbot.sitemaps('http://www.example.org/')
       assert_equal %w[
         http://www.example.org/sitemap-host1.xml
         http://www.example.org/sitemap-host2.xml
       ], @robots_hisbot.sitemaps('http://www.example.org/')
 
+      assert_equal 1.5, @robots_mybot.crawl_delay('http://www.example.org/')
+      assert_equal 1.5, @robots_mybot_ignore.crawl_delay('http://www.example.org/')
+      assert_equal 1.5, @robots_mybot_custom.crawl_delay('http://www.example.org/')
+      assert_equal 0, @robots_herbot.crawl_delay('http://www.example.org/')
+      assert_equal 0, @robots_hisbot.crawl_delay('http://www.example.org/')
+
       t1 = Time.now
       @robots_mybot.allowed?('http://www.example.org/')
       @robots_mybot.allowed?('http://www.example.org/article1.html')
@@ -435,6 +479,25 @@ Option3: Hi
       @robots_mybot.allowed?('http://www.example.org/article2.html')
       t3 = Time.now
       assert_in_delta 1.5, t3 - t2, 0.1
+
+      t1 = Time.now
+      @robots_mybot_ignore.allowed?('http://www.example.org/')
+      @robots_mybot_ignore.allowed?('http://www.example.org/article1.html')
+      t2 = Time.now
+      assert_in_delta 0, t2 - t1, 0.1
+      @robots_mybot_ignore.allowed?('http://www.example.org/article2.html')
+      t3 = Time.now
+      assert_in_delta 0, t3 - t2, 0.1
+
+      t1 = Time.now
+      @robots_mybot_custom.allowed?('http://www.example.org/')
+      @robots_mybot_custom.allowed?('http://www.example.org/article1.html')
+      t2 = Time.now
+      assert_in_delta 0, t2 - t1, 0.1
+      assert_instance_of Array, @delay_args
+      assert_equal 2, @delay_args.size
+      assert_equal 1.5, @delay_args[0]
+      assert_instance_of Time, @delay_args[1]
     end
   end
 
data/webrobots.gemspec
CHANGED
@@ -1,68 +1,33 @@
-# Generated by jeweler
-# DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "webrobots/version"
 
 Gem::Specification.new do |s|
-  s.name
-  s.version
+  s.name        = "webrobots"
+  s.version     = Webrobots::VERSION
+  s.authors     = ["Akinori MUSHA"]
+  s.email       = ["knu@idaemons.org"]
+  s.homepage    = %q{https://github.com/knu/webrobots}
+  s.licenses    = [%q{2-clause BSDL}]
+  s.summary     = %q{A Ruby library to help write robots.txt compliant web robots}
+  s.description = <<-'EOS'
+This library helps write robots.txt compliant web robots in Ruby.
+  EOS
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = s.files.grep(%r{/test_[^/]+\.rb$})
+  s.executables   = s.files.grep(%r{^bin/[^.]}).map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
 
-  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
-  s.authors = ["Akinori MUSHA"]
-  s.date = "2012-01-24"
-  s.description = "This library helps write robots.txt compliant web robots in Ruby.\n"
-  s.email = "knu@idaemons.org"
   s.extra_rdoc_files = [
     "LICENSE.txt",
     "README.rdoc"
   ]
-  s.files = [
-    ".document",
-    "Gemfile",
-    "Gemfile.lock",
-    "LICENSE.txt",
-    "README.rdoc",
-    "Rakefile",
-    "VERSION",
-    "lib/webrobots.rb",
-    "lib/webrobots/nokogiri.rb",
-    "lib/webrobots/robotstxt.rb",
-    "lib/webrobots/robotstxt.ry",
-    "test/helper.rb",
-    "test/test_webrobots.rb",
-    "webrobots.gemspec"
-  ]
-  s.homepage = "https://github.com/knu/webrobots"
-  s.licenses = ["2-clause BSDL"]
-  s.require_paths = ["lib"]
-  s.rubygems_version = "1.8.15"
-  s.summary = "A Ruby library to help write robots.txt compliant web robots"
-
-  if s.respond_to? :specification_version then
-    s.specification_version = 3
 
-
-
-
-
-
-
-      s.add_development_dependency(%q<nokogiri>, [">= 1.4.4"])
-    else
-      s.add_dependency(%q<racc>, [">= 0"])
-      s.add_dependency(%q<shoulda>, [">= 0"])
-      s.add_dependency(%q<bundler>, [">= 1.0.0"])
-      s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
-      s.add_dependency(%q<rcov>, ["~> 0.9.11"])
-      s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
-    end
-  else
-    s.add_dependency(%q<racc>, [">= 0"])
-    s.add_dependency(%q<shoulda>, [">= 0"])
-    s.add_dependency(%q<bundler>, [">= 1.0.0"])
-    s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
-    s.add_dependency(%q<rcov>, ["~> 0.9.11"])
-    s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
-  end
+  s.add_development_dependency("rake", [">= 0.9.2.2"])
+  s.add_development_dependency("racc", [">= 0"]) unless RUBY_PLATFORM == "java"
+  s.add_development_dependency("shoulda", [">= 0"])
+  s.add_development_dependency("rdoc", ["> 2.4.2"])
+  s.add_development_dependency("bundler", [">= 1.2"])
+  s.add_development_dependency("nokogiri", [">= 1.4.4"])
 end
-
metadata
CHANGED
@@ -1,172 +1,162 @@
---- !ruby/object:Gem::Specification 
+--- !ruby/object:Gem::Specification
 name: webrobots
-version: !ruby/object:Gem::Version 
-
+version: !ruby/object:Gem::Version
+  version: 0.1.0
   prerelease: 
-  segments: 
-  - 0
-  - 0
-  - 13
-  version: 0.0.13
 platform: ruby
-authors: 
+authors:
 - Akinori MUSHA
 autorequire: 
 bindir: bin
 cert_chain: []
-
-
-
-
-  requirement:
+date: 2013-02-15 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements: 
-    - - 
-      - !ruby/object:Gem::Version 
-
-
-        - 0
-        version: "0"
-  version_requirements: *id001
-  name: racc
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.9.2.2
+  type: :development
   prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.9.2.2
+- !ruby/object:Gem::Dependency
+  name: racc
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
   type: :development
-
-
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
     none: false
-    requirements: 
-    - - 
-      - !ruby/object:Gem::Version 
-
-
-        - 0
-        version: "0"
-  version_requirements: *id002
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
   name: shoulda
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
   prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>'
+      - !ruby/object:Gem::Version
+        version: 2.4.2
   type: :development
-
-
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
     none: false
-    requirements: 
-    - - 
-      - !ruby/object:Gem::Version 
-
-
-        - 1
-        - 0
-        - 0
-        version: 1.0.0
-  version_requirements: *id003
+    requirements:
+    - - ! '>'
+      - !ruby/object:Gem::Version
+        version: 2.4.2
+- !ruby/object:Gem::Dependency
   name: bundler
-
-  type: :development
-- !ruby/object:Gem::Dependency 
-  requirement: &id004 !ruby/object:Gem::Requirement 
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements: 
-    - - 
-      - !ruby/object:Gem::Version 
-
-        segments: 
-        - 1
-        - 6
-        - 4
-        version: 1.6.4
-  version_requirements: *id004
-  name: jeweler
-  prerelease: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '1.2'
   type: :development
-- !ruby/object:Gem::Dependency 
-  requirement: &id005 !ruby/object:Gem::Requirement 
-    none: false
-    requirements: 
-    - - ~>
-      - !ruby/object:Gem::Version 
-        hash: 45
-        segments: 
-        - 0
-        - 9
-        - 11
-        version: 0.9.11
-  version_requirements: *id005
-  name: rcov
   prerelease: false
-
-- !ruby/object:Gem::Dependency 
-  requirement: &id006 !ruby/object:Gem::Requirement 
+  version_requirements: !ruby/object:Gem::Requirement
     none: false
-    requirements: 
-    - - 
-      - !ruby/object:Gem::Version 
-
-
-        - 1
-        - 4
-        - 4
-        version: 1.4.4
-  version_requirements: *id006
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '1.2'
+- !ruby/object:Gem::Dependency
   name: nokogiri
-
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.4
   type: :development
-
-
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.4
+description: ! 'This library helps write robots.txt compliant web robots in Ruby.
 
-
+'
+email:
+- knu@idaemons.org
 executables: []
-
 extensions: []
-
-extra_rdoc_files: 
+extra_rdoc_files:
 - LICENSE.txt
 - README.rdoc
-files: 
+files:
 - .document
+- .gitignore
+- .travis.yml
 - Gemfile
-- Gemfile.lock
 - LICENSE.txt
 - README.rdoc
 - Rakefile
-- VERSION
 - lib/webrobots.rb
 - lib/webrobots/nokogiri.rb
 - lib/webrobots/robotstxt.rb
 - lib/webrobots/robotstxt.ry
+- lib/webrobots/version.rb
 - test/helper.rb
 - test/test_webrobots.rb
 - webrobots.gemspec
 homepage: https://github.com/knu/webrobots
-licenses: 
+licenses:
 - 2-clause BSDL
 post_install_message: 
 rdoc_options: []
-
-require_paths: 
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement 
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements: 
-  - - 
-    - !ruby/object:Gem::Version 
-
-
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement 
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements: 
-  - - 
-    - !ruby/object:Gem::Version 
-
-      segments: 
-      - 0
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project: 
-rubygems_version: 1.8.
+rubygems_version: 1.8.24
 signing_key: 
 specification_version: 3
 summary: A Ruby library to help write robots.txt compliant web robots
-test_files: 
-
+test_files:
+- test/test_webrobots.rb
data/Gemfile.lock
DELETED
@@ -1,24 +0,0 @@
-GEM
-  remote: http://rubygems.org/
-  specs:
-    git (1.2.5)
-    jeweler (1.6.4)
-      bundler (~> 1.0)
-      git (>= 1.2.5)
-      rake
-    nokogiri (1.5.0)
-    racc (1.4.7)
-    rake (0.9.2.2)
-    rcov (0.9.11)
-    shoulda (2.11.3)
-
-PLATFORMS
-  ruby
-
-DEPENDENCIES
-  bundler (>= 1.0.0)
-  jeweler (~> 1.6.4)
-  nokogiri (>= 1.4.4)
-  racc
-  rcov (~> 0.9.11)
-  shoulda
data/VERSION
DELETED
@@ -1 +0,0 @@
-0.0.13