scrape 0.1.1 → 0.2
- data/Gemfile.lock +2 -1
- data/lib/scrape.rb +5 -2
- data/lib/scrape/application.rb +11 -3
- data/lib/scrape/cli.rb +9 -7
- data/lib/scrape/robots_txt.rb +54 -0
- data/lib/scrape/robots_txt_rules.rb +24 -0
- data/lib/scrape/site.rb +24 -11
- data/lib/scrape/string_ext.rb +6 -0
- data/lib/scrape/version.rb +1 -1
- data/scrape.gemspec +1 -0
- data/test/unit/application_test.rb +10 -1
- data/test/unit/robots_txt_rules_test.rb +50 -0
- data/test/unit/robots_txt_test.rb +71 -0
- data/test/unit/site_test.rb +51 -7
- metadata +23 -2
data/Gemfile.lock
CHANGED
data/lib/scrape.rb
CHANGED
@@ -3,6 +3,8 @@ require "logger"
 require "open-uri"
 require "bundler/setup"
 
+require "scrape/string_ext.rb"
+
 module Scrape
   require 'scrape/version'
 
@@ -11,9 +13,10 @@ module Scrape
   autoload 'Match', 'scrape/match'
   autoload 'DefaultLoader', 'scrape/default_loader'
   autoload 'DSL', 'scrape/dsl'
-  autoload '…
+  autoload 'RobotsTxt', 'scrape/robots_txt'
+  autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
-  class …
+  class FileNotFound < Exception; end
 
   class << self
     attr_writer :user_agent
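The new Scrape::FileNotFound class gives callers something concrete to rescue. A minimal sketch (the path is illustrative; the CLI's rescue further down suggests Application#run raises it for a missing Scrapefile):

require "scrape"

begin
  Scrape::Application.new("missing/Scrapefile").run
rescue Scrape::FileNotFound
  warn "No Scrapefile found"
end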
data/lib/scrape/application.rb
CHANGED
@@ -1,12 +1,13 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history
+  attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
 
-  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
     @scrapefile = File.expand_path scrapefile
     @loader = loader
     @sites = {}
     @queue = []
     @history = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
 
   def run
@@ -43,14 +44,21 @@ class Scrape::Application
     end
   end
 
+  def ignore_robots_txt= bool
+    sites.each{|_, site| site.ignore_robots_txt = bool }
+    @ignore_robots_txt = bool
+  end
+
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
 
   def load_scrapefile
     return if @scrapefile_loaded
-    …
+    raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
+    result = loader.load scrapefile
     @sites.update result if result.is_a? Hash
+    self.ignore_robots_txt = ignore_robots_txt
     reset
     @scrapefile_loaded = true
   end
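Illustrative use of the widened initializer: the options hash now sits between the scrapefile path and the loader, the flag defaults to false when the key is absent, and the new writer pushes the setting down to every registered site:

app = Scrape::Application.new "Scrapefile", :ignore_robots_txt => true
app.ignore_robots_txt          # => true
app.ignore_robots_txt = false  # also updates each Scrape::Site in app.sites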
data/lib/scrape/cli.rb
CHANGED
@@ -17,6 +17,9 @@ class Scrape::CLI
     opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
       options[:file] = File.expand_path file
     end
+    opts.on "-i", "--ignore-robots-txt", "Ignore robots.txt" do
+      options[:ignore_robots_txt] = true
+    end
     opts.on_tail "-h", "--help", "Show this message" do
       puts opts
       exit
@@ -28,12 +31,11 @@ class Scrape::CLI
     end
     opts.parse argv
 
-    …
-    end
+    Scrape::Application.new(options.delete(:file), options).run
+
+  rescue Scrape::FileNotFound
+    puts "#{command} aborted!"
+    puts "No Scrapefile found"
+    exit -1
   end
 end
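In use, assuming the gem's executable is named scrape (the metadata's bindir: bin suggests this, but the diff doesn't show the binary itself), the new switch would look like:

$ scrape -f path/to/Scrapefile --ignore-robots-txt
$ scrape -i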
data/lib/scrape/robots_txt.rb
ADDED
@@ -0,0 +1,54 @@
+require 'addressable/uri'
+
+class Scrape::RobotsTxt
+  def initialize rules
+    @rules = rules
+    @rules.default = Scrape::RobotsTxtRules.new
+  end
+
+  def user_agents
+    @rules.keys
+  end
+
+  def disallows
+    @rules.values.flatten
+  end
+
+  def [] user_agent
+    rules = @rules[user_agent].clone
+    rules += @rules['*'] unless user_agent == '*'
+    rules
+  end
+
+  def =~ str
+    self[Scrape.user_agent] =~ str
+  end
+
+  def each &block
+    @rules.each &block
+  end
+
+  def self.parse content
+    rules, user_agent = Hash.new, nil
+
+    content.split("\n").each do |line|
+      case line
+      when /^#/
+        next
+      when /User-agent:\s*(.+)/
+        user_agent = $1.strip
+        rules.update user_agent => Scrape::RobotsTxtRules.new
+      when /Disallow:\s*(.+)/
+        rules[user_agent] << $1.strip
+      end
+    end
+
+    new rules
+  end
+
+  def self.load url, default = true
+    url = Addressable::URI.join(url, "/robots.txt") if default
+    parse Scrape.open(url)
+  end
+  public :load
+end
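A short sketch of the parser in action (illustrative input; Scrape.user_agent is whatever the host app configured). Comment lines are skipped, each User-agent line opens a fresh rule set, and Disallow lines append to it; #=~ folds the wildcard rules into the lookup:

robots = Scrape::RobotsTxt.parse <<-TXT
# comment lines are skipped
User-agent: *
Disallow: /private
TXT

robots.user_agents              # => ["*"]
robots.disallows                # => ["/private"]
robots =~ "/private/page.html"  # => true, prefix match via the wildcard rules
robots =~ "/public"             # => false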
data/lib/scrape/robots_txt_rules.rb
ADDED
@@ -0,0 +1,24 @@
+class Scrape::RobotsTxtRules
+  def initialize *rules
+    @rules = rules.flatten
+  end
+
+  def << rule
+    @rules.push *Array(rule).flatten
+    self
+  end
+
+  def + ary
+    dup << ary.to_ary
+  end
+
+  def =~ str
+    str = str.to_str
+    @rules.any?{|rule| str.starts_with rule }
+  end
+
+  def to_a
+    @rules.dup
+  end
+  alias_method :to_ary, :to_a
+end
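The rules object is a thin prefix matcher (#=~ relies on the String#starts_with helper added in scrape/string_ext.rb). A sketch of its semantics:

rules = Scrape::RobotsTxtRules.new "/admin"
rules << ["/tmp", "/cgi-bin"]
rules =~ "/admin/users"  # => true, "/admin" is a prefix
rules =~ "/public"       # => false

merged = rules + ["/private"]  # new instance; the receiver is unchanged
merged.to_a                    # => ["/admin", "/tmp", "/cgi-bin", "/private"]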
data/lib/scrape/site.rb
CHANGED
@@ -1,14 +1,16 @@
-require 'uri'
+require 'addressable/uri'
 require 'nokogiri'
 
 class Scrape::Site
   attr_reader :url, :matches
+  attr_accessor :ignore_robots_txt
 
-  def initialize url
-    @url = URI.parse url
+  def initialize url, options = {}
+    @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
     @matches = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
 
   def add_match matcher, &proc
@@ -23,22 +25,33 @@ class Scrape::Site
 
     @matches.each{|match| match.invoke doc if match =~ url }
 
-    doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
+    doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
   end
 
   def accept? url
-    url…
+    url = normalize url
+    url.starts_with(to_s) && !disallowed?(url)
+  end
+
+  def normalize url, base_url = self.url
+    Addressable::URI.join(base_url, url).to_s
+  end
+
+  def robots_txt
+    @robots_txt ||= Scrape::RobotsTxt.load url
   end
 
-  def …
-    …
-    when /^.+:\/\// then url.dup
-    when /^\// then @url.merge(url).to_s
-    else @url.merge("#{@url.path}/#{url}").to_s
-    end
+  def ignore_robots_txt?
+    !!@ignore_robots_txt
   end
 
   def to_s
     url.to_s
   end
+
+  private
+
+  def disallowed? url
+    !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+  end
 end
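A sketch of the new Addressable-based behavior: relative hrefs now resolve against the page they came from, absolute paths against the root. With robots.txt ignored (the Site default), accept? makes no HTTP request:

site = Scrape::Site.new "http://www.example.com/foo"

site.normalize "bar", "http://www.example.com/foo/boo"
# => "http://www.example.com/foo/bar"  (resolved against the referring page)
site.normalize "/baz"
# => "http://www.example.com/baz"      (absolute path, resolved against the root)

site.accept? "http://www.example.com/foo/bar"  # => true, inside the site
site.accept? "http://www.example.org/bar"      # => false, different host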
data/lib/scrape/version.rb
CHANGED
data/scrape.gemspec
CHANGED
data/test/unit/application_test.rb
CHANGED
@@ -54,7 +54,7 @@ class ApplicationTest < Scrape::TestCase
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
     test_loader.expect :load, nil, [filepath]
-    Scrape::Application.new(filepath, test_loader).run
+    Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
   end
 
@@ -69,4 +69,13 @@ class ApplicationTest < Scrape::TestCase
     3.times{ app.enqueue "http://example.com" }
     assert_equal ["http://example.com"], app.queue
   end
+
+  test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
+    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+    app = Scrape::Application.new(".")
+    app.sites.update site.to_s => site
+    assert_equal false, site.ignore_robots_txt
+    app.ignore_robots_txt = true
+    assert_equal true, site.ignore_robots_txt
+  end
 end
data/test/unit/robots_txt_rules_test.rb
ADDED
@@ -0,0 +1,50 @@
+require "test_helper"
+
+class RobotsTxtRulesTest < Scrape::TestCase
+  test "#initialize should set the rules passed as multiple arguments" do
+    rules = Scrape::RobotsTxtRules.new "/foo", "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#initialize should set the rules passed as an array argument" do
+    rules = Scrape::RobotsTxtRules.new ["/foo", "/bar"]
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the string" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the array" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << ["/bar", "/too"]
+    assert_equal ["/foo", "/bar", "/too"], rules.to_a
+  end
+
+  test "#+ should return a new instance concatenating itself and the given array" do
+    rules1 = Scrape::RobotsTxtRules.new "/foo"
+    rules2 = rules1 + ["/bar"]
+    refute_equal rules1, rules2
+    assert_kind_of Scrape::RobotsTxtRules, rules2
+    assert_equal ["/foo", "/bar"], rules2.to_a
+  end
+
+  test "#=~ should match anything that begins with /" do
+    rules = Scrape::RobotsTxtRules.new "/"
+    assert rules =~ "/"
+    assert rules =~ "/foo"
+  end
+
+  test "#=~ should match anything that begins with rules" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert rules =~ "/foo"
+    assert rules =~ "/foo/"
+    assert rules =~ "/foo/bar"
+    assert rules =~ "/foo.html"
+    refute rules =~ "/bar"
+  end
+end
data/test/unit/robots_txt_test.rb
ADDED
@@ -0,0 +1,71 @@
+require "test_helper"
+
+class RobotsTxtTest < Scrape::TestCase
+  test "#user_agents should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => []
+    assert_equal ["Test"], robots.user_agents
+  end
+
+  test "#disallows should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots.disallows
+  end
+
+  test "#[] should return all disallows for the specified user agent" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for the specified user agent including wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/foo", "/bar"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/bar"], robots["*"]
+  end
+
+  test ".parse should return new instance parsed from a string" do
+    robots = Scrape::RobotsTxt.parse <<-TXT
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".parse should return new empty instance" do
+    robots = Scrape::RobotsTxt.parse ""
+    assert_equal [], robots.user_agents
+    assert_equal [], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+        User-agent: Test
+        Disallow: /foo
+        Disallow: /bar
+      TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/robots.txt"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url with the path defaulted" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+        User-agent: Test
+        Disallow: /foo
+        Disallow: /bar
+      TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+end
data/test/unit/site_test.rb
CHANGED
@@ -24,8 +24,8 @@ class SiteTest < Scrape::TestCase
     assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
 
-  test "#parse should return relative urls to the …
-    stub_request(:get, "http://www.example.com/…
+  test "#parse should return relative urls to the specified url" do
+    stub_request(:get, "http://www.example.com/foo/bar").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
       to_return(:status => 200, :body => <<-HTML)
         <html>
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
       HTML
 
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal ["http://www.example.com/link1.html"], site.parse("/…
+    assert_equal ["http://www.example.com/foo/link1.html"], site.parse("/foo/bar")
   end
 
   test "#parse should return no urls" do
@@ -73,9 +73,36 @@ class SiteTest < Scrape::TestCase
     assert ok, "Match was not invoked"
   end
 
-  test "#accept? should return true when specified url inside the site's url" do
-    …
-    assert …
+  test "#accept? should return true when specified url is inside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is outside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    refute site.accept?("http://www.example.com/bar")
+  end
+
+  test "#accept? should return true when specified url is inside the site's url and allowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+        User-agent: #{Scrape.user_agent}
+        Disallow: /bar
+      TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is inside the site's url and disallowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+        User-agent: #{Scrape.user_agent}
+        Disallow: /foo
+      TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    refute site.accept?("http://www.example.com/foo/bar"), "URL should not be accepted"
   end
 
   test "#normalize should return a url when string begins with a slash" do
@@ -84,7 +111,7 @@ class SiteTest < Scrape::TestCase
   end
 
   test "#normalize should return a url with the string appended" do
-    site = Scrape::Site.new "http://www.example.com/foo"
+    site = Scrape::Site.new "http://www.example.com/foo/boo"
     assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
   end
 
@@ -92,4 +119,21 @@ class SiteTest < Scrape::TestCase
     site = Scrape::Site.new "http://www.example.com/foo"
    assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
   end
+
+  test "#normalize should return a url when string is a forward slash" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/", site.normalize("/")
+  end
+
+  test "#robots_txt should return a RobotsTxt instance from the site's url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+        User-agent: Test
+        Disallow: /foo
+      TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo"
+    robots = site.robots_txt
+    assert_kind_of Scrape::RobotsTxt, robots
+  end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.…
+  version: '0.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-…
+date: 2012-07-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
   - - ~>
     - !ruby/object:Gem::Version
       version: 1.5.5
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com
@@ -50,7 +66,10 @@ files:
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
+- lib/scrape/robots_txt.rb
+- lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
+- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -60,6 +79,8 @@ files:
 - test/unit/application_test.rb
 - test/unit/default_loader_test.rb
 - test/unit/match_test.rb
+- test/unit/robots_txt_rules_test.rb
+- test/unit/robots_txt_test.rb
 - test/unit/scrape_test.rb
 - test/unit/site_test.rb
 homepage: http://github.com/evilmarty/scrape
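For reference, the addressable dependency added above corresponds to this Gemfile line (note the metadata records it with type: :development, even though the library code requires it at runtime):

gem "addressable", "~> 2.2.8"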