scrape 0.1.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +2 -1
- data/lib/scrape.rb +5 -2
- data/lib/scrape/application.rb +11 -3
- data/lib/scrape/cli.rb +9 -7
- data/lib/scrape/robots_txt.rb +54 -0
- data/lib/scrape/robots_txt_rules.rb +24 -0
- data/lib/scrape/site.rb +24 -11
- data/lib/scrape/string_ext.rb +6 -0
- data/lib/scrape/version.rb +1 -1
- data/scrape.gemspec +1 -0
- data/test/unit/application_test.rb +10 -1
- data/test/unit/robots_txt_rules_test.rb +50 -0
- data/test/unit/robots_txt_test.rb +71 -0
- data/test/unit/site_test.rb +51 -7
- metadata +23 -2
data/Gemfile.lock
CHANGED
data/lib/scrape.rb
CHANGED
```diff
@@ -3,6 +3,8 @@ require "logger"
 require "open-uri"
 require "bundler/setup"
 
+require "scrape/string_ext.rb"
+
 module Scrape
   require 'scrape/version'
 
@@ -11,9 +13,10 @@ module Scrape
   autoload 'Match', 'scrape/match'
   autoload 'DefaultLoader', 'scrape/default_loader'
   autoload 'DSL', 'scrape/dsl'
-  autoload '…
+  autoload 'RobotsTxt', 'scrape/robots_txt'
+  autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
-  class …
+  class FileNotFound < Exception; end
 
   class << self
     attr_writer :user_agent
```
data/lib/scrape/application.rb
CHANGED
```diff
@@ -1,12 +1,13 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history
+  attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
 
-  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
     @scrapefile = File.expand_path scrapefile
     @loader = loader
     @sites = {}
     @queue = []
     @history = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
 
   def run
@@ -43,14 +44,21 @@ class Scrape::Application
     end
   end
 
+  def ignore_robots_txt= bool
+    sites.each{|_, site| site.ignore_robots_txt = bool }
+    @ignore_robots_txt = bool
+  end
+
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
 
   def load_scrapefile
     return if @scrapefile_loaded
-    …
+    raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
+    result = loader.load scrapefile
     @sites.update result if result.is_a? Hash
+    self.ignore_robots_txt = ignore_robots_txt
     reset
     @scrapefile_loaded = true
   end
```
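For orientation, a minimal usage sketch of the new `Scrape::Application` constructor, assuming the gem is installed and loaded via `require "scrape"` and that a file named `Scrapefile` exists in the working directory (both assumptions, not part of this diff):

```ruby
require "scrape"

# The second argument is the new options hash; the loader keeps its default
# (Scrape::DefaultLoader.new). :ignore_robots_txt is pushed down to every site
# once the scrapefile is loaded.
app = Scrape::Application.new "Scrapefile", :ignore_robots_txt => true

begin
  app.run
rescue Scrape::FileNotFound => e
  # Raised by #load_scrapefile when the scrapefile is missing on disk.
  warn "No scrapefile found: #{e.message}"
end
```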
data/lib/scrape/cli.rb
CHANGED
```diff
@@ -17,6 +17,9 @@ class Scrape::CLI
     opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
       options[:file] = File.expand_path file
     end
+    opts.on "-i", "--ignore-robots-txt", "Ignore robots.txt" do
+      options[:ignore_robots_txt] = true
+    end
     opts.on_tail "-h", "--help", "Show this message" do
       puts opts
       exit
@@ -28,12 +31,11 @@ class Scrape::CLI
     end
     opts.parse argv
 
-    …
-    …
-    …
-    …
-    …
-    …
-    end
+    Scrape::Application.new(options.delete(:file), options).run
+
+  rescue Scrape::FileNotFound
+    puts "#{command} aborted!"
+    puts "No Scrapefile found"
+    exit -1
   end
 end
```
data/lib/scrape/robots_txt.rb
ADDED
```diff
@@ -0,0 +1,54 @@
+require 'addressable/uri'
+
+class Scrape::RobotsTxt
+  def initialize rules
+    @rules = rules
+    @rules.default = Scrape::RobotsTxtRules.new
+  end
+
+  def user_agents
+    @rules.keys
+  end
+
+  def disallows
+    @rules.values.flatten
+  end
+
+  def [] user_agent
+    rules = @rules[user_agent].clone
+    rules += @rules['*'] unless user_agent == '*'
+    rules
+  end
+
+  def =~ str
+    self[Scrape.user_agent] =~ str
+  end
+
+  def each &block
+    @rules.each &block
+  end
+
+  def self.parse content
+    rules, user_agent = Hash.new, nil
+
+    content.split("\n").each do |line|
+      case line
+      when /^#/
+        next
+      when /User-agent:\s*(.+)/
+        user_agent = $1.strip
+        rules.update user_agent => Scrape::RobotsTxtRules.new
+      when /Disallow:\s*(.+)/
+        rules[user_agent] << $1.strip
+      end
+    end
+
+    new rules
+  end
+
+  def self.load url, default = true
+    url = Addressable::URI.join(url, "/robots.txt") if default
+    parse Scrape.open(url)
+  end
+  public :load
+end
```
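A rough sketch of the parser added above, mirroring its unit tests; the robots.txt content is invented, and the prefix matching behind `#=~` relies on the `String#starts_with` helper that `scrape/string_ext.rb` (not shown in this diff) appears to provide:

```ruby
require "scrape"

txt = <<-TXT
  User-agent: *
  Disallow: /private
  User-agent: Scrape
  Disallow: /tmp
TXT

robots = Scrape::RobotsTxt.parse txt
robots.user_agents          # => ["*", "Scrape"]
robots.disallows            # => ["/private", "/tmp"]
robots["Scrape"].to_a       # => ["/tmp", "/private"] -- wildcard rules are merged in
robots =~ "/private/index"  # checks the path against the rules for Scrape.user_agent
```

`RobotsTxt.load(url)` does the same after fetching the file, defaulting the path to `/robots.txt` on the given url's host unless `default` is false.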
data/lib/scrape/robots_txt_rules.rb
ADDED
```diff
@@ -0,0 +1,24 @@
+class Scrape::RobotsTxtRules
+  def initialize *rules
+    @rules = rules.flatten
+  end
+
+  def << rule
+    @rules.push *Array(rule).flatten
+    self
+  end
+
+  def + ary
+    dup << ary.to_ary
+  end
+
+  def =~ str
+    str = str.to_str
+    @rules.any?{|rule| str.starts_with rule }
+  end
+
+  def to_a
+    @rules.dup
+  end
+  alias_method :to_ary, :to_a
+end
```
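And a small sketch of the rules container on its own; the paths are arbitrary, and the `=~` prefix match again assumes the `String#starts_with` extension:

```ruby
require "scrape"

rules = Scrape::RobotsTxtRules.new "/private"
rules << "/tmp"                   # append a single rule, returns self
rules << ["/admin", "/cgi-bin"]   # arrays are flattened in

rules.to_a                        # => ["/private", "/tmp", "/admin", "/cgi-bin"]
rules =~ "/private/index.html"    # => true  (some rule prefixes the path)
rules =~ "/public"                # => false
```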
data/lib/scrape/site.rb
CHANGED
```diff
@@ -1,14 +1,16 @@
-require 'uri'
+require 'addressable/uri'
 require 'nokogiri'
 
 class Scrape::Site
   attr_reader :url, :matches
+  attr_accessor :ignore_robots_txt
 
-  def initialize url
-    @url = URI.parse url
+  def initialize url, options = {}
+    @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
     @matches = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
 
   def add_match matcher, &proc
@@ -23,22 +25,33 @@ class Scrape::Site
 
     @matches.each{|match| match.invoke doc if match =~ url }
 
-    doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
+    doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
   end
 
   def accept? url
-    url …
+    url = normalize url
+    url.starts_with(to_s) && !disallowed?(url)
+  end
+
+  def normalize url, base_url = self.url
+    Addressable::URI.join(base_url, url).to_s
+  end
+
+  def robots_txt
+    @robots_txt ||= Scrape::RobotsTxt.load url
   end
 
-  def …
-    …
-    when /^.+:\/\// then url.dup
-    when /^\// then @url.merge(url).to_s
-    else @url.merge("#{@url.path}/#{url}").to_s
-    end
+  def ignore_robots_txt?
+    !!@ignore_robots_txt
   end
 
   def to_s
     url.to_s
   end
+
+  private
+
+  def disallowed? url
+    !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+  end
 end
```
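A sketch of the reworked `Scrape::Site` behaviour, in line with the tests further down; the URLs are placeholders, and note that with `:ignore_robots_txt => false` the first `#accept?` call fetches the site's robots.txt over HTTP:

```ruby
require "scrape"

site = Scrape::Site.new "http://www.example.com/foo"

# Links are resolved against the site's url (or an explicit base) via Addressable.
site.normalize "/bar"                          # => "http://www.example.com/bar"
site.normalize "http://www.example.org/bar"    # => "http://www.example.org/bar"

# A directly-built Site ignores robots.txt by default (Application overrides this).
site.accept? "http://www.example.com/foo/bar"  # => true  (inside the site)
site.accept? "http://www.example.com/bar"      # => false (outside the site)

# Opting in: paths disallowed for Scrape.user_agent are rejected.
strict = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
strict.accept? "http://www.example.com/foo/bar"  # false if robots.txt disallows /foo
```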
data/lib/scrape/version.rb
CHANGED
data/scrape.gemspec
CHANGED
data/test/unit/application_test.rb
CHANGED
```diff
@@ -54,7 +54,7 @@ class ApplicationTest < Scrape::TestCase
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
     test_loader.expect :load, nil, [filepath]
-    Scrape::Application.new(filepath, test_loader).run
+    Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
   end
 
@@ -69,4 +69,13 @@ class ApplicationTest < Scrape::TestCase
     3.times{ app.enqueue "http://example.com" }
     assert_equal ["http://example.com"], app.queue
   end
+
+  test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
+    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+    app = Scrape::Application.new(".")
+    app.sites.update site.to_s => site
+    assert_equal false, site.ignore_robots_txt
+    app.ignore_robots_txt = true
+    assert_equal true, site.ignore_robots_txt
+  end
 end
```
data/test/unit/robots_txt_rules_test.rb
ADDED
```diff
@@ -0,0 +1,50 @@
+require "test_helper"
+
+class RobotsTxtRulesTest < Scrape::TestCase
+  test "#initialize should set the rules passed as multiple arguments" do
+    rules = Scrape::RobotsTxtRules.new "/foo", "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#initialize should set the rules passed an array argument" do
+    rules = Scrape::RobotsTxtRules.new ["/foo", "/bar"]
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the string" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the array" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << ["/bar", "/too"]
+    assert_equal ["/foo", "/bar", "/too"], rules.to_a
+  end
+
+  test "#+ should return a new instance concatenating it self and the given array" do
+    rules1 = Scrape::RobotsTxtRules.new "/foo"
+    rules2 = rules1 + ["/bar"]
+    refute_equal rules1, rules2
+    assert_kind_of Scrape::RobotsTxtRules, rules2
+    assert_equal ["/foo", "/bar"], rules2.to_a
+  end
+
+  test "#=~ should match anything that beings with /" do
+    rules = Scrape::RobotsTxtRules.new "/"
+    assert rules =~ "/"
+    assert rules =~ "/foo"
+  end
+
+  test "#=~ should match anything that begins with rules" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert rules =~ "/foo"
+    assert rules =~ "/foo/"
+    assert rules =~ "/foo/bar"
+    assert rules =~ "/foo.html"
+    refute rules =~ "/bar"
+  end
+end
```
data/test/unit/robots_txt_test.rb
ADDED
```diff
@@ -0,0 +1,71 @@
+require "test_helper"
+
+class RobotsTxtTest < Scrape::TestCase
+  test "#user_agents should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => []
+    assert_equal ["Test"], robots.user_agents
+  end
+
+  test "#disallows should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots.disallows
+  end
+
+  test "#[] should return all disallows for the specified user agent" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for the specified user agent including wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/foo", "/bar"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/bar"], robots["*"]
+  end
+
+  test ".parse should return new instance parsed from a string" do
+    robots = Scrape::RobotsTxt.parse <<-TXT
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".parse should return new empty instance" do
+    robots = Scrape::RobotsTxt.parse ""
+    assert_equal [], robots.user_agents
+    assert_equal [], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/robots.txt"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url with the path defaulted" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+end
```
data/test/unit/site_test.rb
CHANGED
```diff
@@ -24,8 +24,8 @@ class SiteTest < Scrape::TestCase
     assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
 
-  test "#parse should return relative urls to the …
-    stub_request(:get, "http://www.example.com/…
+  test "#parse should return relative urls to the specified url" do
+    stub_request(:get, "http://www.example.com/foo/bar").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
       to_return(:status => 200, :body => <<-HTML)
       <html>
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
     HTML
 
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal ["http://www.example.com/link1.html"], site.parse("/…
+    assert_equal ["http://www.example.com/foo/link1.html"], site.parse("/foo/bar")
   end
 
   test "#parse should return no urls" do
@@ -73,9 +73,36 @@ class SiteTest < Scrape::TestCase
     assert ok, "Match was not invoked"
   end
 
-  test "#accept? should return true when specified url inside the site's url" do
-    …
-    assert …
+  test "#accept? should return true when specified url is inside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is outside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    refute site.accept?("http://www.example.com/bar")
+  end
+
+  test "#accept? should return true when specified url is inside the site's url and allowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /bar
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is inside the site's url and disallowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    refute site.accept?("http://www.example.com/foo/bar"), "URL should not be accepted"
   end
 
   test "#normalize should return a url when string begins with a slash" do
@@ -84,7 +111,7 @@ class SiteTest < Scrape::TestCase
   end
 
   test "#normalize should return a url with the string appended" do
-    site = Scrape::Site.new "http://www.example.com/foo"
+    site = Scrape::Site.new "http://www.example.com/foo/boo"
     assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
   end
 
@@ -92,4 +119,21 @@ class SiteTest < Scrape::TestCase
     site = Scrape::Site.new "http://www.example.com/foo"
    assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
   end
+
+  test "#normalize should return a url when string is a forward slash" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/", site.normalize("/")
+  end
+
+  test "#robots_txt should return a RobotsTxt instance from the site's url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo"
+    robots = site.robots_txt
+    assert_kind_of Scrape::RobotsTxt, robots
+  end
 end
```
metadata
CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: '0.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-…
+date: 2012-07-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.5.5
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com
@@ -50,7 +66,10 @@ files:
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
+- lib/scrape/robots_txt.rb
+- lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
+- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -60,6 +79,8 @@ files:
 - test/unit/application_test.rb
 - test/unit/default_loader_test.rb
 - test/unit/match_test.rb
+- test/unit/robots_txt_rules_test.rb
+- test/unit/robots_txt_test.rb
 - test/unit/scrape_test.rb
 - test/unit/site_test.rb
 homepage: http://github.com/evilmarty/scrape
```