scrape 0.1.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    scrape (0.1.1)
+    scrape (0.2)
 
 GEM
   remote: http://rubygems.org/
@@ -17,6 +17,7 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  addressable (~> 2.2.8)
   nokogiri (~> 1.5.5)
   scrape!
   webmock (~> 1.8.7)
data/lib/scrape.rb CHANGED
@@ -3,6 +3,8 @@ require "logger"
 require "open-uri"
 require "bundler/setup"
 
+require "scrape/string_ext.rb"
+
 module Scrape
   require 'scrape/version'
 
@@ -11,9 +13,10 @@ module Scrape
   autoload 'Match', 'scrape/match'
   autoload 'DefaultLoader', 'scrape/default_loader'
   autoload 'DSL', 'scrape/dsl'
-  autoload 'URI', 'scrape/uri'
+  autoload 'RobotsTxt', 'scrape/robots_txt'
+  autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
-  class ScrapeFileNotFound < Exception; end
+  class FileNotFound < Exception; end
 
   class << self
     attr_writer :user_agent
data/lib/scrape/application.rb CHANGED
@@ -1,12 +1,13 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history
+  attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
 
-  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
     @scrapefile = File.expand_path scrapefile
     @loader = loader
     @sites = {}
     @queue = []
     @history = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
 
   def run
@@ -43,14 +44,21 @@ class Scrape::Application
     end
   end
 
+  def ignore_robots_txt= bool
+    sites.each{|_, site| site.ignore_robots_txt = bool }
+    @ignore_robots_txt = bool
+  end
+
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
 
   def load_scrapefile
     return if @scrapefile_loaded
-    result = loader.load(scrapefile)
+    raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
+    result = loader.load scrapefile
     @sites.update result if result.is_a? Hash
+    self.ignore_robots_txt = ignore_robots_txt
     reset
     @scrapefile_loaded = true
   end
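
A brief usage sketch (not part of the diff, and assuming a Scrapefile exists at the given path) of the new constructor signature: the options hash comes second, and the loader keeps its default third position.

    require 'scrape'

    # Pass :ignore_robots_txt => true to skip robots.txt checks on every site.
    app = Scrape::Application.new "Scrapefile", :ignore_robots_txt => true
    app.run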
data/lib/scrape/cli.rb CHANGED
@@ -17,6 +17,9 @@ class Scrape::CLI
       opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
         options[:file] = File.expand_path file
       end
+      opts.on "-i", "--ignore-robots-txt", "Ignore robots.txt" do
+        options[:ignore_robots_txt] = true
+      end
       opts.on_tail "-h", "--help", "Show this message" do
         puts opts
         exit
@@ -28,12 +31,11 @@ class Scrape::CLI
     end
     opts.parse argv
 
-    if File.exists? options[:file]
-      Scrape::Application.new(options[:file]).run
-    else
-      puts "#{command} aborted!"
-      puts "No Scrapefile found"
-      exit -1
-    end
+    Scrape::Application.new(options.delete(:file), options).run
+
+  rescue Scrape::FileNotFound
+    puts "#{command} aborted!"
+    puts "No Scrapefile found"
+    exit -1
   end
 end
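
Note (not part of the diff): assuming the gem ships an executable named scrape that wraps this class, the new switch would be passed as "scrape -i -f Scrapefile". The file-existence check now lives in Scrape::Application, which raises Scrape::FileNotFound for the CLI to rescue.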
data/lib/scrape/robots_txt.rb ADDED
@@ -0,0 +1,54 @@
+require 'addressable/uri'
+
+class Scrape::RobotsTxt
+  def initialize rules
+    @rules = rules
+    @rules.default = Scrape::RobotsTxtRules.new
+  end
+
+  def user_agents
+    @rules.keys
+  end
+
+  def disallows
+    @rules.values.flatten
+  end
+
+  def [] user_agent
+    rules = @rules[user_agent].clone
+    rules += @rules['*'] unless user_agent == '*'
+    rules
+  end
+
+  def =~ str
+    self[Scrape.user_agent] =~ str
+  end
+
+  def each &block
+    @rules.each &block
+  end
+
+  def self.parse content
+    rules, user_agent = Hash.new, nil
+
+    content.split("\n").each do |line|
+      case line
+      when /^#/
+        next
+      when /User-agent:\s*(.+)/
+        user_agent = $1.strip
+        rules.update user_agent => Scrape::RobotsTxtRules.new
+      when /Disallow:\s*(.+)/
+        rules[user_agent] << $1.strip
+      end
+    end
+
+    new rules
+  end
+
+  def self.load url, default = true
+    url = Addressable::URI.join(url, "/robots.txt") if default
+    parse Scrape.open(url)
+  end
+  public :load
+end
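
A minimal sketch (not part of the diff; "SomeBot" and the rules are illustrative values) of how the parser above behaves, using only methods defined in this file plus String#starts_with from string_ext:

    require 'scrape'

    robots = Scrape::RobotsTxt.parse <<-TXT
      User-agent: *
      Disallow: /private
    TXT

    robots.user_agents                    # => ["*"]
    robots.disallows                      # => ["/private"]
    # "SomeBot" is an arbitrary user agent; it picks up the wildcard rules.
    robots["SomeBot"].to_a                # => ["/private"]
    robots["SomeBot"] =~ "/private/page"  # => true (prefix match via String#starts_with)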
data/lib/scrape/robots_txt_rules.rb ADDED
@@ -0,0 +1,24 @@
+class Scrape::RobotsTxtRules
+  def initialize *rules
+    @rules = rules.flatten
+  end
+
+  def << rule
+    @rules.push *Array(rule).flatten
+    self
+  end
+
+  def + ary
+    dup << ary.to_ary
+  end
+
+  def =~ str
+    str = str.to_str
+    @rules.any?{|rule| str.starts_with rule }
+  end
+
+  def to_a
+    @rules.dup
+  end
+  alias_method :to_ary, :to_a
+end
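
For reference, another small sketch (not part of the diff; the paths are illustrative): the rules object is a simple prefix-matching collection.

    require 'scrape'

    rules = Scrape::RobotsTxtRules.new "/admin"
    rules << ["/tmp", "/private"]         # append a single rule or an array of rules
    (rules + ["/cgi-bin"]).to_a           # => ["/admin", "/tmp", "/private", "/cgi-bin"]
    rules =~ "/admin/users"               # => true, some rule is a prefix of the path
    rules =~ "/public"                    # => false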
data/lib/scrape/site.rb CHANGED
@@ -1,14 +1,16 @@
-require 'uri'
+require 'addressable/uri'
 require 'nokogiri'
 
 class Scrape::Site
   attr_reader :url, :matches
+  attr_accessor :ignore_robots_txt
 
-  def initialize url
-    @url = URI.parse url
+  def initialize url, options = {}
+    @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
     @matches = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
 
   def add_match matcher, &proc
@@ -23,22 +25,33 @@ class Scrape::Site
 
     @matches.each{|match| match.invoke doc if match =~ url }
 
-    doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
+    doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
   end
 
   def accept? url
-    url.to_s[0, to_s.length] == to_s
+    url = normalize url
+    url.starts_with(to_s) && !disallowed?(url)
+  end
+
+  def normalize url, base_url = self.url
+    Addressable::URI.join(base_url, url).to_s
+  end
+
+  def robots_txt
+    @robots_txt ||= Scrape::RobotsTxt.load url
   end
 
-  def normalize url
-    case url
-    when /^.+:\/\// then url.dup
-    when /^\// then @url.merge(url).to_s
-    else @url.merge("#{@url.path}/#{url}").to_s
-    end
+  def ignore_robots_txt?
+    !!@ignore_robots_txt
   end
 
   def to_s
     url.to_s
   end
+
+  private
+
+  def disallowed? url
+    !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+  end
 end
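
A short sketch (not part of the diff; the example.com URLs are illustrative) of the revised Site behaviour: normalization now goes through Addressable and resolves relative links against the page they were found on, and robots.txt is only consulted when :ignore_robots_txt is false.

    require 'scrape'

    site = Scrape::Site.new "http://www.example.com/foo"    # robots.txt ignored by default
    site.normalize("bar")                                   # => "http://www.example.com/bar"
    site.normalize("baz", "http://www.example.com/foo/")    # => "http://www.example.com/foo/baz"
    site.accept? "http://www.example.com/foo/bar"           # => true
    site.accept? "http://www.example.org/other"             # => false

    # With :ignore_robots_txt => false, #accept? would also fetch
    # http://www.example.com/robots.txt and reject disallowed paths.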
data/lib/scrape/string_ext.rb ADDED
@@ -0,0 +1,6 @@
+class String
+  def starts_with str
+    str = str.to_str
+    self[0, str.length] == str
+  end unless instance_methods.include?(:starts_with)
+end
data/lib/scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Scrape
-  VERSION = '0.1.1' unless defined? ::Scrape::VERSION
+  VERSION = '0.2' unless defined? ::Scrape::VERSION
 end
data/scrape.gemspec CHANGED
@@ -19,4 +19,5 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
 
   s.add_development_dependency "nokogiri", "~> 1.5.5"
+  s.add_development_dependency "addressable", "~> 2.2.8"
 end
data/test/unit/application_test.rb CHANGED
@@ -54,7 +54,7 @@ class ApplicationTest < Scrape::TestCase
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
     test_loader.expect :load, nil, [filepath]
-    Scrape::Application.new(filepath, test_loader).run
+    Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
   end
 
@@ -69,4 +69,13 @@ class ApplicationTest < Scrape::TestCase
     3.times{ app.enqueue "http://example.com" }
     assert_equal ["http://example.com"], app.queue
   end
+
+  test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
+    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+    app = Scrape::Application.new(".")
+    app.sites.update site.to_s => site
+    assert_equal false, site.ignore_robots_txt
+    app.ignore_robots_txt = true
+    assert_equal true, site.ignore_robots_txt
+  end
 end
data/test/unit/robots_txt_rules_test.rb ADDED
@@ -0,0 +1,50 @@
+require "test_helper"
+
+class RobotsTxtRulesTest < Scrape::TestCase
+  test "#initialize should set the rules passed as multiple arguments" do
+    rules = Scrape::RobotsTxtRules.new "/foo", "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#initialize should set the rules passed an array argument" do
+    rules = Scrape::RobotsTxtRules.new ["/foo", "/bar"]
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the string" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the array" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << ["/bar", "/too"]
+    assert_equal ["/foo", "/bar", "/too"], rules.to_a
+  end
+
+  test "#+ should return a new instance concatenating it self and the given array" do
+    rules1 = Scrape::RobotsTxtRules.new "/foo"
+    rules2 = rules1 + ["/bar"]
+    refute_equal rules1, rules2
+    assert_kind_of Scrape::RobotsTxtRules, rules2
+    assert_equal ["/foo", "/bar"], rules2.to_a
+  end
+
+  test "#=~ should match anything that beings with /" do
+    rules = Scrape::RobotsTxtRules.new "/"
+    assert rules =~ "/"
+    assert rules =~ "/foo"
+  end
+
+  test "#=~ should match anything that begins with rules" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert rules =~ "/foo"
+    assert rules =~ "/foo/"
+    assert rules =~ "/foo/bar"
+    assert rules =~ "/foo.html"
+    refute rules =~ "/bar"
+  end
+end
data/test/unit/robots_txt_test.rb ADDED
@@ -0,0 +1,71 @@
+require "test_helper"
+
+class RobotsTxtTest < Scrape::TestCase
+  test "#user_agents should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => []
+    assert_equal ["Test"], robots.user_agents
+  end
+
+  test "#disallows should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots.disallows
+  end
+
+  test "#[] should return all disallows for the specified user agent" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for the specified user agent including wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/foo", "/bar"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/bar"], robots["*"]
+  end
+
+  test ".parse should return new instance parsed from a string" do
+    robots = Scrape::RobotsTxt.parse <<-TXT
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".parse should return new empty instance" do
+    robots = Scrape::RobotsTxt.parse ""
+    assert_equal [], robots.user_agents
+    assert_equal [], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/robots.txt"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url with the path defaulted" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+end
data/test/unit/site_test.rb CHANGED
@@ -24,8 +24,8 @@ class SiteTest < Scrape::TestCase
     assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
 
-  test "#parse should return relative urls to the site" do
-    stub_request(:get, "http://www.example.com/test").
+  test "#parse should return relative urls to the specified url" do
+    stub_request(:get, "http://www.example.com/foo/bar").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
       to_return(:status => 200, :body => <<-HTML)
       <html>
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
     HTML
 
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
+    assert_equal ["http://www.example.com/foo/link1.html"], site.parse("/foo/bar")
   end
 
   test "#parse should return no urls" do
@@ -73,9 +73,36 @@ class SiteTest < Scrape::TestCase
     assert ok, "Match was not invoked"
   end
 
-  test "#accept? should return true when specified url inside the site's url" do
-    uri = Scrape::Site.new "http://www.example.com/foo"
-    assert uri.accept?("http://www.example.com/foo/bar")
+  test "#accept? should return true when specified url is inside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is outside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    refute site.accept?("http://www.example.com/bar")
+  end
+
+  test "#accept? should return true when specified url is inside the site's url and allowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /bar
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is inside the site's url and disallowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    refute site.accept?("http://www.example.com/foo/bar"), "URL should not be accepted"
   end
 
   test "#normalize should return a url when string begins with a slash" do
@@ -84,7 +111,7 @@ class SiteTest < Scrape::TestCase
   end
 
   test "#normalize should return a url with the string appended" do
-    site = Scrape::Site.new "http://www.example.com/foo"
+    site = Scrape::Site.new "http://www.example.com/foo/boo"
     assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
   end
 
@@ -92,4 +119,21 @@ class SiteTest < Scrape::TestCase
     site = Scrape::Site.new "http://www.example.com/foo"
    assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
   end
+
+  test "#normalize should return a url when string is a forward slash" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/", site.normalize("/")
+  end
+
+  test "#robots_txt should return a RobotsTxt instance from the site's url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo"
+    robots = site.robots_txt
+    assert_kind_of Scrape::RobotsTxt, robots
+  end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: '0.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-11 00:00:00.000000000 Z
+date: 2012-07-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.5.5
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com
@@ -50,7 +66,10 @@ files:
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
+- lib/scrape/robots_txt.rb
+- lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
+- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -60,6 +79,8 @@ files:
 - test/unit/application_test.rb
 - test/unit/default_loader_test.rb
 - test/unit/match_test.rb
+- test/unit/robots_txt_rules_test.rb
+- test/unit/robots_txt_test.rb
 - test/unit/scrape_test.rb
 - test/unit/site_test.rb
 homepage: http://github.com/evilmarty/scrape