scrape 0.1.1 → 0.2

data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    scrape (0.1.1)
+    scrape (0.2)
 
 GEM
   remote: http://rubygems.org/
@@ -17,6 +17,7 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  addressable (~> 2.2.8)
   nokogiri (~> 1.5.5)
   scrape!
   webmock (~> 1.8.7)
data/lib/scrape.rb CHANGED
@@ -3,6 +3,8 @@ require "logger"
 require "open-uri"
 require "bundler/setup"
 
+require "scrape/string_ext.rb"
+
 module Scrape
   require 'scrape/version'
 
@@ -11,9 +13,10 @@ module Scrape
   autoload 'Match', 'scrape/match'
   autoload 'DefaultLoader', 'scrape/default_loader'
   autoload 'DSL', 'scrape/dsl'
-  autoload 'URI', 'scrape/uri'
+  autoload 'RobotsTxt', 'scrape/robots_txt'
+  autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
-  class ScrapeFileNotFound < Exception; end
+  class FileNotFound < Exception; end
 
   class << self
     attr_writer :user_agent
data/lib/scrape/application.rb CHANGED
@@ -1,12 +1,13 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history
+  attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
 
-  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
     @scrapefile = File.expand_path scrapefile
     @loader = loader
     @sites = {}
     @queue = []
     @history = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
 
   def run
@@ -43,14 +44,21 @@ class Scrape::Application
     end
   end
 
+  def ignore_robots_txt= bool
+    sites.each{|_, site| site.ignore_robots_txt = bool }
+    @ignore_robots_txt = bool
+  end
+
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
 
   def load_scrapefile
     return if @scrapefile_loaded
-    result = loader.load(scrapefile)
+    raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
+    result = loader.load scrapefile
     @sites.update result if result.is_a? Hash
+    self.ignore_robots_txt = ignore_robots_txt
    reset
    @scrapefile_loaded = true
  end
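For reference, a minimal usage sketch of the changed constructor and the new setter above; the "Scrapefile" paths and the rescue around run are illustrative only:

    require "scrape"

    # Options now sit between the scrapefile path and the loader argument.
    app = Scrape::Application.new "Scrapefile", :ignore_robots_txt => true

    # The new setter pushes the flag down to every registered site.
    app.ignore_robots_txt = false

    # load_scrapefile now raises instead of leaving the existence check to the CLI.
    begin
      Scrape::Application.new("missing/Scrapefile").run
    rescue Scrape::FileNotFound => e
      warn "No Scrapefile found at #{e.message}"
    end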
data/lib/scrape/cli.rb CHANGED
@@ -17,6 +17,9 @@ class Scrape::CLI
       opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
         options[:file] = File.expand_path file
       end
+      opts.on "-i", "--ignore-robots-txt", "Ignore robots.txt" do
+        options[:ignore_robots_txt] = true
+      end
       opts.on_tail "-h", "--help", "Show this message" do
         puts opts
         exit
@@ -28,12 +31,11 @@ class Scrape::CLI
     end
     opts.parse argv
 
-    if File.exists? options[:file]
-      Scrape::Application.new(options[:file]).run
-    else
-      puts "#{command} aborted!"
-      puts "No Scrapefile found"
-      exit -1
-    end
+    Scrape::Application.new(options.delete(:file), options).run
+
+  rescue Scrape::FileNotFound
+    puts "#{command} aborted!"
+    puts "No Scrapefile found"
+    exit -1
   end
 end
data/lib/scrape/robots_txt.rb ADDED
@@ -0,0 +1,54 @@
+require 'addressable/uri'
+
+class Scrape::RobotsTxt
+  def initialize rules
+    @rules = rules
+    @rules.default = Scrape::RobotsTxtRules.new
+  end
+
+  def user_agents
+    @rules.keys
+  end
+
+  def disallows
+    @rules.values.flatten
+  end
+
+  def [] user_agent
+    rules = @rules[user_agent].clone
+    rules += @rules['*'] unless user_agent == '*'
+    rules
+  end
+
+  def =~ str
+    self[Scrape.user_agent] =~ str
+  end
+
+  def each &block
+    @rules.each &block
+  end
+
+  def self.parse content
+    rules, user_agent = Hash.new, nil
+
+    content.split("\n").each do |line|
+      case line
+      when /^#/
+        next
+      when /User-agent:\s*(.+)/
+        user_agent = $1.strip
+        rules.update user_agent => Scrape::RobotsTxtRules.new
+      when /Disallow:\s*(.+)/
+        rules[user_agent] << $1.strip
+      end
+    end
+
+    new rules
+  end
+
+  def self.load url, default = true
+    url = Addressable::URI.join(url, "/robots.txt") if default
+    parse Scrape.open(url)
+  end
+  public :load
+end
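A rough usage sketch of the parser above, based on the class and its tests; the robots.txt body and the "Scraper" user agent are made up for illustration:

    require "scrape"

    content = <<-TXT
      User-agent: Scraper
      Disallow: /private
      User-agent: *
      Disallow: /tmp
    TXT

    robots = Scrape::RobotsTxt.parse content
    robots.user_agents        # => ["Scraper", "*"]
    robots.disallows          # => ["/private", "/tmp"]
    robots["Scraper"].to_a    # => ["/private", "/tmp"]  (wildcard rules are merged in)
    robots["*"].to_a          # => ["/tmp"]
    robots["Scraper"] =~ "/private/page"   # => true, rules match by path prefix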
data/lib/scrape/robots_txt_rules.rb ADDED
@@ -0,0 +1,24 @@
+class Scrape::RobotsTxtRules
+  def initialize *rules
+    @rules = rules.flatten
+  end
+
+  def << rule
+    @rules.push *Array(rule).flatten
+    self
+  end
+
+  def + ary
+    dup << ary.to_ary
+  end
+
+  def =~ str
+    str = str.to_str
+    @rules.any?{|rule| str.starts_with rule }
+  end
+
+  def to_a
+    @rules.dup
+  end
+  alias_method :to_ary, :to_a
+end
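RobotsTxtRules is a small prefix-matching collection; a quick sketch, mirroring its tests further down (the rule paths are illustrative):

    require "scrape"   # also loads scrape/string_ext.rb, which defines String#starts_with

    rules = Scrape::RobotsTxtRules.new "/foo"
    rules << "/bar"                  # append a single rule or an array of rules
    rules += ["/baz"]                # + returns a new RobotsTxtRules instance
    rules.to_a                       # => ["/foo", "/bar", "/baz"]
    rules =~ "/foo/page.html"        # => true, "/foo" is a prefix
    rules =~ "/other"                # => false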
data/lib/scrape/site.rb CHANGED
@@ -1,14 +1,16 @@
-require 'uri'
+require 'addressable/uri'
 require 'nokogiri'
 
 class Scrape::Site
   attr_reader :url, :matches
+  attr_accessor :ignore_robots_txt
 
-  def initialize url
-    @url = URI.parse url
+  def initialize url, options = {}
+    @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
     @matches = []
+    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
 
   def add_match matcher, &proc
@@ -23,22 +25,33 @@ class Scrape::Site
 
     @matches.each{|match| match.invoke doc if match =~ url }
 
-    doc.css("a[href]").map{|node| normalize node['href'] }.select{|url| accept? url }
+    doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
   end
 
   def accept? url
-    url.to_s[0, to_s.length] == to_s
+    url = normalize url
+    url.starts_with(to_s) && !disallowed?(url)
+  end
+
+  def normalize url, base_url = self.url
+    Addressable::URI.join(base_url, url).to_s
+  end
+
+  def robots_txt
+    @robots_txt ||= Scrape::RobotsTxt.load url
   end
 
-  def normalize url
-    case url
-    when /^.+:\/\// then url.dup
-    when /^\// then @url.merge(url).to_s
-    else @url.merge("#{@url.path}/#{url}").to_s
-    end
+  def ignore_robots_txt?
+    !!@ignore_robots_txt
   end
 
   def to_s
     url.to_s
   end
+
+  private
+
+  def disallowed? url
+    !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+  end
 end
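A short sketch of the reworked normalize/accept? behaviour above, using made-up URLs; with the default :ignore_robots_txt => true no robots.txt request is made:

    require "scrape"

    site = Scrape::Site.new "http://www.example.com/foo"

    # Relative hrefs are now resolved with Addressable against an explicit base URL.
    site.normalize("bar", "http://www.example.com/foo/index.html")
                                         # => "http://www.example.com/foo/bar"
    site.normalize("/bar")               # => "http://www.example.com/bar"

    # accept? is a prefix check on the normalized URL; robots.txt is skipped by default.
    site.accept?("http://www.example.com/foo/page")   # => true
    site.accept?("http://www.example.com/other")      # => false

    # Opting in makes accept? also consult #robots_txt (an HTTP fetch of /robots.txt).
    strict = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false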
data/lib/scrape/string_ext.rb ADDED
@@ -0,0 +1,6 @@
+class String
+  def starts_with str
+    str = str.to_str
+    self[0, str.length] == str
+  end unless instance_methods.include?(:starts_with)
+end
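The String#starts_with helper is only added when the method is not already present; for example:

    require "scrape/string_ext.rb"

    "/foo/bar".starts_with "/foo"   # => true
    "/foo/bar".starts_with "/baz"   # => false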
data/lib/scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Scrape
-  VERSION = '0.1.1' unless defined? ::Scrape::VERSION
+  VERSION = '0.2' unless defined? ::Scrape::VERSION
 end
data/scrape.gemspec CHANGED
@@ -19,4 +19,5 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
 
   s.add_development_dependency "nokogiri", "~> 1.5.5"
+  s.add_development_dependency "addressable", "~> 2.2.8"
 end
data/test/unit/application_test.rb CHANGED
@@ -54,7 +54,7 @@ class ApplicationTest < Scrape::TestCase
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
     test_loader.expect :load, nil, [filepath]
-    Scrape::Application.new(filepath, test_loader).run
+    Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
   end
 
@@ -69,4 +69,13 @@ class ApplicationTest < Scrape::TestCase
     3.times{ app.enqueue "http://example.com" }
     assert_equal ["http://example.com"], app.queue
   end
+
+  test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
+    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+    app = Scrape::Application.new(".")
+    app.sites.update site.to_s => site
+    assert_equal false, site.ignore_robots_txt
+    app.ignore_robots_txt = true
+    assert_equal true, site.ignore_robots_txt
+  end
 end
data/test/unit/robots_txt_rules_test.rb ADDED
@@ -0,0 +1,50 @@
+require "test_helper"
+
+class RobotsTxtRulesTest < Scrape::TestCase
+  test "#initialize should set the rules passed as multiple arguments" do
+    rules = Scrape::RobotsTxtRules.new "/foo", "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#initialize should set the rules passed as an array argument" do
+    rules = Scrape::RobotsTxtRules.new ["/foo", "/bar"]
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the string" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << "/bar"
+    assert_equal ["/foo", "/bar"], rules.to_a
+  end
+
+  test "#<< should append the array" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert_equal ["/foo"], rules.to_a
+    rules << ["/bar", "/too"]
+    assert_equal ["/foo", "/bar", "/too"], rules.to_a
+  end
+
+  test "#+ should return a new instance concatenating itself and the given array" do
+    rules1 = Scrape::RobotsTxtRules.new "/foo"
+    rules2 = rules1 + ["/bar"]
+    refute_equal rules1, rules2
+    assert_kind_of Scrape::RobotsTxtRules, rules2
+    assert_equal ["/foo", "/bar"], rules2.to_a
+  end
+
+  test "#=~ should match anything that begins with /" do
+    rules = Scrape::RobotsTxtRules.new "/"
+    assert rules =~ "/"
+    assert rules =~ "/foo"
+  end
+
+  test "#=~ should match anything that begins with rules" do
+    rules = Scrape::RobotsTxtRules.new "/foo"
+    assert rules =~ "/foo"
+    assert rules =~ "/foo/"
+    assert rules =~ "/foo/bar"
+    assert rules =~ "/foo.html"
+    refute rules =~ "/bar"
+  end
+end
data/test/unit/robots_txt_test.rb ADDED
@@ -0,0 +1,71 @@
+require "test_helper"
+
+class RobotsTxtTest < Scrape::TestCase
+  test "#user_agents should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => []
+    assert_equal ["Test"], robots.user_agents
+  end
+
+  test "#disallows should return an array" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots.disallows
+  end
+
+  test "#[] should return all disallows for the specified user agent" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"]
+    assert_equal ["/foo"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for the specified user agent including wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/foo", "/bar"], robots["Test"]
+  end
+
+  test "#[] should return all disallows for wildcard" do
+    robots = Scrape::RobotsTxt.new "Test" => ["/foo"], "*" => ["/bar"]
+    assert_equal ["/bar"], robots["*"]
+  end
+
+  test ".parse should return new instance parsed from a string" do
+    robots = Scrape::RobotsTxt.parse <<-TXT
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".parse should return new empty instance" do
+    robots = Scrape::RobotsTxt.parse ""
+    assert_equal [], robots.user_agents
+    assert_equal [], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/robots.txt"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+
+  test ".load should return a new instance parsed from the specified url with the path defaulted" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+      Disallow: /bar
+    TXT
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_equal ["Test"], robots.user_agents
+    assert_equal ["/foo", "/bar"], robots.disallows
+  end
+end
data/test/unit/site_test.rb CHANGED
@@ -24,8 +24,8 @@ class SiteTest < Scrape::TestCase
     assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
   end
 
-  test "#parse should return relative urls to the site" do
-    stub_request(:get, "http://www.example.com/test").
+  test "#parse should return relative urls to the specified url" do
+    stub_request(:get, "http://www.example.com/foo/bar").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
       to_return(:status => 200, :body => <<-HTML)
       <html>
@@ -36,7 +36,7 @@ class SiteTest < Scrape::TestCase
     HTML
 
     site = Scrape::Site.new "http://www.example.com"
-    assert_equal ["http://www.example.com/link1.html"], site.parse("/test")
+    assert_equal ["http://www.example.com/foo/link1.html"], site.parse("/foo/bar")
   end
 
   test "#parse should return no urls" do
@@ -73,9 +73,36 @@ class SiteTest < Scrape::TestCase
     assert ok, "Match was not invoked"
   end
 
-  test "#accept? should return true when specified url inside the site's url" do
-    uri = Scrape::Site.new "http://www.example.com/foo"
-    assert uri.accept?("http://www.example.com/foo/bar")
+  test "#accept? should return true when specified url is inside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is outside the site's url" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    refute site.accept?("http://www.example.com/bar")
+  end
+
+  test "#accept? should return true when specified url is inside the site's url and allowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /bar
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    assert site.accept?("http://www.example.com/foo/bar")
+  end
+
+  test "#accept? should return false when specified url is inside the site's url and disallowed by robots.txt" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: #{Scrape.user_agent}
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo", :ignore_robots_txt => false
+    refute site.accept?("http://www.example.com/foo/bar"), "URL should not be accepted"
   end
 
   test "#normalize should return a url when string begins with a slash" do
@@ -84,7 +111,7 @@ class SiteTest < Scrape::TestCase
   end
 
   test "#normalize should return a url with the string appended" do
-    site = Scrape::Site.new "http://www.example.com/foo"
+    site = Scrape::Site.new "http://www.example.com/foo/boo"
     assert_equal "http://www.example.com/foo/bar", site.normalize("bar")
   end
 
@@ -92,4 +119,21 @@ class SiteTest < Scrape::TestCase
     site = Scrape::Site.new "http://www.example.com/foo"
     assert_equal "http://www.example.org/bar", site.normalize("http://www.example.org/bar")
   end
+
+  test "#normalize should return a url when string is a forward slash" do
+    site = Scrape::Site.new "http://www.example.com/foo"
+    assert_equal "http://www.example.com/", site.normalize("/")
+  end
+
+  test "#robots_txt should return a RobotsTxt instance from the site's url" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 200, :body => <<-TXT)
+      User-agent: Test
+      Disallow: /foo
+    TXT
+
+    site = Scrape::Site.new "http://www.example.com/foo"
+    robots = site.robots_txt
+    assert_kind_of Scrape::RobotsTxt, robots
+  end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: '0.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-11 00:00:00.000000000 Z
+date: 2012-07-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -27,6 +27,22 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 1.5.5
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.2.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com
@@ -50,7 +66,10 @@ files:
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
+- lib/scrape/robots_txt.rb
+- lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
+- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -60,6 +79,8 @@ files:
 - test/unit/application_test.rb
 - test/unit/default_loader_test.rb
 - test/unit/match_test.rb
+- test/unit/robots_txt_rules_test.rb
+- test/unit/robots_txt_test.rb
 - test/unit/scrape_test.rb
 - test/unit/site_test.rb
 homepage: http://github.com/evilmarty/scrape