scrape 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
  .DS_Store
- pkg/
+ pkg/
+ .tags*
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     scrape (0.2.1)
+     scrape (0.2.4)

  GEM
    remote: http://rubygems.org/
data/bin/scrape CHANGED
@@ -1,7 +1,5 @@
  #!/usr/bin/env ruby

- $: << File.expand_path('../../lib', __FILE__)
-
- require "scrape/cli"
+ require File.expand_path("../../lib/scrape/cli", __FILE__)

  Scrape::CLI.new(File.basename($0), ARGV).run
data/lib/scrape/application.rb CHANGED
@@ -1,30 +1,33 @@
  class Scrape::Application
-   attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
+   attr_reader :scrapefile, :loader, :sites, :history, :options

-   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
+   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
      @scrapefile = File.expand_path scrapefile
-     @loader = loader
+     @options = options
+     @loader = loader.class == Class ? loader.new(self) : loader
      @sites = {}
      @queue = []
      @history = []
-     @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
    end

    def run
      load_scrapefile

      while url = @queue.shift
-       Scrape.logger.info "Loading: #{url}..."
        @history << url
-       if site = self[url]
-         if urls = site.parse(url)
-           enqueue *urls
-           Scrape.logger.info "Found #{urls.length} urls."
+       begin
+         if site = self[url]
+           if urls = site.parse(url)
+             enqueue *urls
+             Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
+           else
+             Scrape.logger.info "Parsed #{url}."
+           end
          else
-           Scrape.logger.info "Done."
+           Scrape.logger.info "No rules defined for #{url}"
          end
-       else
-         Scrape.logger.info "Not defined."
+       rescue OpenURI::HTTPError => e
+         Scrape.logger.info "Error loading #{url}: #{e.message}"
        end
      end
    end
@@ -44,21 +47,22 @@ class Scrape::Application
      end
    end

-   def ignore_robots_txt= bool
-     sites.each{|_, site| site.ignore_robots_txt = bool }
-     @ignore_robots_txt = bool
-   end
-
    def [] url
      @sites.values.detect{|site| site.accept? url }
    end

+   def add_site site, options = {}
+     case site
+     when String
+       site = Scrape::Site.new site, options
+       @sites.update site.to_s => site
+       site
+     end
+   end
+
    def load_scrapefile
      return if @scrapefile_loaded
-     raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
-     result = loader.load scrapefile
-     @sites.update result if result.is_a? Hash
-     self.ignore_robots_txt = ignore_robots_txt
+     loader.load(scrapefile)
      reset
      @scrapefile_loaded = true
    end
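
Taken together, the application now owns the site registry: it instantiates the loader with itself, exposes #add_site for registering sites keyed by URL, and wraps each fetch in a rescue so a single OpenURI::HTTPError no longer aborts the crawl. A minimal sketch of the resulting API, assuming a Scrapefile exists at the given path:

    require "scrape"

    app = Scrape::Application.new "Scrapefile"
    app.add_site "http://example.com"          # registers a Scrape::Site keyed by its URL
    app.sites.member? "http://example.com"     # => true
    app.run                                    # pages that 404 are now logged and skipped
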
data/lib/scrape/cli.rb CHANGED
@@ -1,5 +1,5 @@
  require "optparse"
- require "scrape"
+ require File.expand_path("../../scrape", __FILE__)

  class Scrape::CLI
    attr_reader :command, :app, :options
data/lib/scrape/core_ext/array.rb ADDED
@@ -0,0 +1,5 @@
+ class Array
+   def extract_options!
+     last.instance_of?(Hash) ? pop : {}
+   end unless instance_methods.include?(:extract_options!)
+ end
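
This is the ActiveSupport-style Array#extract_options! idiom, defined only if nothing else has already provided it: it pops a trailing options hash off the array, or returns an empty hash. Roughly:

    args = ["http://example.com", {:cookie => "omnom"}]
    args.extract_options!          # => {:cookie => "omnom"}; args is now ["http://example.com"]
    ["a", "b"].extract_options!    # => {}
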
data/lib/scrape/core_ext/string.rb RENAMED from data/lib/scrape/string_ext.rb
File without changes
data/lib/scrape/default_loader.rb CHANGED
@@ -1,19 +1,12 @@
  class Scrape::DefaultLoader
-   def load path
-     path = File.expand_path path
-     sites = {}
-
-     sandbox = Sandbox.new sites
-     sandbox.instance_eval File.read(path), path
-
-     sites
+   def initialize app
+     @app = app
    end

-   class Sandbox
-     include Scrape::DSL
-
-     def initialize sites
-       @sites = sites
-     end
+   def load path
+     path = File.expand_path path
+     File.exists? path or raise Scrape::FileNotFound, path
+     dsl = Scrape::DSL.new @app
+     dsl.instance_eval File.read(path), path
    end
  end
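
The loader no longer builds and returns a hash of sites through an internal Sandbox; it evaluates the scrapefile via a Scrape::DSL bound to the application, so sites are registered directly on app.sites, and the missing-file check moves here from Application#load_scrapefile. A sketch of the new contract (paths assumed):

    app = Scrape::Application.new "Scrapefile"
    loader = Scrape::DefaultLoader.new app
    loader.load app.scrapefile     # sites defined in the file land in app.sites
    loader.load "no/such/file"     # raises Scrape::FileNotFound
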
data/lib/scrape/dsl.rb CHANGED
@@ -1,13 +1,18 @@
- module Scrape::DSL
+ class Scrape::DSL
+   def initialize app
+     @application = app
+   end
+
    def site *urls
-     @_sites ||= {}
-     @sites ||= {}
-     @current_sites = urls.flatten.map{|url| @_sites[url] ||= Scrape::Site.new(url) }
+     return @sites if urls.empty?
+     urls = urls.flatten
+     options = urls.extract_options!
+     @sites = urls.map{|url| @application.sites[url] || @application.add_site(url, options) }
    end

    def match matcher, &proc
-     raise ArgumentError.new("site must be set") unless defined? @current_sites
-     matches = @current_sites.map{|site| @sites[site.url.to_s] = site; site.add_match matcher, &proc }
+     raise ArgumentError, "No sites have been defined" unless defined? @sites
+     matches = @sites.map{|site| site.add_match matcher, &proc }
      matches.size == 1 ? matches.first : matches
    end
  end
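
For scrapefile authors, `site` now takes a trailing options hash that is forwarded to Scrape::Site, and returns the currently defined sites when called without arguments. A hypothetical Scrapefile (the URL and matcher are invented; the block arguments are assumed from Match#invoke, which passes the parsed document and the page URL):

    site "http://example.com", :cookie => "omnom"

    match "/products" do |doc, url|
      # doc is the Nokogiri::HTML document, url the page it was parsed from
    end
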
data/lib/scrape/robots_txt.rb CHANGED
@@ -49,6 +49,8 @@ class Scrape::RobotsTxt
    def self.load url, default = true
      url = Addressable::URI.join(url, "/robots.txt") if default
      parse Scrape.open(url)
+   rescue OpenURI::HTTPError
+     nil
    end
    public :load
  end
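
In effect, a site whose /robots.txt is missing or errors out is treated as having no rules:

    Scrape::RobotsTxt.load "http://www.example.com/foo"
    # => parsed rules, or nil when fetching /robots.txt raises OpenURI::HTTPError
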
data/lib/scrape/site.rb CHANGED
@@ -2,15 +2,14 @@ require 'addressable/uri'
  require 'nokogiri'

  class Scrape::Site
-   attr_reader :url, :matches
-   attr_accessor :ignore_robots_txt
+   attr_reader :url, :matches, :options

    def initialize url, options = {}
      @url = Addressable::URI.parse url
      @url.query = nil
      @url.fragment = nil
+     @options = {:ignore_robots_txt => true}.merge options
      @matches = []
-     @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
    end

    def add_match matcher, &proc
@@ -19,9 +18,15 @@ class Scrape::Site
      match
    end

+   def open url
+     headers = Hash.new
+     headers['Set-Cookie'] = options[:cookie].to_s if options.has_key? :cookie
+     Scrape.open url, headers
+   end
+
    def parse url
      url = normalize url
-     doc = Nokogiri::HTML Scrape.open(url)
+     doc = Nokogiri::HTML open(url)

      @matches.each{|match| match.invoke doc, url if match =~ url }

@@ -38,11 +43,7 @@ class Scrape::Site
    end

    def robots_txt
-     @robots_txt ||= Scrape::RobotsTxt.load url
-   end
-
-   def ignore_robots_txt?
-     !!@ignore_robots_txt
+     @robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
    end

    def to_s
@@ -52,6 +53,6 @@ class Scrape::Site
    private

    def disallowed? url
-     !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+     !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
    end
  end
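
The upshot is per-site options: robots.txt handling now lives in the options hash (still ignored by default), and a :cookie option makes the new Site#open attach it to every request. Note that the code sends it as a Set-Cookie header, which is conventionally a response header. For example:

    site = Scrape::Site.new "http://www.example.com",
      :cookie => "omnom", :ignore_robots_txt => false
    site.open "http://www.example.com"   # request carries "Set-Cookie: omnom"
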
data/lib/scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Scrape
-   VERSION = '0.2.2' unless defined? ::Scrape::VERSION
+   VERSION = '0.2.4' unless defined? ::Scrape::VERSION
  end
data/lib/scrape.rb CHANGED
@@ -1,13 +1,14 @@
  require "rubygems"
  require "logger"
  require "open-uri"
- require "bundler/setup"

- require "scrape/string_ext.rb"
+ $: << File.dirname(__FILE__)

- module Scrape
-   require 'scrape/version'
+ require "scrape/version"
+ require "scrape/core_ext/array"
+ require "scrape/core_ext/string"

+ module Scrape
    autoload 'Application', 'scrape/application'
    autoload 'Site', 'scrape/site'
    autoload 'Match', 'scrape/match'
data/scrape.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
    s.files = `git ls-files`.split("\n")
    s.test_files = `git ls-files -- {test}/*`.split("\n")
    s.executables = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
-   s.require_paths = ["lib"]
+   s.require_paths = ["lib", "lib/scrape"]

    s.add_development_dependency "nokogiri", "~> 1.5.5"
    s.add_development_dependency "addressable", "~> 2.2.8"
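
Adding lib/scrape to require_paths mirrors the `$: << File.dirname(__FILE__)` line in lib/scrape.rb: in the installed gem both directories sit on the load path, so internal files resolve with or without the scrape/ prefix. Illustratively:

    require "scrape"     # normal entry point, via lib
    require "version"    # also resolves, via the lib/scrape require path
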
data/test/unit/application_test.rb CHANGED
@@ -53,6 +53,7 @@ class ApplicationTest < Scrape::TestCase
    test "#run should load the specified file" do
      filepath = File.join(SUPPORT_FILES, 'test1.scrape')
      test_loader = MiniTest::Mock.new
+     test_loader.expect :class, Scrape::DefaultLoader
      test_loader.expect :load, nil, [filepath]
      Scrape::Application.new(filepath, {}, test_loader).run
      assert test_loader.verify, "loader did not receive file"
@@ -70,12 +71,9 @@ class ApplicationTest < Scrape::TestCase
      assert_equal ["http://example.com"], app.queue
    end

-   test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
-     site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+   test "#add_site should add the specified string to the collection" do
      app = Scrape::Application.new(".")
-     app.sites.update site.to_s => site
-     assert_equal false, site.ignore_robots_txt
-     app.ignore_robots_txt = true
-     assert_equal true, site.ignore_robots_txt
+     app.add_site "http://example.com"
+     assert app.sites.member?("http://example.com")
    end
  end
data/test/unit/default_loader_test.rb CHANGED
@@ -3,23 +3,27 @@ require "test_helper"
  class DefaultLoaderTest < Scrape::TestCase
    SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')

-   test "#load should return sites parsed from the specified file" do
-     loader = Scrape::DefaultLoader.new
-     sites = loader.load File.join(SUPPORT_FILES, "test1.scrape")
-     assert_equal ["http://example.com"], sites.keys
-     assert_instance_of Scrape::Site, sites.values[0]
+   test "#load should parse the specified file" do
+     app = Scrape::Application.new "."
+     loader = Scrape::DefaultLoader.new app
+     loader.load File.join(SUPPORT_FILES, "test1.scrape")
+     assert_equal ["http://example.com"], app.sites.keys
+     assert_instance_of Scrape::Site, app.sites.values[0]
    end

-   test "#load should return an empty hash when no matches have been defined" do
-     loader = Scrape::DefaultLoader.new
-     sites = loader.load File.join(SUPPORT_FILES, "test2.scrape")
-     assert_equal Hash.new, sites
-   end
-
-   test "#load should raise an error when no site is defined" do
-     loader = Scrape::DefaultLoader.new
+   test "#load should raise error when no site is defined" do
+     app = Scrape::Application.new "."
+     loader = Scrape::DefaultLoader.new app
      assert_raises ArgumentError do
        loader.load File.join(SUPPORT_FILES, "test3.scrape")
      end
    end
+
+   test "#load should raise error when file cannot be found" do
+     app = Scrape::Application.new "."
+     loader = Scrape::DefaultLoader.new app
+     assert_raises Scrape::FileNotFound do
+       loader.load "#{Time.now.to_i}.txt"
+     end
+   end
  end
data/test/unit/dsl_test.rb ADDED
@@ -0,0 +1,43 @@
+ require "test_helper"
+
+ class DSLTest < Scrape::TestCase
+   test "#site should add the url to the application" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     dsl.site "http://example.com"
+     assert app.sites.member?("http://example.com")
+   end
+
+   test "#site should return the currently defined sites" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     sites = dsl.site "http://example.com"
+     assert_equal "http://example.com", sites[0].to_s
+     assert_equal sites, dsl.site
+   end
+
+   test "#site should pass the options to the site" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     dsl.site "http://example.com", :test => true
+     assert_equal true, app.sites["http://example.com"].options[:test]
+   end
+
+   test "#match should create a match on the current sites" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     dsl.site "http://example.com"
+     site = app.sites["http://example.com"]
+     assert_empty site.matches
+     dsl.match("test"){|*args|}
+     refute_empty site.matches
+   end
+
+   test "#match should raise an error when no sites have been defined" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     assert_raises ArgumentError do
+       dsl.match("test"){|*args|}
+     end
+   end
+ end
data/test/unit/robots_txt_test.rb CHANGED
@@ -68,4 +68,12 @@ class RobotsTxtTest < Scrape::TestCase
      assert_equal ["Test"], robots.user_agents
      assert_equal ["/foo", "/bar"], robots.disallows
    end
+
+   test ".load should return nil when specified url results in 404" do
+     stub_request(:get, "http://www.example.com/robots.txt").
+       to_return(:status => 404, :body => "")
+
+     robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+     assert_nil robots
+   end
  end
data/test/unit/site_test.rb CHANGED
@@ -7,6 +7,15 @@ class SiteTest < Scrape::TestCase
      assert_instance_of Scrape::Match, match
    end

+   test "#open should include cookie header when cookie option is set" do
+     stub_request(:get, "http://www.example.com/").
+       with(:headers => {'Set-Cookie'=>'omnom'}).
+       to_return(:status => 200, :body => "")
+
+     site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
+     site.open "http://www.example.com"
+   end
+
    test "#parse should return absolute urls that match the site's url" do
      stub_request(:get, "http://www.example.com/test").
        with(:headers => {"User-Agent" => Scrape.user_agent}).
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: scrape
  version: !ruby/object:Gem::Version
-   version: 0.2.2
+   version: 0.2.4
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-07-15 00:00:00.000000000 Z
+ date: 2012-07-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: nokogiri
@@ -63,13 +63,14 @@ files:
  - lib/scrape.rb
  - lib/scrape/application.rb
  - lib/scrape/cli.rb
+ - lib/scrape/core_ext/array.rb
+ - lib/scrape/core_ext/string.rb
  - lib/scrape/default_loader.rb
  - lib/scrape/dsl.rb
  - lib/scrape/match.rb
  - lib/scrape/robots_txt.rb
  - lib/scrape/robots_txt_rules.rb
  - lib/scrape/site.rb
- - lib/scrape/string_ext.rb
  - lib/scrape/version.rb
  - scrape.gemspec
  - test/support/test1.scrape
@@ -79,6 +80,7 @@ files:
  - test/unit/application_test.rb
  - test/unit/cli_test.rb
  - test/unit/default_loader_test.rb
+ - test/unit/dsl_test.rb
  - test/unit/match_test.rb
  - test/unit/robots_txt_rules_test.rb
  - test/unit/robots_txt_test.rb
@@ -90,6 +92,7 @@ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
+ - lib/scrape
  required_ruby_version: !ruby/object:Gem::Requirement
    none: false
  requirements: