scrape 0.2.2 → 0.2.4

data/.gitignore CHANGED
@@ -1,2 +1,3 @@
  .DS_Store
- pkg/
+ pkg/
+ .tags*
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     scrape (0.2.1)
+     scrape (0.2.4)

  GEM
    remote: http://rubygems.org/
data/bin/scrape CHANGED
@@ -1,7 +1,5 @@
  #!/usr/bin/env ruby

- $: << File.expand_path('../../lib', __FILE__)
-
- require "scrape/cli"
+ require File.expand_path("../../lib/scrape/cli", __FILE__)

  Scrape::CLI.new(File.basename($0), ARGV).run
data/lib/scrape/application.rb CHANGED
@@ -1,30 +1,33 @@
  class Scrape::Application
-   attr_reader :scrapefile, :loader, :sites, :history, :ignore_robots_txt
+   attr_reader :scrapefile, :loader, :sites, :history, :options

-   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader.new
+   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
      @scrapefile = File.expand_path scrapefile
-     @loader = loader
+     @options = options
+     @loader = loader.class == Class ? loader.new(self) : loader
      @sites = {}
      @queue = []
      @history = []
-     @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
    end

    def run
      load_scrapefile

      while url = @queue.shift
-       Scrape.logger.info "Loading: #{url}..."
        @history << url
-       if site = self[url]
-         if urls = site.parse(url)
-           enqueue *urls
-           Scrape.logger.info "Found #{urls.length} urls."
+       begin
+         if site = self[url]
+           if urls = site.parse(url)
+             enqueue *urls
+             Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
+           else
+             Scrape.logger.info "Parsed #{url}."
+           end
          else
-           Scrape.logger.info "Done."
+           Scrape.logger.info "No rules defined for #{url}"
          end
-       else
-         Scrape.logger.info "Not defined."
+       rescue OpenURI::HTTPError => e
+         Scrape.logger.info "Error loading #{url}: #{e.message}"
        end
      end
    end
@@ -44,21 +47,22 @@ class Scrape::Application
      end
    end

-   def ignore_robots_txt= bool
-     sites.each{|_, site| site.ignore_robots_txt = bool }
-     @ignore_robots_txt = bool
-   end
-
    def [] url
      @sites.values.detect{|site| site.accept? url }
    end

+   def add_site site, options = {}
+     case site
+     when String
+       site = Scrape::Site.new site, options
+       @sites.update site.to_s => site
+       site
+     end
+   end
+
    def load_scrapefile
      return if @scrapefile_loaded
-     raise Scrape::FileNotFound.new(scrapefile) unless File.exists? scrapefile
-     result = loader.load scrapefile
-     @sites.update result if result.is_a? Hash
-     self.ignore_robots_txt = ignore_robots_txt
+     loader.load(scrapefile)
      reset
      @scrapefile_loaded = true
    end
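Taken together, the Application changes move robots.txt handling onto each site's options, introduce add_site for registering sites, and wrap each URL in #run with a rescue for OpenURI::HTTPError so a single failing page no longer aborts the crawl. A minimal sketch of driving the new API directly, assuming a Scrapefile exists at the given path (the path and URLs are illustrative):

    # Sketch of the 0.2.4 Application API shown above; "Scrapefile" is an assumed path.
    require "scrape"

    app = Scrape::Application.new "Scrapefile"

    # Strings passed to #add_site are wrapped in a Scrape::Site and indexed by URL.
    app.add_site "http://example.com", :ignore_robots_txt => false
    app.sites["http://example.com"]   # => the Scrape::Site for that URL

    app.enqueue "http://example.com/"
    app.run   # per-URL OpenURI::HTTPError is logged, not raised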
data/lib/scrape/cli.rb CHANGED
@@ -1,5 +1,5 @@
  require "optparse"
- require "scrape"
+ require File.expand_path("../../scrape", __FILE__)

  class Scrape::CLI
    attr_reader :command, :app, :options
data/lib/scrape/core_ext/array.rb ADDED
@@ -0,0 +1,5 @@
+ class Array
+   def extract_options!
+     last.instance_of?(Hash) ? pop : {}
+   end unless instance_methods.include?(:extract_options!)
+ end
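The new Array#extract_options! core extension (skipped when something like ActiveSupport has already defined it) pops a trailing options hash off an argument list; the reworked DSL#site below relies on it. Its behaviour, roughly:

    # Expected behaviour of the extension above; the values are illustrative.
    args = ["http://example.com", "http://example.org", {:cookie => "omnom"}]
    args.extract_options!   # => {:cookie => "omnom"}
    args                    # => ["http://example.com", "http://example.org"]
    [].extract_options!     # => {} when there is no trailing Hash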
File without changes
data/lib/scrape/default_loader.rb CHANGED
@@ -1,19 +1,12 @@
  class Scrape::DefaultLoader
-   def load path
-     path = File.expand_path path
-     sites = {}
-
-     sandbox = Sandbox.new sites
-     sandbox.instance_eval File.read(path), path
-
-     sites
+   def initialize app
+     @app = app
    end

-   class Sandbox
-     include Scrape::DSL
-
-     def initialize sites
-       @sites = sites
-     end
+   def load path
+     path = File.expand_path path
+     File.exists? path or raise Scrape::FileNotFound, path
+     dsl = Scrape::DSL.new @app
+     dsl.instance_eval File.read(path), path
    end
  end
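The loader is now constructed with the application it populates, and #load raises Scrape::FileNotFound itself instead of leaving that check to the application. Roughly how the pieces fit together (the Scrapefile path is an assumption):

    # Sketch of the new loader wiring; "Scrapefile" is an assumed path.
    app    = Scrape::Application.new "Scrapefile"
    loader = Scrape::DefaultLoader.new app   # the loader is bound to the application
    loader.load app.scrapefile               # raises Scrape::FileNotFound if the file is missing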
data/lib/scrape/dsl.rb CHANGED
@@ -1,13 +1,18 @@
- module Scrape::DSL
+ class Scrape::DSL
+   def initialize app
+     @application = app
+   end
+
    def site *urls
-     @_sites ||= {}
-     @sites ||= {}
-     @current_sites = urls.flatten.map{|url| @_sites[url] ||= Scrape::Site.new(url) }
+     return @sites if urls.empty?
+     urls = urls.flatten
+     options = urls.extract_options!
+     @sites = urls.map{|url| @application.sites[url] || @application.add_site(url, options) }
    end

    def match matcher, &proc
-     raise ArgumentError.new("site must be set") unless defined? @current_sites
-     matches = @current_sites.map{|site| @sites[site.url.to_s] = site; site.add_match matcher, &proc }
+     raise ArgumentError, "No sites have been defined" unless defined? @sites
+     matches = @sites.map{|site| site.add_match matcher, &proc }
      matches.size == 1 ? matches.first : matches
    end
  end
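The DSL is no longer a module mixed into a throwaway sandbox: it is a class bound to the application, site accepts a trailing options hash (via extract_options!) and returns the currently selected sites, and match raises when no site has been declared. A hypothetical Scrapefile under the new DSL; the matcher, option and block body are illustrative rather than taken from the gem:

    # Hypothetical Scrapefile, evaluated by Scrape::DefaultLoader via instance_eval.
    site "http://example.com", :ignore_robots_txt => true

    # Block arguments assumed from Site#parse, which invokes a match with the
    # parsed Nokogiri document and the current URL.
    match "/articles" do |doc, url|
      Scrape.logger.info "#{url}: #{doc.css('a').length} links"
    end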
data/lib/scrape/robots_txt.rb CHANGED
@@ -49,6 +49,8 @@ class Scrape::RobotsTxt
    def self.load url, default = true
      url = Addressable::URI.join(url, "/robots.txt") if default
      parse Scrape.open(url)
+   rescue OpenURI::HTTPError
+     nil
    end
    public :load
  end
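With this rescue, a missing robots.txt no longer raises out of RobotsTxt.load; the method returns nil, Site#robots_txt memoizes that nil, and Site#disallowed? effectively treats every path as allowed. For example:

    # Behaviour implied by the rescue above: a 404 on /robots.txt yields nil.
    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
    robots.nil?   # => true when the host serves no robots.txt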
data/lib/scrape/site.rb CHANGED
@@ -2,15 +2,14 @@ require 'addressable/uri'
  require 'nokogiri'

  class Scrape::Site
-   attr_reader :url, :matches
-   attr_accessor :ignore_robots_txt
+   attr_reader :url, :matches, :options

    def initialize url, options = {}
      @url = Addressable::URI.parse url
      @url.query = nil
      @url.fragment = nil
+     @options = {:ignore_robots_txt => true}.merge options
      @matches = []
-     @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
    end

    def add_match matcher, &proc
@@ -19,9 +18,15 @@ class Scrape::Site
      match
    end

+   def open url
+     headers = Hash.new
+     headers['Set-Cookie'] = options[:cookie].to_s if options.has_key? :cookie
+     Scrape.open url, headers
+   end
+
    def parse url
      url = normalize url
-     doc = Nokogiri::HTML Scrape.open(url)
+     doc = Nokogiri::HTML open(url)

      @matches.each{|match| match.invoke doc, url if match =~ url }

@@ -38,11 +43,7 @@ class Scrape::Site
    end

    def robots_txt
-     @robots_txt ||= Scrape::RobotsTxt.load url
-   end
-
-   def ignore_robots_txt?
-     !!@ignore_robots_txt
+     @robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
    end

    def to_s
@@ -52,6 +53,6 @@ class Scrape::Site
    private

    def disallowed? url
-     !ignore_robots_txt? && robots_txt =~ Addressable::URI.parse(url).path
+     !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
    end
  end
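Per-site configuration now lives in an options hash (robots.txt is still ignored by default), and the new Site#open forwards a cookie via a 'Set-Cookie' request header, mirroring the site test further down. A small sketch; the cookie value is illustrative:

    # Sketch of the new Site options shown above; the cookie value is an assumption.
    site = Scrape::Site.new "http://www.example.com",
                            :cookie => "session=abc123",
                            :ignore_robots_txt => false   # default is true

    site.options[:ignore_robots_txt]       # => false
    site.open "http://www.example.com/"    # request carries Set-Cookie: session=abc123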
data/lib/scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Scrape
-   VERSION = '0.2.2' unless defined? ::Scrape::VERSION
+   VERSION = '0.2.4' unless defined? ::Scrape::VERSION
  end
data/lib/scrape.rb CHANGED
@@ -1,13 +1,14 @@
  require "rubygems"
  require "logger"
  require "open-uri"
- require "bundler/setup"

- require "scrape/string_ext.rb"
+ $: << File.dirname(__FILE__)

- module Scrape
-   require 'scrape/version'
+ require "scrape/version"
+ require "scrape/core_ext/array"
+ require "scrape/core_ext/string"

+ module Scrape
    autoload 'Application', 'scrape/application'
    autoload 'Site', 'scrape/site'
    autoload 'Match', 'scrape/match'
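bundler/setup is no longer required at load time and the gem's lib directory is pushed onto $LOAD_PATH directly, so the library can be loaded outside a Bundler context. A quick sanity check, assuming scrape 0.2.4 is installed as a gem:

    # Assumes the gem is installed normally; no Gemfile or Bundler involved.
    require "rubygems"
    require "scrape"

    Scrape::VERSION   # => "0.2.4"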
data/scrape.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
    s.files = `git ls-files`.split("\n")
    s.test_files = `git ls-files -- {test}/*`.split("\n")
    s.executables = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
-   s.require_paths = ["lib"]
+   s.require_paths = ["lib", "lib/scrape"]

    s.add_development_dependency "nokogiri", "~> 1.5.5"
    s.add_development_dependency "addressable", "~> 2.2.8"
data/test/unit/application_test.rb CHANGED
@@ -53,6 +53,7 @@ class ApplicationTest < Scrape::TestCase
    test "#run should load the specified file" do
      filepath = File.join(SUPPORT_FILES, 'test1.scrape')
      test_loader = MiniTest::Mock.new
+     test_loader.expect :class, Scrape::DefaultLoader
      test_loader.expect :load, nil, [filepath]
      Scrape::Application.new(filepath, {}, test_loader).run
      assert test_loader.verify, "loader did not receive file"
@@ -70,12 +71,9 @@ class ApplicationTest < Scrape::TestCase
      assert_equal ["http://example.com"], app.queue
    end

-   test "#ignore_robots_txt should update #ignore_robots_txt on all sites" do
-     site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+   test "#add_site should add the specied string to the collection" do
      app = Scrape::Application.new(".")
-     app.sites.update site.to_s => site
-     assert_equal false, site.ignore_robots_txt
-     app.ignore_robots_txt = true
-     assert_equal true, site.ignore_robots_txt
+     app.add_site "http://example.com"
+     assert app.sites.member?("http://example.com")
    end
  end
data/test/unit/default_loader_test.rb CHANGED
@@ -3,23 +3,27 @@ require "test_helper"
  class DefaultLoaderTest < Scrape::TestCase
    SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')

-   test "#load should return sites parsed from the specified file" do
-     loader = Scrape::DefaultLoader.new
-     sites = loader.load File.join(SUPPORT_FILES, "test1.scrape")
-     assert_equal ["http://example.com"], sites.keys
-     assert_instance_of Scrape::Site, sites.values[0]
+   test "#load should parse the specified file" do
+     app = Scrape::Application.new "."
+     loader = Scrape::DefaultLoader.new app
+     loader.load File.join(SUPPORT_FILES, "test1.scrape")
+     assert_equal ["http://example.com"], app.sites.keys
+     assert_instance_of Scrape::Site, app.sites.values[0]
    end

-   test "#load should return an empty hash when no matches have been defined" do
-     loader = Scrape::DefaultLoader.new
-     sites = loader.load File.join(SUPPORT_FILES, "test2.scrape")
-     assert_equal Hash.new, sites
-   end
-
-   test "#load should raise an error when no site is defined" do
-     loader = Scrape::DefaultLoader.new
+   test "#load should raise error when no site is defined" do
+     app = Scrape::Application.new "."
+     loader = Scrape::DefaultLoader.new app
      assert_raises ArgumentError do
        loader.load File.join(SUPPORT_FILES, "test3.scrape")
      end
    end
+
+   test "#load should raise error when file cannot be found" do
+     app = Scrape::Application.new "."
+     loader = Scrape::DefaultLoader.new app
+     assert_raises Scrape::FileNotFound do
+       loader.load "#{Time.now.to_i}.txt"
+     end
+   end
  end
data/test/unit/dsl_test.rb ADDED
@@ -0,0 +1,43 @@
+ require "test_helper"
+
+ class DSLTest < Scrape::TestCase
+   test "#site should add the url to the application" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     dsl.site "http://example.com"
+     assert app.sites.member?("http://example.com")
+   end
+
+   test "#site should return the currently defined sites" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     sites = dsl.site "http://example.com"
+     assert_equal "http://example.com", sites[0].to_s
+     assert_equal sites, dsl.site
+   end
+
+   test "#site should pass the options to the site" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     dsl.site "http://example.com", :test => true
+     assert_equal true, app.sites["http://example.com"].options[:test]
+   end
+
+   test "#match should create a match on the current sites" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     dsl.site "http://example.com"
+     site = app.sites["http://example.com"]
+     assert_empty site.matches
+     dsl.match("test"){|*args|}
+     refute_empty site.matches
+   end
+
+   test "#match should raise an error when no sites have been defined" do
+     app = Scrape::Application.new(".")
+     dsl = Scrape::DSL.new app
+     assert_raises ArgumentError do
+       dsl.match("test"){|*args|}
+     end
+   end
+ end
data/test/unit/robots_txt_test.rb CHANGED
@@ -68,4 +68,12 @@ class RobotsTxtTest < Scrape::TestCase
      assert_equal ["Test"], robots.user_agents
      assert_equal ["/foo", "/bar"], robots.disallows
    end
+
+   test ".load should return nil when specified url results in 404" do
+     stub_request(:get, "http://www.example.com/robots.txt").
+       to_return(:status => 404, :body => "")
+
+     robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+     assert_nil robots
+   end
  end
data/test/unit/site_test.rb CHANGED
@@ -7,6 +7,15 @@ class SiteTest < Scrape::TestCase
      assert_instance_of Scrape::Match, match
    end

+   test "#open should include cookie header when cookie option is set" do
+     stub_request(:get, "http://www.example.com/").
+       with(:headers => {'Set-Cookie'=>'omnom'}).
+       to_return(:status => 200, :body => "")
+
+     site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
+     site.open "http://www.example.com"
+   end
+
    test "#parse should return absolute urls that match the site's url" do
      stub_request(:get, "http://www.example.com/test").
        with(:headers => {"User-Agent" => Scrape.user_agent}).
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: scrape
  version: !ruby/object:Gem::Version
-   version: 0.2.2
+   version: 0.2.4
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-07-15 00:00:00.000000000 Z
+ date: 2012-07-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: nokogiri
@@ -63,13 +63,14 @@ files:
  - lib/scrape.rb
  - lib/scrape/application.rb
  - lib/scrape/cli.rb
+ - lib/scrape/core_ext/array.rb
+ - lib/scrape/core_ext/string.rb
  - lib/scrape/default_loader.rb
  - lib/scrape/dsl.rb
  - lib/scrape/match.rb
  - lib/scrape/robots_txt.rb
  - lib/scrape/robots_txt_rules.rb
  - lib/scrape/site.rb
- - lib/scrape/string_ext.rb
  - lib/scrape/version.rb
  - scrape.gemspec
  - test/support/test1.scrape
@@ -79,6 +80,7 @@ files:
  - test/unit/application_test.rb
  - test/unit/cli_test.rb
  - test/unit/default_loader_test.rb
+ - test/unit/dsl_test.rb
  - test/unit/match_test.rb
  - test/unit/robots_txt_rules_test.rb
  - test/unit/robots_txt_test.rb
@@ -90,6 +92,7 @@ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
+ - lib/scrape
  required_ruby_version: !ruby/object:Gem::Requirement
    none: false
    requirements: