scrape 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile.lock CHANGED
@@ -1,13 +1,18 @@
 PATH
   remote: .
   specs:
-    scrape (0.2.4)
+    scrape (0.3.0)
 
 GEM
   remote: http://rubygems.org/
   specs:
     addressable (2.2.8)
     crack (0.3.1)
+    faraday (0.8.1)
+      multipart-post (~> 1.1)
+    faraday_middleware (0.8.8)
+      faraday (>= 0.7.4, < 0.9)
+    multipart-post (1.1.5)
     nokogiri (1.5.5)
     webmock (1.8.7)
       addressable (>= 2.2.7)
@@ -18,6 +23,8 @@ PLATFORMS
 
 DEPENDENCIES
   addressable (~> 2.2.8)
+  faraday (~> 0.8.0)
+  faraday_middleware (~> 0.8.8)
   nokogiri (~> 1.5.5)
   scrape!
   webmock (~> 1.8.7)
data/README.md CHANGED
@@ -13,6 +13,8 @@ end
 
 site "http://www.tumblr.com" # Can define multiple sites
 
+queue "http://www.tumblr.com/tagged" # Add specified urls to scrape
+
 match "/tagged" do |doc|
   # Do what ever we want with the document.
 end
data/lib/scrape.rb CHANGED
@@ -1,6 +1,8 @@
 require "rubygems"
 require "logger"
-require "open-uri"
+require "addressable/uri"
+require "faraday"
+require "faraday_middleware"
 
 $: << File.dirname(__FILE__)
 
@@ -18,6 +20,7 @@ module Scrape
   autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
   class FileNotFound < Exception; end
+  class HTTPError < StandardError; end
 
   class << self
     attr_writer :user_agent
@@ -38,9 +41,26 @@ module Scrape
       Application.new path
     end
 
-    def open url, headers = {}, &block
-      headers = {"User-Agent" => user_agent}.merge(headers)
-      super(url, headers, &block).read
+    def open url, headers = nil, &block
+      url = Addressable::URI.parse url
+      headers ||= {}
+
+      conn = Faraday.new :url => url.to_s do |faraday|
+        faraday.response :follow_redirects, :cookies => :all, :limit => 3
+        faraday.adapter Faraday.default_adapter
+      end
+
+      conn.headers[:user_agent] = user_agent
+
+      res = conn.get url.request_uri do |req|
+        headers.each{|key, val| req[key] = val }
+      end
+
+      if res.success?
+        res.body
+      else
+        raise HTTPError, res.status
+      end
     end
   end
 end
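
Net effect of the rewritten Scrape.open (a usage sketch; the URL and Accept header are illustrative): a successful response returns the body as a string, up to three redirects are followed with cookies carried along, and any other status raises Scrape::HTTPError with the status code as its message.

    require "scrape"

    begin
      body = Scrape.open "http://example.com/", "Accept" => "text/html"
      puts body
    rescue Scrape::HTTPError => e
      warn "request failed with status #{e.message}"   # e.g. "404"
    end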
data/lib/scrape/application.rb CHANGED
@@ -3,38 +3,35 @@ class Scrape::Application
 
   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
     @scrapefile = File.expand_path scrapefile
-    @options = options
+    @options = options.dup
    @loader = loader.class == Class ? loader.new(self) : loader
     @sites = {}
-    @queue = []
-    @history = []
+    reset
   end
 
   def run
     load_scrapefile
 
+    @queue = sites.values.map{|site| site.to_s } if @queue.empty?
+
     while url = @queue.shift
       @history << url
-      begin
-        if site = self[url]
-          if urls = site.parse(url)
-            enqueue *urls
-            Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
-          else
-            Scrape.logger.info "Parsed #{url}."
-          end
+      if site = self[url]
+        if urls = site.parse(url)
+          enqueue *urls
+          Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
         else
-          Scrape.logger.info "No rules defined for #{url}"
+          Scrape.logger.info "Parsed #{url}."
         end
-      rescue OpenURI::HTTPError => e
-        Scrape.logger.info "Error loading #{url}: #{e.message}"
+      else
+        Scrape.logger.info "No rules defined for #{url}"
       end
     end
   end
 
   def reset
     @history = []
-    @queue = sites.values.map{|site| site.to_s }
+    @queue = []
   end
 
   def queue
@@ -54,7 +51,7 @@ class Scrape::Application
   def add_site site, options = {}
     case site
     when String
-      site = Scrape::Site.new site, options
+      site = Scrape::Site.new site, options.dup
       @sites.update site.to_s => site
       site
     end
@@ -63,7 +60,6 @@ class Scrape::Application
   def load_scrapefile
     return if @scrapefile_loaded
     loader.load(scrapefile)
-    reset
     @scrapefile_loaded = true
   end
 end
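
Queue seeding now happens in #run instead of #reset, so URLs enqueued before the run (for example via the new queue/enqueue DSL methods) suppress the default behavior of crawling every defined site. A sketch, assuming a local Scrapefile exists:

    app = Scrape::Application.new "Scrapefile"
    app.enqueue "http://example.com/start"
    app.run   # crawls the explicit queue; sites are auto-enqueued only when the queue is empty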
data/lib/scrape/core_ext/array.rb CHANGED
@@ -1,5 +1,5 @@
 class Array
   def extract_options!
     last.instance_of?(Hash) ? pop : {}
-  end unless instance_methods.include?(:extract_options!)
+  end unless Array.respond_to?(:extract_options!)
 end
data/lib/scrape/core_ext/string.rb CHANGED
@@ -2,5 +2,5 @@ class String
   def starts_with str
     str = str.to_str
     self[0, str.length] == str
-  end unless instance_methods.include?(:starts_with)
+  end unless String.respond_to?(:starts_with)
 end
data/lib/scrape/dsl.rb CHANGED
@@ -15,4 +15,9 @@ class Scrape::DSL
     matches = @sites.map{|site| site.add_match matcher, &proc }
     matches.size == 1 ? matches.first : matches
   end
+
+  def enqueue *urls
+    @application.enqueue *urls
+  end
+  alias_method :queue, :enqueue
 end
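
In a scrapefile the new method reads like this (hypothetical site and selector; queue is an alias of enqueue):

    site "http://example.com"

    queue "http://example.com/archive"   # seed specific urls before the crawl starts

    match "/archive" do |doc|
      doc.css("a[href]").each{|node| puts node["href"] }
    end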
data/lib/scrape/robots_txt.rb CHANGED
@@ -29,6 +29,7 @@ class Scrape::RobotsTxt
   end
 
   def self.parse content
+    return if content.nil?
     rules, user_agent = Hash.new, nil
 
     content.split("\n").each do |line|
@@ -49,7 +50,8 @@ class Scrape::RobotsTxt
   def self.load url, default = true
     url = Addressable::URI.join(url, "/robots.txt") if default
     parse Scrape.open(url)
-  rescue OpenURI::HTTPError
+  rescue Scrape::HTTPError
+    Scrape.logger.warn "Failed to obtain robots.txt: #{url}"
     nil
   end
   public :load
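
With both changes, a missing or unreachable robots.txt degrades gracefully (a sketch; the host is illustrative):

    rules = Scrape::RobotsTxt.load "http://example.com/"
    # GETs http://example.com/robots.txt; if that raises Scrape::HTTPError,
    # a warning is logged and nil comes back, so the crawl proceeds unrestricted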
data/lib/scrape/site.rb CHANGED
@@ -20,7 +20,7 @@ class Scrape::Site
 
   def open url
     headers = Hash.new
-    headers['Set-Cookie'] = options[:cookie].to_s if options.has_key? :cookie
+    headers[:cookie] = cookie if options[:cookie]
     Scrape.open url, headers
   end
 
@@ -31,6 +31,9 @@ class Scrape::Site
     @matches.each{|match| match.invoke doc, url if match =~ url }
 
     doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
+  rescue Scrape::HTTPError => e
+    Scrape.logger.info "Error loading #{url}: #{e.message}"
+    nil
   end
 
   def accept? url
@@ -55,4 +58,18 @@ private
   def disallowed? url
     !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
   end
+
+  def cookie
+    cookie = options[:cookie]
+    case cookie
+    when Hash
+      cookie.map{|name, val| "#{encode(name)}=#{encode(val)}" }.join("; ")
+    when String
+      cookie
+    end
+  end
+
+  def encode str
+    str.to_s.gsub(" ", "%20").gsub(",", "%2C").gsub(";", "%3B")
+  end
 end
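
What the new private helpers produce (hypothetical values): a Hash cookie option is serialized into a single Cookie header value, percent-encoding spaces, commas and semicolons; a String is passed through as-is.

    site = Scrape::Site.new "http://example.com",
      :cookie => {"session id" => "a,b", :theme => "dark"}
    # Site#open now sends the request header:
    #   Cookie: session%20id=a%2Cb; theme=dark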
data/lib/scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Scrape
-  VERSION = '0.2.4' unless defined? ::Scrape::VERSION
+  VERSION = '0.3.0' unless defined? ::Scrape::VERSION
 end
data/scrape.gemspec CHANGED
@@ -20,4 +20,6 @@ Gem::Specification.new do |s|
 
   s.add_development_dependency "nokogiri", "~> 1.5.5"
   s.add_development_dependency "addressable", "~> 2.2.8"
+  s.add_development_dependency "faraday", "~> 0.8.0"
+  s.add_development_dependency "faraday_middleware", "~> 0.8.8"
 end
data/test/test_helper.rb CHANGED
@@ -8,6 +8,9 @@ Bundler.setup(:default, :test)
 
 require "scrape"
 
+# surpress log messages while we're testing
+Scrape.logger = Class.new{ def method_missing name, *args; end }.new
+
 class Scrape::TestCase < MiniTest::Unit::TestCase
   class << self
     def test name, &block
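
The logger replacement above is a null object: an anonymous class whose method_missing swallows info, warn, or any other logger call. The same trick in isolation (illustrative):

    null_logger = Class.new{ def method_missing name, *args; end }.new
    null_logger.info "never printed"   # silently discarded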
data/test/application_test.rb CHANGED
@@ -18,37 +18,33 @@ class ApplicationTest < Scrape::TestCase
   end
 
   test "#[] should return the site that matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com"]
   end
 
   test "#[] should return the site that is relative to the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com/test"]
   end
 
   test "#[] should return nil when no site matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
+    app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_nil app["http://example.net"]
   end
 
-  test "#reset should enqueue the sites that have been defined" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
-    app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
-    app.reset
-    assert_equal ["http://example.com", "http://example.org"], app.queue
-  end
+  # test "#reset should enqueue the sites that have been defined" do
+  #   app = Scrape::Application.new(".")
+  #   app.add_site "http://example.com"
+  #   app.add_site "http://example.org"
+  #   app.reset
+  #   assert_equal ["http://example.com", "http://example.org"], app.queue
+  # end
 
   test "#run should load the specified file" do
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
data/test/dsl_test.rb CHANGED
@@ -40,4 +40,11 @@ class DSLTest < Scrape::TestCase
       dsl.match("test"){|*args|}
     end
   end
+
+  test "#enqueue should add the specified urls to the queue" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.enqueue "http://example.com"
+    assert_equal ["http://example.com"], app.queue
+  end
 end
data/test/scrape_test.rb CHANGED
@@ -1,25 +1,38 @@
 require "test_helper"
 
 class ScrapeTest < Scrape::TestCase
-  test "#user_agent should return default when not set" do
+  test ".user_agent should return default when not set" do
     assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
   end
 
-  test "#load_scrapefile should return a new application" do
+  test ".load_scrapefile should return a new application" do
     app = Scrape.load_scrapefile '.'
     assert_kind_of Scrape::Application, app
   end
 
-  test "#open should send a request to the specified url and return the contents" do
-    stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
-    assert_equal "booyah", Scrape.open("http://example.com")
-  end
-
-  test "#open should set the user agent in the request header" do
+  test ".open should set the user agent in the request header" do
     stub_request(:get, "http://example.com/").
       with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
       to_return(:status => 200, :body => "")
     Scrape.open("http://example.com")
     assert true
   end
+
+  test ".open should redirect when response is indicates redirection" do
+    stub_request(:get, "http://example.com/foo").
+      to_return(:status => 301, :headers => {:location => "http://example.com/bar"})
+    stub_request(:get, "http://example.com/bar").
+      to_return(:status => 200, :body => "booyah")
+    Scrape.open("http://example.com/foo")
+    assert true
+  end
+
+  test ".open should raise error when not successful" do
+    stub_request(:get, "http://example.com/").
+      to_return(:status => 404, :body => "")
+
+    assert_raises Scrape::HTTPError do
+      Scrape.open("http://example.com")
+    end
+  end
 end
data/test/site_test.rb CHANGED
@@ -9,13 +9,22 @@ class SiteTest < Scrape::TestCase
 
   test "#open should include cookie header when cookie option is set" do
     stub_request(:get, "http://www.example.com/").
-      with(:headers => {'Set-Cookie'=>'omnom'}).
+      with(:headers => {'Cookie' => 'omnom'}).
       to_return(:status => 200, :body => "")
 
     site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
     site.open "http://www.example.com"
   end
 
+  test "#open should include cookie header when cookie option is a hash" do
+    stub_request(:get, "http://www.example.com/").
+      with(:headers => {'Cookie' => 'foo=bar'}).
+      to_return(:status => 200, :body => "")
+
+    site = Scrape::Site.new "http://www.example.com", :cookie => {:foo => "bar"}
+    site.open "http://www.example.com"
+  end
+
   test "#parse should return absolute urls that match the site's url" do
     stub_request(:get, "http://www.example.com/test").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.3.0
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-16 00:00:00.000000000 Z
+date: 2012-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -43,6 +43,38 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 2.2.8
+- !ruby/object:Gem::Dependency
+  name: faraday
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+- !ruby/object:Gem::Dependency
+  name: faraday_middleware
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com