scrape 0.2.4 → 0.3.0

@@ -1,13 +1,18 @@
 PATH
   remote: .
   specs:
-    scrape (0.2.4)
+    scrape (0.3.0)
 
 GEM
   remote: http://rubygems.org/
   specs:
     addressable (2.2.8)
     crack (0.3.1)
+    faraday (0.8.1)
+      multipart-post (~> 1.1)
+    faraday_middleware (0.8.8)
+      faraday (>= 0.7.4, < 0.9)
+    multipart-post (1.1.5)
     nokogiri (1.5.5)
     webmock (1.8.7)
       addressable (>= 2.2.7)
@@ -18,6 +23,8 @@ PLATFORMS
 
 DEPENDENCIES
   addressable (~> 2.2.8)
+  faraday (~> 0.8.0)
+  faraday_middleware (~> 0.8.8)
   nokogiri (~> 1.5.5)
   scrape!
   webmock (~> 1.8.7)
data/README.md CHANGED
@@ -13,6 +13,8 @@ end
 
 site "http://www.tumblr.com" # Can define multiple sites
 
+queue "http://www.tumblr.com/tagged" # Add specified urls to scrape
+
 match "/tagged" do |doc|
   # Do what ever we want with the document.
 end
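
For orientation, the snippet above is a Scrapefile. A minimal complete file using the new queue directive might look like this sketch (the tagged url and CSS selector are illustrative, not from the gem):

    # Scrapefile -- hypothetical example based on the README above
    site "http://www.tumblr.com"

    # New in 0.3.0: seed the crawl with explicit urls
    queue "http://www.tumblr.com/tagged/cats"

    match "/tagged" do |doc|
      # doc is a parsed Nokogiri document; the selector is illustrative
      doc.css("a.post").each { |node| puts node["href"] }
    end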
@@ -1,6 +1,8 @@
 require "rubygems"
 require "logger"
-require "open-uri"
+require "addressable/uri"
+require "faraday"
+require "faraday_middleware"
 
 $: << File.dirname(__FILE__)
 
@@ -18,6 +20,7 @@ module Scrape
   autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
   class FileNotFound < Exception; end
+  class HTTPError < StandardError; end
 
   class << self
     attr_writer :user_agent
@@ -38,9 +41,26 @@ module Scrape
       Application.new path
     end
 
-    def open url, headers = {}, &block
-      headers = {"User-Agent" => user_agent}.merge(headers)
-      super(url, headers, &block).read
+    def open url, headers = nil, &block
+      url = Addressable::URI.parse url
+      headers ||= {}
+
+      conn = Faraday.new :url => url.to_s do |faraday|
+        faraday.response :follow_redirects, :cookies => :all, :limit => 3
+        faraday.adapter Faraday.default_adapter
+      end
+
+      conn.headers[:user_agent] = user_agent
+
+      res = conn.get url.request_uri do |req|
+        headers.each{|key, val| req[key] = val }
+      end
+
+      if res.success?
+        res.body
+      else
+        raise HTTPError, res.status
+      end
     end
   end
 end
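
In practice the reworked Scrape.open behaves as follows. This is a sketch based on the hunk above, with example.com standing in for any host:

    # Custom headers are merged into the Faraday request.
    body = Scrape.open "http://example.com", "Accept" => "text/html"

    # faraday_middleware follows up to 3 redirects; a non-success status
    # now raises Scrape::HTTPError instead of OpenURI::HTTPError.
    begin
      Scrape.open "http://example.com/missing"
    rescue Scrape::HTTPError => e
      puts "Request failed with status #{e.message}"
    end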
@@ -3,38 +3,35 @@ class Scrape::Application
 
   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
     @scrapefile = File.expand_path scrapefile
-    @options = options
+    @options = options.dup
     @loader = loader.class == Class ? loader.new(self) : loader
     @sites = {}
-    @queue = []
-    @history = []
+    reset
   end
 
   def run
     load_scrapefile
 
+    @queue = sites.values.map{|site| site.to_s } if @queue.empty?
+
     while url = @queue.shift
       @history << url
-      begin
-        if site = self[url]
-          if urls = site.parse(url)
-            enqueue *urls
-            Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
-          else
-            Scrape.logger.info "Parsed #{url}."
-          end
+      if site = self[url]
+        if urls = site.parse(url)
+          enqueue *urls
+          Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
         else
-          Scrape.logger.info "No rules defined for #{url}"
+          Scrape.logger.info "Parsed #{url}."
         end
-      rescue OpenURI::HTTPError => e
-        Scrape.logger.info "Error loading #{url}: #{e.message}"
+      else
+        Scrape.logger.info "No rules defined for #{url}"
       end
     end
   end
 
   def reset
     @history = []
-    @queue = sites.values.map{|site| site.to_s }
+    @queue = []
   end
 
   def queue
@@ -54,7 +51,7 @@ class Scrape::Application
   def add_site site, options = {}
     case site
     when String
-      site = Scrape::Site.new site, options
+      site = Scrape::Site.new site, options.dup
       @sites.update site.to_s => site
       site
     end
@@ -63,7 +60,6 @@ class Scrape::Application
   def load_scrapefile
     return if @scrapefile_loaded
     loader.load(scrapefile)
-    reset
    @scrapefile_loaded = true
   end
 end
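
Net effect of the application changes, sketched from the diff above: reset no longer seeds the queue from the defined sites; run does, and only when nothing was queued explicitly:

    app = Scrape::Application.new "Scrapefile"  # path is illustrative

    # Urls enqueued beforehand (e.g. via the DSL's queue/enqueue) win:
    app.enqueue "http://www.tumblr.com/tagged"
    app.run  # crawls starting from the queued url

    # Had the queue been empty, run would have fallen back to the root
    # url of every site defined in the Scrapefile.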
@@ -1,5 +1,5 @@
 class Array
   def extract_options!
     last.instance_of?(Hash) ? pop : {}
-  end unless instance_methods.include?(:extract_options!)
+  end unless Array.respond_to?(:extract_options!)
 end
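
For readers unfamiliar with the Rails-style helper guarded here: extract_options! pops a trailing options hash off an argument array, or returns an empty hash. Illustrative values:

    args = ["http://example.com", {:cookie => "omnom"}]
    args.extract_options!  # => {:cookie => "omnom"}; args is now ["http://example.com"]

    ["http://example.com"].extract_options!  # => {} (no trailing hash, nothing popped)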
@@ -2,5 +2,5 @@ class String
   def starts_with str
     str = str.to_str
     self[0, str.length] == str
-  end unless instance_methods.include?(:starts_with)
+  end unless String.respond_to?(:starts_with)
 end
@@ -15,4 +15,9 @@ class Scrape::DSL
     matches = @sites.map{|site| site.add_match matcher, &proc }
     matches.size == 1 ? matches.first : matches
   end
+
+  def enqueue *urls
+    @application.enqueue *urls
+  end
+  alias_method :queue, :enqueue
 end
@@ -29,6 +29,7 @@ class Scrape::RobotsTxt
   end
 
   def self.parse content
+    return if content.nil?
     rules, user_agent = Hash.new, nil
 
     content.split("\n").each do |line|
@@ -49,7 +50,8 @@ class Scrape::RobotsTxt
   def self.load url, default = true
     url = Addressable::URI.join(url, "/robots.txt") if default
     parse Scrape.open(url)
-  rescue OpenURI::HTTPError
+  rescue Scrape::HTTPError
+    Scrape.logger.warn "Failed to obtain robots.txt: #{url}"
     nil
   end
   public :load
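
Behaviorally, a failed robots.txt fetch now logs a warning and returns nil, which the nil guard added to parse tolerates. A sketch, assuming a host whose /robots.txt returns 404:

    rules = Scrape::RobotsTxt.load "http://example.com"
    rules  # => nil, after "Failed to obtain robots.txt: ..." is logged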
@@ -20,7 +20,7 @@ class Scrape::Site
 
   def open url
     headers = Hash.new
-    headers['Set-Cookie'] = options[:cookie].to_s if options.has_key? :cookie
+    headers[:cookie] = cookie if options[:cookie]
     Scrape.open url, headers
   end
 
@@ -31,6 +31,9 @@ class Scrape::Site
     @matches.each{|match| match.invoke doc, url if match =~ url }
 
     doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
+  rescue Scrape::HTTPError => e
+    Scrape.logger.info "Error loading #{url}: #{e.message}"
+    nil
   end
 
   def accept? url
@@ -55,4 +58,18 @@ private
   def disallowed? url
     !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
   end
+
+  def cookie
+    cookie = options[:cookie]
+    case cookie
+    when Hash
+      cookie.map{|name, val| "#{encode(name)}=#{encode(val)}" }.join("; ")
+    when String
+      cookie
+    end
+  end
+
+  def encode str
+    str.to_s.gsub(" ", "%20").gsub(",", "%2C").gsub(";", "%3B")
+  end
 end
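
A usage sketch of the new cookie handling (cookie values illustrative): string cookies pass through unchanged, while hashes are serialized as name=value pairs joined with "; ", percent-encoding spaces, commas, and semicolons:

    site = Scrape::Site.new "http://www.example.com",
                            :cookie => {:session => "abc 123", :theme => "dark"}
    site.open "http://www.example.com"
    # sends the request header  Cookie: session=abc%20123; theme=dark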
@@ -1,3 +1,3 @@
 module Scrape
-  VERSION = '0.2.4' unless defined? ::Scrape::VERSION
+  VERSION = '0.3.0' unless defined? ::Scrape::VERSION
 end
@@ -20,4 +20,6 @@ Gem::Specification.new do |s|
 
   s.add_development_dependency "nokogiri", "~> 1.5.5"
   s.add_development_dependency "addressable", "~> 2.2.8"
+  s.add_development_dependency "faraday", "~> 0.8.0"
+  s.add_development_dependency "faraday_middleware", "~> 0.8.8"
 end
@@ -8,6 +8,9 @@ Bundler.setup(:default, :test)
 
 require "scrape"
 
+# suppress log messages while we're testing
+Scrape.logger = Class.new{ def method_missing name, *args; end }.new
+
 class Scrape::TestCase < MiniTest::Unit::TestCase
   class << self
     def test name, &block
@@ -18,37 +18,33 @@ class ApplicationTest < Scrape::TestCase
   end
 
   test "#[] should return the site that matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com"]
   end
 
   test "#[] should return the site that is relative to the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com/test"]
   end
 
   test "#[] should return nil when no site matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
+    app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_nil app["http://example.net"]
   end
 
-  test "#reset should enqueue the sites that have been defined" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
-    app = Scrape::Application.new(".")
-    app.sites.update site1.to_s => site1, site2.to_s => site2
-    app.reset
-    assert_equal ["http://example.com", "http://example.org"], app.queue
-  end
+  # test "#reset should enqueue the sites that have been defined" do
+  #   app = Scrape::Application.new(".")
+  #   app.add_site "http://example.com"
+  #   app.add_site "http://example.org"
+  #   app.reset
+  #   assert_equal ["http://example.com", "http://example.org"], app.queue
+  # end
 
   test "#run should load the specified file" do
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
@@ -40,4 +40,11 @@ class DSLTest < Scrape::TestCase
       dsl.match("test"){|*args|}
     end
   end
+
+  test "#enqueue should add the specified urls to the queue" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.enqueue "http://example.com"
+    assert_equal ["http://example.com"], app.queue
+  end
 end
@@ -1,25 +1,38 @@
 require "test_helper"
 
 class ScrapeTest < Scrape::TestCase
-  test "#user_agent should return default when not set" do
+  test ".user_agent should return default when not set" do
     assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
   end
 
-  test "#load_scrapefile should return a new application" do
+  test ".load_scrapefile should return a new application" do
     app = Scrape.load_scrapefile '.'
     assert_kind_of Scrape::Application, app
   end
 
-  test "#open should send a request to the specified url and return the contents" do
-    stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
-    assert_equal "booyah", Scrape.open("http://example.com")
-  end
-
-  test "#open should set the user agent in the request header" do
+  test ".open should set the user agent in the request header" do
     stub_request(:get, "http://example.com/").
       with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
       to_return(:status => 200, :body => "")
     Scrape.open("http://example.com")
     assert true
   end
+
+  test ".open should redirect when the response indicates redirection" do
+    stub_request(:get, "http://example.com/foo").
+      to_return(:status => 301, :headers => {:location => "http://example.com/bar"})
+    stub_request(:get, "http://example.com/bar").
+      to_return(:status => 200, :body => "booyah")
+    Scrape.open("http://example.com/foo")
+    assert true
+  end
+
+  test ".open should raise error when not successful" do
+    stub_request(:get, "http://example.com/").
+      to_return(:status => 404, :body => "")
+
+    assert_raises Scrape::HTTPError do
+      Scrape.open("http://example.com")
+    end
+  end
 end
@@ -9,13 +9,22 @@ class SiteTest < Scrape::TestCase
 
   test "#open should include cookie header when cookie option is set" do
     stub_request(:get, "http://www.example.com/").
-      with(:headers => {'Set-Cookie'=>'omnom'}).
+      with(:headers => {'Cookie' => 'omnom'}).
       to_return(:status => 200, :body => "")
 
     site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
     site.open "http://www.example.com"
   end
 
+  test "#open should include cookie header when cookie option is a hash" do
+    stub_request(:get, "http://www.example.com/").
+      with(:headers => {'Cookie' => 'foo=bar'}).
+      to_return(:status => 200, :body => "")
+
+    site = Scrape::Site.new "http://www.example.com", :cookie => {:foo => "bar"}
+    site.open "http://www.example.com"
+  end
+
   test "#parse should return absolute urls that match the site's url" do
     stub_request(:get, "http://www.example.com/test").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.3.0
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-16 00:00:00.000000000 Z
+date: 2012-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -43,6 +43,38 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
        version: 2.2.8
+- !ruby/object:Gem::Dependency
+  name: faraday
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+- !ruby/object:Gem::Dependency
+  name: faraday_middleware
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com