scrape 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +8 -1
- data/README.md +2 -0
- data/lib/scrape.rb +24 -4
- data/lib/scrape/application.rb +13 -17
- data/lib/scrape/core_ext/array.rb +1 -1
- data/lib/scrape/core_ext/string.rb +1 -1
- data/lib/scrape/dsl.rb +5 -0
- data/lib/scrape/robots_txt.rb +3 -1
- data/lib/scrape/site.rb +18 -1
- data/lib/scrape/version.rb +1 -1
- data/scrape.gemspec +2 -0
- data/test/test_helper.rb +3 -0
- data/test/unit/application_test.rb +13 -17
- data/test/unit/dsl_test.rb +7 -0
- data/test/unit/scrape_test.rb +21 -8
- data/test/unit/site_test.rb +10 -1
- metadata +34 -2
data/Gemfile.lock
CHANGED
@@ -1,13 +1,18 @@
 PATH
   remote: .
   specs:
-    scrape (0.2.4)
+    scrape (0.3.0)

 GEM
   remote: http://rubygems.org/
   specs:
     addressable (2.2.8)
     crack (0.3.1)
+    faraday (0.8.1)
+      multipart-post (~> 1.1)
+    faraday_middleware (0.8.8)
+      faraday (>= 0.7.4, < 0.9)
+    multipart-post (1.1.5)
     nokogiri (1.5.5)
     webmock (1.8.7)
       addressable (>= 2.2.7)
@@ -18,6 +23,8 @@ PLATFORMS

 DEPENDENCIES
   addressable (~> 2.2.8)
+  faraday (~> 0.8.0)
+  faraday_middleware (~> 0.8.8)
   nokogiri (~> 1.5.5)
   scrape!
   webmock (~> 1.8.7)
data/README.md
CHANGED
data/lib/scrape.rb
CHANGED
@@ -1,6 +1,8 @@
 require "rubygems"
 require "logger"
-require "…
+require "addressable/uri"
+require "faraday"
+require "faraday_middleware"

 $: << File.dirname(__FILE__)

@@ -18,6 +20,7 @@ module Scrape
   autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'

   class FileNotFound < Exception; end
+  class HTTPError < StandardError; end

   class << self
     attr_writer :user_agent
@@ -38,9 +41,26 @@ module Scrape
       Application.new path
     end

-    def open url, headers = …
-      …
-      …
+    def open url, headers = nil, &block
+      url = Addressable::URI.parse url
+      headers ||= {}
+
+      conn = Faraday.new :url => url.to_s do |faraday|
+        faraday.response :follow_redirects, :cookies => :all, :limit => 3
+        faraday.adapter Faraday.default_adapter
+      end
+
+      conn.headers[:user_agent] = user_agent
+
+      res = conn.get url.request_uri do |req|
+        headers.each{|key, val| req[key] = val }
+      end
+
+      if res.success?
+        res.body
+      else
+        raise HTTPError, res.status
+      end
     end
   end
 end
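The practical effect of the Faraday-backed Scrape.open above: up to three redirects are followed transparently, and a non-successful response now raises Scrape::HTTPError instead of failing quietly. A minimal usage sketch, assuming only what the diff shows (the URL and Accept header are placeholders):

    require "scrape"

    begin
      # Scrape.open returns the response body on success
      body = Scrape.open "http://example.com/", "Accept" => "text/html"
      puts body
    rescue Scrape::HTTPError => e
      # the exception message carries the status passed to raise
      warn "Request failed: #{e.message}"
    end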
data/lib/scrape/application.rb
CHANGED
@@ -3,38 +3,35 @@ class Scrape::Application

   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
     @scrapefile = File.expand_path scrapefile
-    @options = options
+    @options = options.dup
     @loader = loader.class == Class ? loader.new(self) : loader
     @sites = {}
-
-    @history = []
+    reset
   end

   def run
     load_scrapefile

+    @queue = sites.values.map{|site| site.to_s } if @queue.empty?
+
     while url = @queue.shift
       @history << url
-
-      if …
-        …
-        …
-        Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
-      else
-        Scrape.logger.info "Parsed #{url}."
-      end
+      if site = self[url]
+        if urls = site.parse(url)
+          enqueue *urls
+          Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
         else
-          Scrape.logger.info "…
+          Scrape.logger.info "Parsed #{url}."
         end
-
-        Scrape.logger.info "…
+      else
+        Scrape.logger.info "No rules defined for #{url}"
       end
     end
   end

   def reset
     @history = []
-    @queue = …
+    @queue = []
   end

   def queue
@@ -54,7 +51,7 @@ class Scrape::Application
   def add_site site, options = {}
     case site
     when String
-      site = Scrape::Site.new site, options
+      site = Scrape::Site.new site, options.dup
       @sites.update site.to_s => site
       site
     end
@@ -63,7 +60,6 @@ class Scrape::Application
   def load_scrapefile
     return if @scrapefile_loaded
     loader.load(scrapefile)
-    reset
     @scrapefile_loaded = true
   end
 end
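Net effect of the run-loop rework above: reset no longer rebuilds the queue from the registered sites; instead, run seeds the queue from the sites only when it is empty, so URLs queued ahead of time are no longer clobbered. A sketch under that reading (the scrapefile path and URLs are placeholders, and Application#enqueue is assumed to be public, as the DSL test further below suggests):

    app = Scrape::Application.new "Scrapefile"
    app.enqueue "http://example.com/deep-link"  # survives; pre-seeded work is kept
    app.run  # falls back to the registered sites only if nothing was enqueued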
data/lib/scrape/dsl.rb
CHANGED
data/lib/scrape/robots_txt.rb
CHANGED
@@ -29,6 +29,7 @@ class Scrape::RobotsTxt
   end

   def self.parse content
+    return if content.nil?
     rules, user_agent = Hash.new, nil

     content.split("\n").each do |line|
@@ -49,7 +50,8 @@ class Scrape::RobotsTxt
   def self.load url, default = true
     url = Addressable::URI.join(url, "/robots.txt") if default
     parse Scrape.open(url)
-  rescue …
+  rescue Scrape::HTTPError
+    Scrape.logger.warn "Failed to obtain robots.txt: #{url}"
     nil
   end
   public :load
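Combined with the nil guard added to parse, the narrower rescue means a site without a reachable robots.txt is logged and treated as having no rules, while unrelated errors now propagate instead of being swallowed. A sketch of the failure path (placeholder URL):

    rules = Scrape::RobotsTxt.load "http://example.com/"
    # If the fetch raised Scrape::HTTPError, load warned and returned nil;
    # RobotsTxt.parse likewise returns nil when handed nil content.
    rules.nil?  # => true on failure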
data/lib/scrape/site.rb
CHANGED
@@ -20,7 +20,7 @@ class Scrape::Site

   def open url
     headers = Hash.new
-    headers[…
+    headers[:cookie] = cookie if options[:cookie]
     Scrape.open url, headers
   end

@@ -31,6 +31,9 @@ class Scrape::Site
     @matches.each{|match| match.invoke doc, url if match =~ url }

     doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
+  rescue Scrape::HTTPError => e
+    Scrape.logger.info "Error loading #{url}: #{e.message}"
+    nil
   end

   def accept? url
@@ -55,4 +58,18 @@ private
   def disallowed? url
     !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
   end
+
+  def cookie
+    cookie = options[:cookie]
+    case cookie
+    when Hash
+      cookie.map{|name, val| "#{encode(name)}=#{encode(val)}" }.join("; ")
+    when String
+      cookie
+    end
+  end
+
+  def encode str
+    str.to_s.gsub(" ", "%20").gsub(",", "%2C").gsub(";", "%3B")
+  end
 end
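A sketch of what the new private helpers produce for the :cookie option (values are illustrative; send is needed because cookie is private):

    site = Scrape::Site.new "http://example.com", :cookie => {:session => "abc 123"}
    site.send(:cookie)  # => "session=abc%20123" (spaces, commas, semicolons are escaped)

    site = Scrape::Site.new "http://example.com", :cookie => "raw=value"
    site.send(:cookie)  # => "raw=value" (strings pass through unchanged)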
data/lib/scrape/version.rb
CHANGED
data/scrape.gemspec
CHANGED
@@ -20,4 +20,6 @@ Gem::Specification.new do |s|

   s.add_development_dependency "nokogiri", "~> 1.5.5"
   s.add_development_dependency "addressable", "~> 2.2.8"
+  s.add_development_dependency "faraday", "~> 0.8.0"
+  s.add_development_dependency "faraday_middleware", "~> 0.8.8"
 end
data/test/test_helper.rb
CHANGED
@@ -8,6 +8,9 @@ Bundler.setup(:default, :test)

 require "scrape"

+# suppress log messages while we're testing
+Scrape.logger = Class.new{ def method_missing name, *args; end }.new
+
 class Scrape::TestCase < MiniTest::Unit::TestCase
   class << self
     def test name, &block
data/test/unit/application_test.rb
CHANGED
@@ -18,37 +18,33 @@ class ApplicationTest < Scrape::TestCase
   end

   test "#[] should return the site that matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com"]
   end

   test "#[] should return the site that is relative to the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com/test"]
   end

   test "#[] should return nil when no site matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.…
+    app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_nil app["http://example.net"]
   end

-  test "#reset should enqueue the sites that have been defined" do
-    …
-    …
-    …
-    …
-    …
-    …
-  end
+  # test "#reset should enqueue the sites that have been defined" do
+  #   app = Scrape::Application.new(".")
+  #   app.add_site "http://example.com"
+  #   app.add_site "http://example.org"
+  #   app.reset
+  #   assert_equal ["http://example.com", "http://example.org"], app.queue
+  # end

   test "#run should load the specified file" do
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
data/test/unit/dsl_test.rb
CHANGED
@@ -40,4 +40,11 @@ class DSLTest < Scrape::TestCase
       dsl.match("test"){|*args|}
     end
   end
+
+  test "#enqueue should add the specified urls to the queue" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.enqueue "http://example.com"
+    assert_equal ["http://example.com"], app.queue
+  end
 end
data/test/unit/scrape_test.rb
CHANGED
@@ -1,25 +1,38 @@
 require "test_helper"

 class ScrapeTest < Scrape::TestCase
-  test "…
+  test ".user_agent should return default when not set" do
     assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
   end

-  test "…
+  test ".load_scrapefile should return a new application" do
     app = Scrape.load_scrapefile '.'
     assert_kind_of Scrape::Application, app
   end

-  test "…
-    stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
-    assert_equal "booyah", Scrape.open("http://example.com")
-  end
-
-  test "#open should set the user agent in the request header" do
+  test ".open should set the user agent in the request header" do
     stub_request(:get, "http://example.com/").
       with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
       to_return(:status => 200, :body => "")
     Scrape.open("http://example.com")
     assert true
   end
+
+  test ".open should redirect when response indicates redirection" do
+    stub_request(:get, "http://example.com/foo").
+      to_return(:status => 301, :headers => {:location => "http://example.com/bar"})
+    stub_request(:get, "http://example.com/bar").
+      to_return(:status => 200, :body => "booyah")
+    Scrape.open("http://example.com/foo")
+    assert true
+  end
+
+  test ".open should raise error when not successful" do
+    stub_request(:get, "http://example.com/").
+      to_return(:status => 404, :body => "")
+
+    assert_raises Scrape::HTTPError do
+      Scrape.open("http://example.com")
+    end
+  end
 end
data/test/unit/site_test.rb
CHANGED
@@ -9,13 +9,22 @@ class SiteTest < Scrape::TestCase

   test "#open should include cookie header when cookie option is set" do
     stub_request(:get, "http://www.example.com/").
-      with(:headers => {'…
+      with(:headers => {'Cookie' => 'omnom'}).
       to_return(:status => 200, :body => "")

     site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
     site.open "http://www.example.com"
   end

+  test "#open should include cookie header when cookie option is a hash" do
+    stub_request(:get, "http://www.example.com/").
+      with(:headers => {'Cookie' => 'foo=bar'}).
+      to_return(:status => 200, :body => "")
+
+    site = Scrape::Site.new "http://www.example.com", :cookie => {:foo => "bar"}
+    site.open "http://www.example.com"
+  end
+
   test "#parse should return absolute urls that match the site's url" do
     stub_request(:get, "http://www.example.com/test").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.3.0
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-…
+date: 2012-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -43,6 +43,38 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 2.2.8
+- !ruby/object:Gem::Dependency
+  name: faraday
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+- !ruby/object:Gem::Dependency
+  name: faraday_middleware
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com