scrape 0.2.4 → 0.3.0
- data/Gemfile.lock +8 -1
- data/README.md +2 -0
- data/lib/scrape.rb +24 -4
- data/lib/scrape/application.rb +13 -17
- data/lib/scrape/core_ext/array.rb +1 -1
- data/lib/scrape/core_ext/string.rb +1 -1
- data/lib/scrape/dsl.rb +5 -0
- data/lib/scrape/robots_txt.rb +3 -1
- data/lib/scrape/site.rb +18 -1
- data/lib/scrape/version.rb +1 -1
- data/scrape.gemspec +2 -0
- data/test/test_helper.rb +3 -0
- data/test/unit/application_test.rb +13 -17
- data/test/unit/dsl_test.rb +7 -0
- data/test/unit/scrape_test.rb +21 -8
- data/test/unit/site_test.rb +10 -1
- metadata +34 -2
data/Gemfile.lock CHANGED
@@ -1,13 +1,18 @@
 PATH
   remote: .
   specs:
-    scrape (0.2.4)
+    scrape (0.3.0)
 
 GEM
   remote: http://rubygems.org/
   specs:
     addressable (2.2.8)
     crack (0.3.1)
+    faraday (0.8.1)
+      multipart-post (~> 1.1)
+    faraday_middleware (0.8.8)
+      faraday (>= 0.7.4, < 0.9)
+    multipart-post (1.1.5)
     nokogiri (1.5.5)
     webmock (1.8.7)
       addressable (>= 2.2.7)
@@ -18,6 +23,8 @@ PLATFORMS
 
 DEPENDENCIES
   addressable (~> 2.2.8)
+  faraday (~> 0.8.0)
+  faraday_middleware (~> 0.8.8)
   nokogiri (~> 1.5.5)
   scrape!
   webmock (~> 1.8.7)
data/README.md CHANGED
data/lib/scrape.rb CHANGED
@@ -1,6 +1,8 @@
 require "rubygems"
 require "logger"
-require "
+require "addressable/uri"
+require "faraday"
+require "faraday_middleware"
 
 $: << File.dirname(__FILE__)
 
@@ -18,6 +20,7 @@ module Scrape
   autoload 'RobotsTxtRules', 'scrape/robots_txt_rules'
 
   class FileNotFound < Exception; end
+  class HTTPError < StandardError; end
 
   class << self
     attr_writer :user_agent
@@ -38,9 +41,26 @@ module Scrape
       Application.new path
     end
 
-    def open url, headers =
-
-
+    def open url, headers = nil, &block
+      url = Addressable::URI.parse url
+      headers ||= {}
+
+      conn = Faraday.new :url => url.to_s do |faraday|
+        faraday.response :follow_redirects, :cookies => :all, :limit => 3
+        faraday.adapter Faraday.default_adapter
+      end
+
+      conn.headers[:user_agent] = user_agent
+
+      res = conn.get url.request_uri do |req|
+        headers.each{|key, val| req[key] = val }
+      end
+
+      if res.success?
+        res.body
+      else
+        raise HTTPError, res.status
+      end
     end
   end
 end
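Scrape.open is now backed by Faraday: it parses the URL with Addressable, follows up to three redirects while carrying cookies, sends the configured user agent, and raises Scrape::HTTPError with the response status when the request fails. A minimal usage sketch (hypothetical URLs):

    require "scrape"

    # Successful responses return the body as a string.
    body = Scrape.open "http://example.com/", "Accept" => "text/html"

    # Non-2xx responses now raise instead of returning an error page.
    begin
      Scrape.open "http://example.com/missing"
    rescue Scrape::HTTPError => e
      puts "Request failed with status #{e.message}"  # e.g. "404"
    end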
data/lib/scrape/application.rb CHANGED
@@ -3,38 +3,35 @@ class Scrape::Application
 
   def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
     @scrapefile = File.expand_path scrapefile
-    @options = options
+    @options = options.dup
     @loader = loader.class == Class ? loader.new(self) : loader
     @sites = {}
-
-    @history = []
+    reset
   end
 
   def run
     load_scrapefile
 
+    @queue = sites.values.map{|site| site.to_s } if @queue.empty?
+
     while url = @queue.shift
       @history << url
-
-      if
-
-
-        Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
-      else
-        Scrape.logger.info "Parsed #{url}."
-      end
+      if site = self[url]
+        if urls = site.parse(url)
+          enqueue *urls
+          Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
         else
-        Scrape.logger.info "
+          Scrape.logger.info "Parsed #{url}."
         end
-
-      Scrape.logger.info "
+      else
+        Scrape.logger.info "No rules defined for #{url}"
       end
     end
   end
 
   def reset
     @history = []
-    @queue =
+    @queue = []
   end
 
   def queue
@@ -54,7 +51,7 @@ class Scrape::Application
   def add_site site, options = {}
     case site
     when String
-      site = Scrape::Site.new site, options
+      site = Scrape::Site.new site, options.dup
       @sites.update site.to_s => site
       site
     end
@@ -63,7 +60,6 @@ class Scrape::Application
   def load_scrapefile
     return if @scrapefile_loaded
     loader.load(scrapefile)
-    reset
     @scrapefile_loaded = true
   end
 end
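The run loop now seeds the queue from the defined sites only when nothing has been enqueued explicitly, and URLs that no site claims are logged and skipped rather than parsed. A behavior sketch (hypothetical scrapefile path and site):

    app = Scrape::Application.new "Scrapefile"
    app.add_site "http://example.com"  # returns the new Scrape::Site

    # run seeds @queue with ["http://example.com"], then parses each URL,
    # enqueueing whatever URLs the matching site discovers along the way.
    app.run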
data/lib/scrape/dsl.rb CHANGED
data/lib/scrape/robots_txt.rb CHANGED
@@ -29,6 +29,7 @@ class Scrape::RobotsTxt
   end
 
   def self.parse content
+    return if content.nil?
     rules, user_agent = Hash.new, nil
 
     content.split("\n").each do |line|
@@ -49,7 +50,8 @@ class Scrape::RobotsTxt
   def self.load url, default = true
     url = Addressable::URI.join(url, "/robots.txt") if default
     parse Scrape.open(url)
-  rescue
+  rescue Scrape::HTTPError
+    Scrape.logger.warn "Failed to obtain robots.txt: #{url}"
     nil
   end
   public :load
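Together these changes make a missing robots.txt non-fatal: parse tolerates nil content, and load converts a Scrape::HTTPError into a logged warning and a nil return. For example (hypothetical host):

    # If http://example.com/robots.txt returns e.g. a 404, this now logs
    # "Failed to obtain robots.txt: ..." and yields nil instead of raising.
    rules = Scrape::RobotsTxt.load "http://example.com"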
data/lib/scrape/site.rb CHANGED
@@ -20,7 +20,7 @@ class Scrape::Site
 
   def open url
     headers = Hash.new
-    headers[
+    headers[:cookie] = cookie if options[:cookie]
     Scrape.open url, headers
   end
 
@@ -31,6 +31,9 @@ class Scrape::Site
     @matches.each{|match| match.invoke doc, url if match =~ url }
 
     doc.css("a[href]").map{|node| normalize node['href'], url }.select{|url| accept? url }
+  rescue Scrape::HTTPError => e
+    Scrape.logger.info "Error loading #{url}: #{e.message}"
+    nil
   end
 
   def accept? url
@@ -55,4 +58,18 @@ private
   def disallowed? url
     !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
   end
+
+  def cookie
+    cookie = options[:cookie]
+    case cookie
+    when Hash
+      cookie.map{|name, val| "#{encode(name)}=#{encode(val)}" }.join("; ")
+    when String
+      cookie
+    end
+  end
+
+  def encode str
+    str.to_s.gsub(" ", "%20").gsub(",", "%2C").gsub(";", "%3B")
+  end
 end
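The new private #cookie helper means the :cookie option can be either a ready-made header string or a Hash, which is serialized into a single Cookie header with spaces, commas, and semicolons percent-encoded by #encode. A sketch (hypothetical cookie values):

    site = Scrape::Site.new "http://example.com",
                            :cookie => {:session => "abc 123", :theme => "dark"}

    # Sends the request header "Cookie: session=abc%20123; theme=dark".
    site.open "http://example.com"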
data/lib/scrape/version.rb CHANGED
data/scrape.gemspec CHANGED
@@ -20,4 +20,6 @@ Gem::Specification.new do |s|
 
   s.add_development_dependency "nokogiri", "~> 1.5.5"
   s.add_development_dependency "addressable", "~> 2.2.8"
+  s.add_development_dependency "faraday", "~> 0.8.0"
+  s.add_development_dependency "faraday_middleware", "~> 0.8.8"
 end
data/test/test_helper.rb CHANGED
@@ -8,6 +8,9 @@ Bundler.setup(:default, :test)
 
 require "scrape"
 
+# suppress log messages while we're testing
+Scrape.logger = Class.new{ def method_missing name, *args; end }.new
+
 class Scrape::TestCase < MiniTest::Unit::TestCase
   class << self
     def test name, &block
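The helper silences Scrape's logger with an anonymous null object whose method_missing swallows every call. An equivalent alternative (a sketch, not what the helper ships; assumes Ruby 1.9.3+ where File::NULL exists) is a real Logger writing to the null device:

    require "logger"

    # Discard all log output by logging to the OS null device.
    Scrape.logger = Logger.new File::NULL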
data/test/unit/application_test.rb CHANGED
@@ -18,37 +18,33 @@ class ApplicationTest < Scrape::TestCase
   end
 
   test "#[] should return the site that matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com"]
   end
 
   test "#[] should return the site that is relative to the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-
+    site1 = app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_equal site1, app["http://example.com/test"]
   end
 
   test "#[] should return nil when no site matches the given url" do
-    site1 = Scrape::Site.new "http://example.com"
-    site2 = Scrape::Site.new "http://example.org"
     app = Scrape::Application.new(".")
-    app.
+    app.add_site "http://example.com"
+    app.add_site "http://example.org"
     assert_nil app["http://example.net"]
   end
 
-  test "#reset should enqueue the sites that have been defined" do
-
-
-
-
-
-
-  end
+  # test "#reset should enqueue the sites that have been defined" do
+  #   app = Scrape::Application.new(".")
+  #   app.add_site "http://example.com"
+  #   app.add_site "http://example.org"
+  #   app.reset
+  #   assert_equal ["http://example.com", "http://example.org"], app.queue
+  # end
 
   test "#run should load the specified file" do
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
data/test/unit/dsl_test.rb CHANGED
@@ -40,4 +40,11 @@ class DSLTest < Scrape::TestCase
       dsl.match("test"){|*args|}
     end
   end
+
+  test "#enqueue should add the specified urls to the queue" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.enqueue "http://example.com"
+    assert_equal ["http://example.com"], app.queue
+  end
 end
data/test/unit/scrape_test.rb CHANGED
@@ -1,25 +1,38 @@
 require "test_helper"
 
 class ScrapeTest < Scrape::TestCase
-  test "
+  test ".user_agent should return default when not set" do
     assert_equal Scrape.user_agent, "Scrape/#{Scrape::VERSION}"
   end
 
-  test "
+  test ".load_scrapefile should return a new application" do
     app = Scrape.load_scrapefile '.'
     assert_kind_of Scrape::Application, app
   end
 
-  test "
-    stub_request(:get, "http://example.com/").to_return(:status => 200, :body => "booyah")
-    assert_equal "booyah", Scrape.open("http://example.com")
-  end
-
-  test "#open should set the user agent in the request header" do
+  test ".open should set the user agent in the request header" do
     stub_request(:get, "http://example.com/").
       with(:headers => {"User-Agent" => "Scrape/#{Scrape::VERSION}"}).
       to_return(:status => 200, :body => "")
     Scrape.open("http://example.com")
     assert true
   end
+
+  test ".open should redirect when response indicates redirection" do
+    stub_request(:get, "http://example.com/foo").
+      to_return(:status => 301, :headers => {:location => "http://example.com/bar"})
+    stub_request(:get, "http://example.com/bar").
+      to_return(:status => 200, :body => "booyah")
+    Scrape.open("http://example.com/foo")
+    assert true
+  end
+
+  test ".open should raise error when not successful" do
+    stub_request(:get, "http://example.com/").
+      to_return(:status => 404, :body => "")
+
+    assert_raises Scrape::HTTPError do
+      Scrape.open("http://example.com")
+    end
+  end
 end
data/test/unit/site_test.rb CHANGED
@@ -9,13 +9,22 @@ class SiteTest < Scrape::TestCase
 
   test "#open should include cookie header when cookie option is set" do
     stub_request(:get, "http://www.example.com/").
-      with(:headers => {'
+      with(:headers => {'Cookie' => 'omnom'}).
       to_return(:status => 200, :body => "")
 
     site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
     site.open "http://www.example.com"
   end
 
+  test "#open should include cookie header when cookie option is a hash" do
+    stub_request(:get, "http://www.example.com/").
+      with(:headers => {'Cookie' => 'foo=bar'}).
+      to_return(:status => 200, :body => "")
+
+    site = Scrape::Site.new "http://www.example.com", :cookie => {:foo => "bar"}
+    site.open "http://www.example.com"
+  end
+
   test "#parse should return absolute urls that match the site's url" do
     stub_request(:get, "http://www.example.com/test").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.3.0
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-
+date: 2012-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -43,6 +43,38 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 2.2.8
+- !ruby/object:Gem::Dependency
+  name: faraday
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.0
+- !ruby/object:Gem::Dependency
+  name: faraday_middleware
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.8
 description: An easy to use utility to scrape websites using a DSL similar to rake.
 email:
 - evilmarty@gmail.com