scrape 0.2.2 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -1
- data/Gemfile.lock +1 -1
- data/bin/scrape +1 -3
- data/lib/scrape/application.rb +25 -21
- data/lib/scrape/cli.rb +1 -1
- data/lib/scrape/core_ext/array.rb +5 -0
- data/lib/scrape/{string_ext.rb → core_ext/string.rb} +0 -0
- data/lib/scrape/default_loader.rb +7 -14
- data/lib/scrape/dsl.rb +11 -6
- data/lib/scrape/robots_txt.rb +2 -0
- data/lib/scrape/site.rb +11 -10
- data/lib/scrape/version.rb +1 -1
- data/lib/scrape.rb +5 -4
- data/scrape.gemspec +1 -1
- data/test/unit/application_test.rb +4 -6
- data/test/unit/default_loader_test.rb +17 -13
- data/test/unit/dsl_test.rb +43 -0
- data/test/unit/robots_txt_test.rb +8 -0
- data/test/unit/site_test.rb +9 -0
- metadata +6 -3
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/bin/scrape
CHANGED
data/lib/scrape/application.rb
CHANGED
@@ -1,30 +1,33 @@
|
|
1
1
|
class Scrape::Application
|
2
|
-
attr_reader :scrapefile, :loader, :sites, :history, :
|
2
|
+
attr_reader :scrapefile, :loader, :sites, :history, :options
|
3
3
|
|
4
|
-
def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
|
4
|
+
def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
|
5
5
|
@scrapefile = File.expand_path scrapefile
|
6
|
-
@
|
6
|
+
@options = options
|
7
|
+
@loader = loader.class == Class ? loader.new(self) : loader
|
7
8
|
@sites = {}
|
8
9
|
@queue = []
|
9
10
|
@history = []
|
10
|
-
@ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
|
11
11
|
end
|
12
12
|
|
13
13
|
def run
|
14
14
|
load_scrapefile
|
15
15
|
|
16
16
|
while url = @queue.shift
|
17
|
-
Scrape.logger.info "Loading: #{url}..."
|
18
17
|
@history << url
|
19
|
-
|
20
|
-
if
|
21
|
-
|
22
|
-
|
18
|
+
begin
|
19
|
+
if site = self[url]
|
20
|
+
if urls = site.parse(url)
|
21
|
+
enqueue *urls
|
22
|
+
Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
|
23
|
+
else
|
24
|
+
Scrape.logger.info "Parsed #{url}."
|
25
|
+
end
|
23
26
|
else
|
24
|
-
Scrape.logger.info "
|
27
|
+
Scrape.logger.info "No rules defined for #{url}"
|
25
28
|
end
|
26
|
-
|
27
|
-
Scrape.logger.info "
|
29
|
+
rescue OpenURI::HTTPError => e
|
30
|
+
Scrape.logger.info "Error loading #{url}: #{e.message}"
|
28
31
|
end
|
29
32
|
end
|
30
33
|
end
|
@@ -44,21 +47,22 @@ class Scrape::Application
|
|
44
47
|
end
|
45
48
|
end
|
46
49
|
|
47
|
-
def ignore_robots_txt= bool
|
48
|
-
sites.each{|_, site| site.ignore_robots_txt = bool }
|
49
|
-
@ignore_robots_txt = bool
|
50
|
-
end
|
51
|
-
|
52
50
|
def [] url
|
53
51
|
@sites.values.detect{|site| site.accept? url }
|
54
52
|
end
|
55
53
|
|
54
|
+
def add_site site, options = {}
|
55
|
+
case site
|
56
|
+
when String
|
57
|
+
site = Scrape::Site.new site, options
|
58
|
+
@sites.update site.to_s => site
|
59
|
+
site
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
56
63
|
def load_scrapefile
|
57
64
|
return if @scrapefile_loaded
|
58
|
-
|
59
|
-
result = loader.load scrapefile
|
60
|
-
@sites.update result if result.is_a? Hash
|
61
|
-
self.ignore_robots_txt = ignore_robots_txt
|
65
|
+
loader.load(scrapefile)
|
62
66
|
reset
|
63
67
|
@scrapefile_loaded = true
|
64
68
|
end
|
data/lib/scrape/cli.rb
CHANGED
File without changes
|
@@ -1,19 +1,12 @@
|
|
1
1
|
class Scrape::DefaultLoader
|
2
|
-
def
|
3
|
-
|
4
|
-
sites = {}
|
5
|
-
|
6
|
-
sandbox = Sandbox.new sites
|
7
|
-
sandbox.instance_eval File.read(path), path
|
8
|
-
|
9
|
-
sites
|
2
|
+
def initialize app
|
3
|
+
@app = app
|
10
4
|
end
|
11
5
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
6
|
+
def load path
|
7
|
+
path = File.expand_path path
|
8
|
+
File.exists? path or raise Scrape::FileNotFound, path
|
9
|
+
dsl = Scrape::DSL.new @app
|
10
|
+
dsl.instance_eval File.read(path), path
|
18
11
|
end
|
19
12
|
end
|
data/lib/scrape/dsl.rb
CHANGED
@@ -1,13 +1,18 @@
|
|
1
|
-
|
1
|
+
class Scrape::DSL
|
2
|
+
def initialize app
|
3
|
+
@application = app
|
4
|
+
end
|
5
|
+
|
2
6
|
def site *urls
|
3
|
-
@
|
4
|
-
|
5
|
-
|
7
|
+
return @sites if urls.empty?
|
8
|
+
urls = urls.flatten
|
9
|
+
options = urls.extract_options!
|
10
|
+
@sites = urls.map{|url| @application.sites[url] || @application.add_site(url, options) }
|
6
11
|
end
|
7
12
|
|
8
13
|
def match matcher, &proc
|
9
|
-
raise ArgumentError
|
10
|
-
matches = @
|
14
|
+
raise ArgumentError, "No sites have been defined" unless defined? @sites
|
15
|
+
matches = @sites.map{|site| site.add_match matcher, &proc }
|
11
16
|
matches.size == 1 ? matches.first : matches
|
12
17
|
end
|
13
18
|
end
|
data/lib/scrape/robots_txt.rb
CHANGED
data/lib/scrape/site.rb
CHANGED
@@ -2,15 +2,14 @@ require 'addressable/uri'
|
|
2
2
|
require 'nokogiri'
|
3
3
|
|
4
4
|
class Scrape::Site
|
5
|
-
attr_reader :url, :matches
|
6
|
-
attr_accessor :ignore_robots_txt
|
5
|
+
attr_reader :url, :matches, :options
|
7
6
|
|
8
7
|
def initialize url, options = {}
|
9
8
|
@url = Addressable::URI.parse url
|
10
9
|
@url.query = nil
|
11
10
|
@url.fragment = nil
|
11
|
+
@options = {:ignore_robots_txt => true}.merge options
|
12
12
|
@matches = []
|
13
|
-
@ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
|
14
13
|
end
|
15
14
|
|
16
15
|
def add_match matcher, &proc
|
@@ -19,9 +18,15 @@ class Scrape::Site
|
|
19
18
|
match
|
20
19
|
end
|
21
20
|
|
21
|
+
def open url
|
22
|
+
headers = Hash.new
|
23
|
+
headers['Set-Cookie'] = options[:cookie].to_s if options.has_key? :cookie
|
24
|
+
Scrape.open url, headers
|
25
|
+
end
|
26
|
+
|
22
27
|
def parse url
|
23
28
|
url = normalize url
|
24
|
-
doc = Nokogiri::HTML
|
29
|
+
doc = Nokogiri::HTML open(url)
|
25
30
|
|
26
31
|
@matches.each{|match| match.invoke doc, url if match =~ url }
|
27
32
|
|
@@ -38,11 +43,7 @@ class Scrape::Site
|
|
38
43
|
end
|
39
44
|
|
40
45
|
def robots_txt
|
41
|
-
@robots_txt
|
42
|
-
end
|
43
|
-
|
44
|
-
def ignore_robots_txt?
|
45
|
-
!!@ignore_robots_txt
|
46
|
+
@robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
|
46
47
|
end
|
47
48
|
|
48
49
|
def to_s
|
@@ -52,6 +53,6 @@ class Scrape::Site
|
|
52
53
|
private
|
53
54
|
|
54
55
|
def disallowed? url
|
55
|
-
!ignore_robots_txt
|
56
|
+
!options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
|
56
57
|
end
|
57
58
|
end
|
data/lib/scrape/version.rb
CHANGED
data/lib/scrape.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
require "rubygems"
|
2
2
|
require "logger"
|
3
3
|
require "open-uri"
|
4
|
-
require "bundler/setup"
|
5
4
|
|
6
|
-
|
5
|
+
$: << File.dirname(__FILE__)
|
7
6
|
|
8
|
-
|
9
|
-
|
7
|
+
require "scrape/version"
|
8
|
+
require "scrape/core_ext/array"
|
9
|
+
require "scrape/core_ext/string"
|
10
10
|
|
11
|
+
module Scrape
|
11
12
|
autoload 'Application', 'scrape/application'
|
12
13
|
autoload 'Site', 'scrape/site'
|
13
14
|
autoload 'Match', 'scrape/match'
|
data/scrape.gemspec
CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
|
|
16
16
|
s.files = `git ls-files`.split("\n")
|
17
17
|
s.test_files = `git ls-files -- {test}/*`.split("\n")
|
18
18
|
s.executables = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
|
19
|
-
s.require_paths = ["lib"]
|
19
|
+
s.require_paths = ["lib", "lib/scrape"]
|
20
20
|
|
21
21
|
s.add_development_dependency "nokogiri", "~> 1.5.5"
|
22
22
|
s.add_development_dependency "addressable", "~> 2.2.8"
|
@@ -53,6 +53,7 @@ class ApplicationTest < Scrape::TestCase
|
|
53
53
|
test "#run should load the specified file" do
|
54
54
|
filepath = File.join(SUPPORT_FILES, 'test1.scrape')
|
55
55
|
test_loader = MiniTest::Mock.new
|
56
|
+
test_loader.expect :class, Scrape::DefaultLoader
|
56
57
|
test_loader.expect :load, nil, [filepath]
|
57
58
|
Scrape::Application.new(filepath, {}, test_loader).run
|
58
59
|
assert test_loader.verify, "loader did not receive file"
|
@@ -70,12 +71,9 @@ class ApplicationTest < Scrape::TestCase
|
|
70
71
|
assert_equal ["http://example.com"], app.queue
|
71
72
|
end
|
72
73
|
|
73
|
-
test "#
|
74
|
-
site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
|
74
|
+
test "#add_site should add the specied string to the collection" do
|
75
75
|
app = Scrape::Application.new(".")
|
76
|
-
app.
|
77
|
-
|
78
|
-
app.ignore_robots_txt = true
|
79
|
-
assert_equal true, site.ignore_robots_txt
|
76
|
+
app.add_site "http://example.com"
|
77
|
+
assert app.sites.member?("http://example.com")
|
80
78
|
end
|
81
79
|
end
|
@@ -3,23 +3,27 @@ require "test_helper"
|
|
3
3
|
class DefaultLoaderTest < Scrape::TestCase
|
4
4
|
SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
|
5
5
|
|
6
|
-
test "#load should
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
test "#load should parse the specified file" do
|
7
|
+
app = Scrape::Application.new "."
|
8
|
+
loader = Scrape::DefaultLoader.new app
|
9
|
+
loader.load File.join(SUPPORT_FILES, "test1.scrape")
|
10
|
+
assert_equal ["http://example.com"], app.sites.keys
|
11
|
+
assert_instance_of Scrape::Site, app.sites.values[0]
|
11
12
|
end
|
12
13
|
|
13
|
-
test "#load should
|
14
|
-
|
15
|
-
|
16
|
-
assert_equal Hash.new, sites
|
17
|
-
end
|
18
|
-
|
19
|
-
test "#load should raise an error when no site is defined" do
|
20
|
-
loader = Scrape::DefaultLoader.new
|
14
|
+
test "#load should raise error when no site is defined" do
|
15
|
+
app = Scrape::Application.new "."
|
16
|
+
loader = Scrape::DefaultLoader.new app
|
21
17
|
assert_raises ArgumentError do
|
22
18
|
loader.load File.join(SUPPORT_FILES, "test3.scrape")
|
23
19
|
end
|
24
20
|
end
|
21
|
+
|
22
|
+
test "#load should raise error when file cannot be found" do
|
23
|
+
app = Scrape::Application.new "."
|
24
|
+
loader = Scrape::DefaultLoader.new app
|
25
|
+
assert_raises Scrape::FileNotFound do
|
26
|
+
loader.load "#{Time.now.to_i}.txt"
|
27
|
+
end
|
28
|
+
end
|
25
29
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require "test_helper"
|
2
|
+
|
3
|
+
class DSLTest < Scrape::TestCase
|
4
|
+
test "#site should add the url to the application" do
|
5
|
+
app = Scrape::Application.new(".")
|
6
|
+
dsl = Scrape::DSL.new app
|
7
|
+
dsl.site "http://example.com"
|
8
|
+
assert app.sites.member?("http://example.com")
|
9
|
+
end
|
10
|
+
|
11
|
+
test "#site should return the currently defined sites" do
|
12
|
+
app = Scrape::Application.new(".")
|
13
|
+
dsl = Scrape::DSL.new app
|
14
|
+
sites = dsl.site "http://example.com"
|
15
|
+
assert_equal "http://example.com", sites[0].to_s
|
16
|
+
assert_equal sites, dsl.site
|
17
|
+
end
|
18
|
+
|
19
|
+
test "#site should pass the options to the site" do
|
20
|
+
app = Scrape::Application.new(".")
|
21
|
+
dsl = Scrape::DSL.new app
|
22
|
+
dsl.site "http://example.com", :test => true
|
23
|
+
assert_equal true, app.sites["http://example.com"].options[:test]
|
24
|
+
end
|
25
|
+
|
26
|
+
test "#match should create a match on the current sites" do
|
27
|
+
app = Scrape::Application.new(".")
|
28
|
+
dsl = Scrape::DSL.new app
|
29
|
+
dsl.site "http://example.com"
|
30
|
+
site = app.sites["http://example.com"]
|
31
|
+
assert_empty site.matches
|
32
|
+
dsl.match("test"){|*args|}
|
33
|
+
refute_empty site.matches
|
34
|
+
end
|
35
|
+
|
36
|
+
test "#match should raise an error when no sites have been defined" do
|
37
|
+
app = Scrape::Application.new(".")
|
38
|
+
dsl = Scrape::DSL.new app
|
39
|
+
assert_raises ArgumentError do
|
40
|
+
dsl.match("test"){|*args|}
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -68,4 +68,12 @@ class RobotsTxtTest < Scrape::TestCase
|
|
68
68
|
assert_equal ["Test"], robots.user_agents
|
69
69
|
assert_equal ["/foo", "/bar"], robots.disallows
|
70
70
|
end
|
71
|
+
|
72
|
+
test ".load should return nil when specified url results in 404" do
|
73
|
+
stub_request(:get, "http://www.example.com/robots.txt").
|
74
|
+
to_return(:status => 404, :body => "")
|
75
|
+
|
76
|
+
robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
|
77
|
+
assert_nil robots
|
78
|
+
end
|
71
79
|
end
|
data/test/unit/site_test.rb
CHANGED
@@ -7,6 +7,15 @@ class SiteTest < Scrape::TestCase
|
|
7
7
|
assert_instance_of Scrape::Match, match
|
8
8
|
end
|
9
9
|
|
10
|
+
test "#open should include cookie header when cookie option is set" do
|
11
|
+
stub_request(:get, "http://www.example.com/").
|
12
|
+
with(:headers => {'Set-Cookie'=>'omnom'}).
|
13
|
+
to_return(:status => 200, :body => "")
|
14
|
+
|
15
|
+
site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
|
16
|
+
site.open "http://www.example.com"
|
17
|
+
end
|
18
|
+
|
10
19
|
test "#parse should return absolute urls that match the site's url" do
|
11
20
|
stub_request(:get, "http://www.example.com/test").
|
12
21
|
with(:headers => {"User-Agent" => Scrape.user_agent}).
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -63,13 +63,14 @@ files:
|
|
63
63
|
- lib/scrape.rb
|
64
64
|
- lib/scrape/application.rb
|
65
65
|
- lib/scrape/cli.rb
|
66
|
+
- lib/scrape/core_ext/array.rb
|
67
|
+
- lib/scrape/core_ext/string.rb
|
66
68
|
- lib/scrape/default_loader.rb
|
67
69
|
- lib/scrape/dsl.rb
|
68
70
|
- lib/scrape/match.rb
|
69
71
|
- lib/scrape/robots_txt.rb
|
70
72
|
- lib/scrape/robots_txt_rules.rb
|
71
73
|
- lib/scrape/site.rb
|
72
|
-
- lib/scrape/string_ext.rb
|
73
74
|
- lib/scrape/version.rb
|
74
75
|
- scrape.gemspec
|
75
76
|
- test/support/test1.scrape
|
@@ -79,6 +80,7 @@ files:
|
|
79
80
|
- test/unit/application_test.rb
|
80
81
|
- test/unit/cli_test.rb
|
81
82
|
- test/unit/default_loader_test.rb
|
83
|
+
- test/unit/dsl_test.rb
|
82
84
|
- test/unit/match_test.rb
|
83
85
|
- test/unit/robots_txt_rules_test.rb
|
84
86
|
- test/unit/robots_txt_test.rb
|
@@ -90,6 +92,7 @@ post_install_message:
|
|
90
92
|
rdoc_options: []
|
91
93
|
require_paths:
|
92
94
|
- lib
|
95
|
+
- lib/scrape
|
93
96
|
required_ruby_version: !ruby/object:Gem::Requirement
|
94
97
|
none: false
|
95
98
|
requirements:
|