scrape 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -1
- data/Gemfile.lock +1 -1
- data/bin/scrape +1 -3
- data/lib/scrape/application.rb +25 -21
- data/lib/scrape/cli.rb +1 -1
- data/lib/scrape/core_ext/array.rb +5 -0
- data/lib/scrape/{string_ext.rb → core_ext/string.rb} +0 -0
- data/lib/scrape/default_loader.rb +7 -14
- data/lib/scrape/dsl.rb +11 -6
- data/lib/scrape/robots_txt.rb +2 -0
- data/lib/scrape/site.rb +11 -10
- data/lib/scrape/version.rb +1 -1
- data/lib/scrape.rb +5 -4
- data/scrape.gemspec +1 -1
- data/test/unit/application_test.rb +4 -6
- data/test/unit/default_loader_test.rb +17 -13
- data/test/unit/dsl_test.rb +43 -0
- data/test/unit/robots_txt_test.rb +8 -0
- data/test/unit/site_test.rb +9 -0
- metadata +6 -3
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/bin/scrape
CHANGED
data/lib/scrape/application.rb
CHANGED
@@ -1,30 +1,33 @@
 class Scrape::Application
-  attr_reader :scrapefile, :loader, :sites, :history, :…
+  attr_reader :scrapefile, :loader, :sites, :history, :options
 
-  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader…
+  def initialize scrapefile, options = {}, loader = Scrape::DefaultLoader
     @scrapefile = File.expand_path scrapefile
-    @…
+    @options = options
+    @loader = loader.class == Class ? loader.new(self) : loader
     @sites = {}
     @queue = []
     @history = []
-    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ false }
   end
 
   def run
     load_scrapefile
 
     while url = @queue.shift
-      Scrape.logger.info "Loading: #{url}..."
       @history << url
-
-      if …
-
-
+      begin
+        if site = self[url]
+          if urls = site.parse(url)
+            enqueue *urls
+            Scrape.logger.info "Parsed #{url}, found #{urls.length} urls."
+          else
+            Scrape.logger.info "Parsed #{url}."
+          end
         else
-          Scrape.logger.info "…
+          Scrape.logger.info "No rules defined for #{url}"
         end
-
-      Scrape.logger.info "…
+      rescue OpenURI::HTTPError => e
+        Scrape.logger.info "Error loading #{url}: #{e.message}"
       end
     end
   end
@@ -44,21 +47,22 @@ class Scrape::Application
     end
   end
 
-  def ignore_robots_txt= bool
-    sites.each{|_, site| site.ignore_robots_txt = bool }
-    @ignore_robots_txt = bool
-  end
-
   def [] url
     @sites.values.detect{|site| site.accept? url }
   end
 
+  def add_site site, options = {}
+    case site
+    when String
+      site = Scrape::Site.new site, options
+      @sites.update site.to_s => site
+      site
+    end
+  end
+
   def load_scrapefile
     return if @scrapefile_loaded
-
-    result = loader.load scrapefile
-    @sites.update result if result.is_a? Hash
-    self.ignore_robots_txt = ignore_robots_txt
+    loader.load(scrapefile)
     reset
     @scrapefile_loaded = true
   end
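Taken together, these changes replace the global ignore_robots_txt switch with per-site options and move site registration into #add_site. A minimal sketch of driving the new Application API (the Scrapefile path and url are hypothetical):

  require "scrape"

  app = Scrape::Application.new "Scrapefile"   # options default to {}
  app.add_site "http://example.com"            # wraps the string in a Scrape::Site and registers it
  app.enqueue "http://example.com"             # #run shifts urls off the queue one by one
  app.run                                      # parses each url, logging results and rescuing OpenURI::HTTPError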
data/lib/scrape/cli.rb
CHANGED
data/lib/scrape/{string_ext.rb → core_ext/string.rb}
RENAMED
File without changes
data/lib/scrape/default_loader.rb
CHANGED
@@ -1,19 +1,12 @@
 class Scrape::DefaultLoader
-  def …
-
-    sites = {}
-
-    sandbox = Sandbox.new sites
-    sandbox.instance_eval File.read(path), path
-
-    sites
+  def initialize app
+    @app = app
   end
 
-
-
-
-
-
-end
+  def load path
+    path = File.expand_path path
+    File.exists? path or raise Scrape::FileNotFound, path
+    dsl = Scrape::DSL.new @app
+    dsl.instance_eval File.read(path), path
   end
 end
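The loader no longer builds a sandboxed sites hash; it simply evaluates the Scrapefile against a Scrape::DSL bound to the application. A sketch of standalone use (the file name is illustrative):

  app = Scrape::Application.new "Scrapefile"
  loader = Scrape::DefaultLoader.new app   # the loader holds a back-reference to the app
  loader.load "Scrapefile"                 # raises Scrape::FileNotFound if the path is missing;
                                           # sites end up in app.sites rather than a returned hash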
data/lib/scrape/dsl.rb
CHANGED
@@ -1,13 +1,18 @@
-
+class Scrape::DSL
+  def initialize app
+    @application = app
+  end
+
   def site *urls
-    @…
-
-
+    return @sites if urls.empty?
+    urls = urls.flatten
+    options = urls.extract_options!
+    @sites = urls.map{|url| @application.sites[url] || @application.add_site(url, options) }
   end
 
   def match matcher, &proc
-    raise ArgumentError…
-    matches = @…
+    raise ArgumentError, "No sites have been defined" unless defined? @sites
+    matches = @sites.map{|site| site.add_match matcher, &proc }
     matches.size == 1 ? matches.first : matches
   end
 end
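A Scrapefile is therefore plain Ruby, instance_eval'd against this class. An illustrative Scrapefile (the url and matcher are made up, and the block arity is an assumption based on Match#invoke doc, url in site.rb):

  site "http://example.com", :ignore_robots_txt => false

  match "/articles" do |doc, url|
    # doc is assumed to be the Nokogiri document parsed from the matched url
    Scrape.logger.info doc.css("h1").map(&:text).inspect
  end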
data/lib/scrape/robots_txt.rb
CHANGED
data/lib/scrape/site.rb
CHANGED
@@ -2,15 +2,14 @@ require 'addressable/uri'
 require 'nokogiri'
 
 class Scrape::Site
-  attr_reader :url, :matches
-  attr_accessor :ignore_robots_txt
+  attr_reader :url, :matches, :options
 
   def initialize url, options = {}
     @url = Addressable::URI.parse url
     @url.query = nil
     @url.fragment = nil
+    @options = {:ignore_robots_txt => true}.merge options
     @matches = []
-    @ignore_robots_txt = options.fetch(:ignore_robots_txt){ true }
   end
 
   def add_match matcher, &proc
@@ -19,9 +18,15 @@ class Scrape::Site
     match
   end
 
+  def open url
+    headers = Hash.new
+    headers['Set-Cookie'] = options[:cookie].to_s if options.has_key? :cookie
+    Scrape.open url, headers
+  end
+
   def parse url
     url = normalize url
-    doc = Nokogiri::HTML…
+    doc = Nokogiri::HTML open(url)
 
     @matches.each{|match| match.invoke doc, url if match =~ url }
 
@@ -38,11 +43,7 @@ class Scrape::Site
   end
 
   def robots_txt
-    @robots_txt…
-  end
-
-  def ignore_robots_txt?
-    !!@ignore_robots_txt
+    @robots_txt = Scrape::RobotsTxt.load url unless defined? @robots_txt
   end
 
   def to_s
@@ -52,6 +53,6 @@ class Scrape::Site
   private
 
   def disallowed? url
-    !ignore_robots_txt…
+    !options[:ignore_robots_txt] && robots_txt =~ Addressable::URI.parse(url).path
   end
 end
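Net effect on Site: options now ride along with the site (:ignore_robots_txt defaulting to true), #open centralizes fetching so a :cookie option is sent with every request, and robots.txt is loaded lazily. Note that RobotsTxt.load returns nil on a 404 (see the new test below), and nil =~ path is falsy, so a missing robots.txt disallows nothing. A sketch under those assumptions (urls illustrative):

  site = Scrape::Site.new "http://www.example.com",
                          :cookie => "omnom", :ignore_robots_txt => false
  site.open  "http://www.example.com/"   # request carries a 'Set-Cookie: omnom' header
  site.parse "http://www.example.com/"   # fetches via #open, invokes matching rules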
data/lib/scrape/version.rb
CHANGED
data/lib/scrape.rb
CHANGED
@@ -1,13 +1,14 @@
 require "rubygems"
 require "logger"
 require "open-uri"
-require "bundler/setup"
 
-
+$: << File.dirname(__FILE__)
 
-
-
+require "scrape/version"
+require "scrape/core_ext/array"
+require "scrape/core_ext/string"
 
+module Scrape
   autoload 'Application', 'scrape/application'
   autoload 'Site', 'scrape/site'
   autoload 'Match', 'scrape/match'
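core_ext/array.rb itself is not shown in this diff, but DSL#site calls urls.extract_options!, so its five added lines are presumably the conventional Rails-style helper, roughly:

  class Array
    # Sketch (assumed, not from the diff): pop a trailing options hash if present.
    def extract_options!
      last.is_a?(::Hash) ? pop : {}
    end
  end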
data/scrape.gemspec
CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
   s.files = `git ls-files`.split("\n")
   s.test_files = `git ls-files -- {test}/*`.split("\n")
   s.executables = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
-  s.require_paths = ["lib"]
+  s.require_paths = ["lib", "lib/scrape"]
 
   s.add_development_dependency "nokogiri", "~> 1.5.5"
   s.add_development_dependency "addressable", "~> 2.2.8"
data/test/unit/application_test.rb
CHANGED
@@ -53,6 +53,7 @@ class ApplicationTest < Scrape::TestCase
   test "#run should load the specified file" do
     filepath = File.join(SUPPORT_FILES, 'test1.scrape')
     test_loader = MiniTest::Mock.new
+    test_loader.expect :class, Scrape::DefaultLoader
     test_loader.expect :load, nil, [filepath]
     Scrape::Application.new(filepath, {}, test_loader).run
     assert test_loader.verify, "loader did not receive file"
@@ -70,12 +71,9 @@ class ApplicationTest < Scrape::TestCase
     assert_equal ["http://example.com"], app.queue
   end
 
-  test "#…" do
-    site = Scrape::Site.new "http://www.example.com", :ignore_robots_txt => false
+  test "#add_site should add the specied string to the collection" do
     app = Scrape::Application.new(".")
-    app.…
-
-    app.ignore_robots_txt = true
-    assert_equal true, site.ignore_robots_txt
+    app.add_site "http://example.com"
+    assert app.sites.member?("http://example.com")
   end
 end
data/test/unit/default_loader_test.rb
CHANGED
@@ -3,23 +3,27 @@ require "test_helper"
 class DefaultLoaderTest < Scrape::TestCase
   SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
 
-  test "#load should …" do
-
-
-
-
+  test "#load should parse the specified file" do
+    app = Scrape::Application.new "."
+    loader = Scrape::DefaultLoader.new app
+    loader.load File.join(SUPPORT_FILES, "test1.scrape")
+    assert_equal ["http://example.com"], app.sites.keys
+    assert_instance_of Scrape::Site, app.sites.values[0]
   end
 
-  test "#load should …" do
-
-
-    assert_equal Hash.new, sites
-  end
-
-  test "#load should raise an error when no site is defined" do
-    loader = Scrape::DefaultLoader.new
+  test "#load should raise error when no site is defined" do
+    app = Scrape::Application.new "."
+    loader = Scrape::DefaultLoader.new app
     assert_raises ArgumentError do
       loader.load File.join(SUPPORT_FILES, "test3.scrape")
     end
   end
+
+  test "#load should raise error when file cannot be found" do
+    app = Scrape::Application.new "."
+    loader = Scrape::DefaultLoader.new app
+    assert_raises Scrape::FileNotFound do
+      loader.load "#{Time.now.to_i}.txt"
+    end
+  end
 end
data/test/unit/dsl_test.rb
ADDED
@@ -0,0 +1,43 @@
+require "test_helper"
+
+class DSLTest < Scrape::TestCase
+  test "#site should add the url to the application" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.site "http://example.com"
+    assert app.sites.member?("http://example.com")
+  end
+
+  test "#site should return the currently defined sites" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    sites = dsl.site "http://example.com"
+    assert_equal "http://example.com", sites[0].to_s
+    assert_equal sites, dsl.site
+  end
+
+  test "#site should pass the options to the site" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.site "http://example.com", :test => true
+    assert_equal true, app.sites["http://example.com"].options[:test]
+  end
+
+  test "#match should create a match on the current sites" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    dsl.site "http://example.com"
+    site = app.sites["http://example.com"]
+    assert_empty site.matches
+    dsl.match("test"){|*args|}
+    refute_empty site.matches
+  end
+
+  test "#match should raise an error when no sites have been defined" do
+    app = Scrape::Application.new(".")
+    dsl = Scrape::DSL.new app
+    assert_raises ArgumentError do
+      dsl.match("test"){|*args|}
+    end
+  end
+end
data/test/unit/robots_txt_test.rb
CHANGED
@@ -68,4 +68,12 @@ class RobotsTxtTest < Scrape::TestCase
     assert_equal ["Test"], robots.user_agents
     assert_equal ["/foo", "/bar"], robots.disallows
   end
+
+  test ".load should return nil when specified url results in 404" do
+    stub_request(:get, "http://www.example.com/robots.txt").
+      to_return(:status => 404, :body => "")
+
+    robots = Scrape::RobotsTxt.load "http://www.example.com/foo"
+    assert_nil robots
+  end
 end
data/test/unit/site_test.rb
CHANGED
@@ -7,6 +7,15 @@ class SiteTest < Scrape::TestCase
     assert_instance_of Scrape::Match, match
   end
 
+  test "#open should include cookie header when cookie option is set" do
+    stub_request(:get, "http://www.example.com/").
+      with(:headers => {'Set-Cookie'=>'omnom'}).
+      to_return(:status => 200, :body => "")
+
+    site = Scrape::Site.new "http://www.example.com", :cookie => "omnom"
+    site.open "http://www.example.com"
+  end
+
   test "#parse should return absolute urls that match the site's url" do
     stub_request(:get, "http://www.example.com/test").
       with(:headers => {"User-Agent" => Scrape.user_agent}).
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrape
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.4
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-…
+date: 2012-07-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -63,13 +63,14 @@ files:
 - lib/scrape.rb
 - lib/scrape/application.rb
 - lib/scrape/cli.rb
+- lib/scrape/core_ext/array.rb
+- lib/scrape/core_ext/string.rb
 - lib/scrape/default_loader.rb
 - lib/scrape/dsl.rb
 - lib/scrape/match.rb
 - lib/scrape/robots_txt.rb
 - lib/scrape/robots_txt_rules.rb
 - lib/scrape/site.rb
-- lib/scrape/string_ext.rb
 - lib/scrape/version.rb
 - scrape.gemspec
 - test/support/test1.scrape
@@ -79,6 +80,7 @@ files:
 - test/unit/application_test.rb
 - test/unit/cli_test.rb
 - test/unit/default_loader_test.rb
+- test/unit/dsl_test.rb
 - test/unit/match_test.rb
 - test/unit/robots_txt_rules_test.rb
 - test/unit/robots_txt_test.rb
@@ -90,6 +92,7 @@ post_install_message:
 rdoc_options: []
 require_paths:
 - lib
+- lib/scrape
 required_ruby_version: !ruby/object:Gem::Requirement
   none: false
 requirements: