scrape 0.1
- data/.gitignore +2 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +22 -0
- data/LICENSE +22 -0
- data/README.md +44 -0
- data/Rakefile +10 -0
- data/bin/scrape +8 -0
- data/examples/github.scrape +7 -0
- data/examples/google.scrape +5 -0
- data/lib/scrape/application.rb +57 -0
- data/lib/scrape/cli.rb +39 -0
- data/lib/scrape/default_loader.rb +19 -0
- data/lib/scrape/dsl.rb +13 -0
- data/lib/scrape/match.rb +23 -0
- data/lib/scrape/site.rb +31 -0
- data/lib/scrape/uri.rb +58 -0
- data/lib/scrape/version.rb +3 -0
- data/lib/scrape.rb +36 -0
- data/scrape.gemspec +22 -0
- data/test/support/test1.scrape +5 -0
- data/test/support/test2.scrape +1 -0
- data/test/support/test3.scrape +3 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/application_test.rb +72 -0
- data/test/unit/default_loader_test.rb +25 -0
- data/test/unit/match_test.rb +54 -0
- data/test/unit/site_test.rb +75 -0
- data/test/unit/uri_test.rb +52 -0
- metadata +90 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
+PATH
+  remote: .
+  specs:
+    scrape (0.1)
+
+GEM
+  remote: http://rubygems.org/
+  specs:
+    addressable (2.2.8)
+    crack (0.3.1)
+    nokogiri (1.5.5)
+    webmock (1.8.7)
+      addressable (>= 2.2.7)
+      crack (>= 0.1.7)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  nokogiri (~> 1.5.5)
+  scrape!
+  webmock (~> 1.8.7)
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2012 Marty Zalega
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,44 @@
+# Scrape
+
+A really simple web scraper.
+
+```ruby
+site "https://github.com/explore" # The site to scrape. Will be used as the base address.
+
+match /evilmarty/ do |doc| # A regexp/string/proc to match against the current url.
+
+  doc.search('a[href]') # The nokogiri document of the contents of the current url.
+
+end
+
+site "http://www.tumblr.com" # Multiple sites can be defined.
+
+match "/tagged" do |doc|
+  # Do whatever we want with the document.
+end
+```
+
+## Usage
+
+After creating a `Scrapefile` simply run:
+
+```
+scrape -f [FILE]
+```
+
+If no scrapefile is specified then `Scrapefile` is used by default.
+
+## Installation
+
+Simply install the gem:
+
+```
+gem install scrape
+```
+
+## TODO
+
+* Fix bugs
+* Add support for robots.txt
+* Depth limiting
+* Better docs
data/Rakefile
ADDED
data/bin/scrape
ADDED
data/lib/scrape/application.rb
ADDED
@@ -0,0 +1,57 @@
+class Scrape::Application
+  attr_reader :scrapefile, :loader, :sites, :history
+
+  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+    @scrapefile = File.expand_path scrapefile
+    @loader = loader
+    @sites = {}
+    @queue = []
+    @history = []
+  end
+
+  def run
+    load_scrapefile
+
+    while url = @queue.shift
+      Scrape.logger.info "Loading: #{url}..."
+      @history << url
+      if site = self[url]
+        if urls = site.parse(url)
+          enqueue *urls
+          Scrape.logger.info "Found #{urls.length} urls."
+        else
+          Scrape.logger.info "Done."
+        end
+      else
+        Scrape.logger.info "Not defined."
+      end
+    end
+  end
+
+  def reset
+    @history = []
+    @queue = sites.values.map{|site| site.url.to_s }
+  end
+
+  def queue
+    @queue.dup
+  end
+
+  def enqueue *urls
+    urls.flatten.each do |url|
+      @queue << url unless @history.include?(url) || @queue.include?(url)
+    end
+  end
+
+  def [] url
+    @sites.values.detect{|site| site.url < url }
+  end
+
+  def load_scrapefile
+    return if @scrapefile_loaded
+    result = loader.load(scrapefile)
+    @sites.update result if result.is_a? Hash
+    reset
+    @scrapefile_loaded = true
+  end
+end
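The crawl in `Scrape::Application#run` is a plain queue drain: `#reset` seeds the queue with each site's root url, every dequeued url is recorded in `history`, and newly discovered urls are enqueued unless already seen. A minimal sketch of driving it directly, assuming a `Scrapefile` exists in the working directory:

```ruby
# Sketch only -- uses just the public API shown above; assumes ./Scrapefile exists.
require "scrape"

app = Scrape::Application.new("Scrapefile")
app.run              # loads the scrapefile, then drains the url queue

puts app.history     # every url visited during the run
app.reset            # clears history, re-seeds the queue from the sites
puts app.queue       # => each defined site's root url again
```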
data/lib/scrape/cli.rb
ADDED
@@ -0,0 +1,39 @@
+require "optparse"
+
+class Scrape::CLI
+  attr_reader :command, :pwd
+
+  def initialize command, pwd = Dir.pwd
+    @command, @pwd = command, pwd
+  end
+
+  def run argv
+    options = {:file => File.join(pwd, 'Scrapefile')}
+    opts = OptionParser.new do |opts|
+      opts.banner = "Scrape #{Scrape::VERSION} - Usage: #{command} [options]"
+      opts.separator ""
+      opts.separator "Specific options:"
+
+      opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
+        options[:file] = File.expand_path file
+      end
+      opts.on_tail "-h", "--help", "Show this message" do
+        puts opts
+        exit
+      end
+      opts.on_tail "-v", "--version", "Show version" do
+        puts Scrape::VERSION
+        exit
+      end
+    end
+    opts.parse argv
+
+    if File.exists? options[:file]
+      Scrape::Application.new(options[:file]).run
+    else
+      puts "#{command} aborted!"
+      puts "No Scrapefile found"
+      exit -1
+    end
+  end
+end
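The 8-line `bin/scrape` executable itself is not shown in this diff. A plausible wrapper (a guess, not the gem's actual file) would hand the command name and `ARGV` to `Scrape::CLI`:

```ruby
#!/usr/bin/env ruby
# Hypothetical sketch -- the real bin/scrape is not shown in this diff.
require "scrape"
require "scrape/cli"   # CLI is not in the autoload list in lib/scrape.rb

Scrape::CLI.new(File.basename($0)).run(ARGV)
```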
data/lib/scrape/default_loader.rb
ADDED
@@ -0,0 +1,19 @@
+class Scrape::DefaultLoader
+  def load path
+    path = File.expand_path path
+    sites = {}
+
+    sandbox = Sandbox.new sites
+    sandbox.instance_eval File.read(path), path
+
+    sites
+  end
+
+  class Sandbox
+    include Scrape::DSL
+
+    def initialize sites
+      @sites = sites
+    end
+  end
+end
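`DefaultLoader` evals the scrapefile inside a `Sandbox` that mixes in `Scrape::DSL`, so the DSL's `match` calls populate the `sites` hash that `#load` returns. A small sketch, using one of the bundled example scrapefiles:

```ruby
# Sketch: loading a scrapefile by hand and inspecting the result.
require "scrape"

loader = Scrape::DefaultLoader.new
sites  = loader.load "examples/google.scrape"   # => {url => Scrape::Site}
sites.each_key { |url| puts url }
```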
data/lib/scrape/dsl.rb
ADDED
@@ -0,0 +1,13 @@
+module Scrape::DSL
+  def site *urls
+    @_sites ||= {}
+    @sites ||= {}
+    @current_sites = urls.flatten.map{|url| @_sites[url] ||= Scrape::Site.new(url) }
+  end
+
+  def match matcher, &proc
+    raise ArgumentError.new("site must be set") unless defined? @current_sites
+    matches = @current_sites.map{|site| @sites[site.url.to_s] = site; site.add_match matcher, &proc }
+    matches.size == 1 ? matches.first : matches
+  end
+end
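Note the asymmetry in the DSL: `site` only stages urls in `@current_sites`; a site is written into the result hash when `match` runs against it. That is why a scrapefile containing a bare `site` call loads zero sites (as the `test2.scrape` fixture below exercises). A sketch of a Scrapefile built on this DSL:

```ruby
# Scrapefile sketch -- the url and selectors are illustrative only.
site "http://example.com"

match /articles/ do |doc|
  # doc is the Nokogiri document of the matched page
  puts doc.css("h1").map(&:text)
end
```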
data/lib/scrape/match.rb
ADDED
@@ -0,0 +1,23 @@
+class Scrape::Match
+  attr_reader :matcher
+
+  def initialize matcher, &proc
+    @matcher, @proc = matcher, proc
+    raise ArgumentError.new("Match block expects one argument") if proc.arity != 1
+  end
+
+  def invoke doc
+    @proc.call doc
+  end
+
+  def =~ url
+    case @matcher
+    when String
+      url.to_s.include? @matcher
+    when Regexp
+      url.to_s =~ @matcher
+    when Proc
+      @matcher.call url
+    end
+  end
+end
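`Scrape::Match#=~` dispatches on the matcher's class, so the same API accepts a substring, a regexp, or a predicate proc. A sketch of all three (the urls are made up):

```ruby
# Sketch: the three matcher types #=~ understands. Each block takes one
# argument because #initialize rejects any other arity.
by_string = Scrape::Match.new("/tagged") {|doc| }
by_regexp = Scrape::Match.new(/\/20\d\d\//) {|doc| }
by_proc   = Scrape::Match.new(->(url) { url.to_s.end_with?(".html") }) {|doc| }

by_string =~ "http://example.com/tagged/cats"   # => true  (substring)
by_regexp =~ "http://example.com/2012/07/"      # => 18    (match offset, truthy)
by_proc   =~ "http://example.com/index.html"    # => true  (proc result)
```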
data/lib/scrape/site.rb
ADDED
@@ -0,0 +1,31 @@
+require 'nokogiri'
+
+class Scrape::Site
+  attr_reader :url, :matches
+
+  def initialize url
+    @url = Scrape::URI.new url
+    @url.query = nil
+    @url.fragment = nil
+    @matches = []
+  end
+
+  def add_match matcher, &proc
+    match = Scrape::Match.new(matcher, &proc)
+    @matches << match
+    match
+  end
+
+  def parse url
+    url = self.url + url
+    doc = Nokogiri::HTML url.open
+
+    @matches.each{|match| match.invoke doc if match =~ url }
+
+    urls = doc.css("a[href]").map do |node|
+      href = self.url + node['href']
+      self.url < href ? href : nil
+    end
+    urls.compact
+  end
+end
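`Site#initialize` normalizes the base url by dropping its query and fragment, and `#parse` keeps only the discovered links that `Scrape::URI#<` judges to live under that base. A sketch against a hypothetical page:

```ruby
# Sketch only -- the page is illustrative; #parse performs a real HTTP
# fetch via Scrape::URI#open.
site = Scrape::Site.new "http://example.com/blog?utm=x#top"
site.url.to_s                    # => "http://example.com/blog"

site.add_match(/blog/) {|doc| puts doc.title }
urls = site.parse("/blog")       # fetch, fire matching blocks, then return
urls                             # the same-site urls found in <a href> tags
```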
data/lib/scrape/uri.rb
ADDED
@@ -0,0 +1,58 @@
+require 'uri'
+require 'open-uri'
+
+class Scrape::URI
+  def initialize uri = nil
+    @uri = case uri
+      when URI then uri.clone
+      when NilClass then URI.new
+      else URI.parse uri.to_s
+    end
+  end
+
+  %w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
+    class_eval <<-EOT, __FILE__, __LINE__ + 1
+      def #{method_name}
+        @uri.#{method_name}
+      end
+    EOT
+  end
+
+  %w[fragment host hostname password path port query scheme user].each do |method_name|
+    class_eval <<-EOT, __FILE__, __LINE__ + 1
+      def #{method_name}= value
+        @uri.#{method_name} = value
+      end
+    EOT
+  end
+
+  def + url
+    return clone if self == url
+    relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
+    uri = self.class.new @uri.merge(url)
+    uri.path = "#{@uri.path}#{uri.path}" if relative
+    uri
+  end
+
+  def < url
+    url[0, length] == to_s
+  end
+
+  def [] *args
+    to_s[*args]
+  end
+
+  def == url
+    to_s == url.to_s
+  end
+
+  def length
+    to_s.length
+  end
+  alias_method :size, :length
+
+  def open headers = {}, &block
+    headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
+    super(to_s, headers, &block).read
+  end
+end
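Two pieces of this class carry the crawler's semantics: `#+` merges absolute and root-relative urls but appends bare relative paths onto the current path, and `#<` is a string-prefix test ("does this url live under me?"). A sketch of both:

```ruby
# Sketch: URI arithmetic as defined above.
base = Scrape::URI.new "http://example.com/docs"

(base + "/about").to_s                  # => "http://example.com/about"
(base + "guide").to_s                   # => "http://example.com/docs/guide"
(base + "http://example.org/x").to_s    # => "http://example.org/x"

base < "http://example.com/docs/guide"  # => true
base < "http://example.org/"            # => false
```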
data/lib/scrape.rb
ADDED
@@ -0,0 +1,36 @@
+require "rubygems"
+require "logger"
+require "bundler/setup"
+
+module Scrape
+  require 'scrape/version'
+
+  autoload 'Application', 'scrape/application'
+  autoload 'Site', 'scrape/site'
+  autoload 'Match', 'scrape/match'
+  autoload 'DefaultLoader', 'scrape/default_loader'
+  autoload 'DSL', 'scrape/dsl'
+  autoload 'URI', 'scrape/uri'
+
+  class ScrapeFileNotFound < Exception; end
+
+  class << self
+    attr_writer :user_agent
+
+    def user_agent
+      @user_agent || "Scrape/#{Scrape::VERSION}"
+    end
+
+    def logger
+      @logger ||= Logger.new STDOUT
+    end
+
+    def logger= log
+      @logger = log
+    end
+
+    def load_scrapefile path
+      Application.new path
+    end
+  end
+end
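The module exposes two process-wide knobs: the `User-Agent` string (used by `Scrape::URI#open`) and the logger (used by `Application#run`). A short sketch:

```ruby
# Sketch: module-level configuration hooks.
require "scrape"
require "logger"

Scrape.user_agent                          # => "Scrape/0.1" by default
Scrape.user_agent = "MyBot/1.0"            # sent by Scrape::URI#open

Scrape.logger = Logger.new("scrape.log")   # replaces the default STDOUT logger
Scrape.logger.info "starting crawl"
```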
data/scrape.gemspec
ADDED
@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path("../lib/scrape/version", __FILE__)
+
+Gem::Specification.new do |s|
+  s.name        = "scrape"
+  s.version     = Scrape::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Marty Zalega"]
+  s.email       = ["evilmarty@gmail.com"]
+  s.homepage    = "http://github.com/evilmarty/scrape"
+  s.summary     = %q{A really simple web scraper}
+  s.description = %q{An easy to use utility to scrape websites using a DSL similar to rake.}
+
+  s.rubyforge_project = "scrape"
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.add_development_dependency "nokogiri", "~> 1.5.5"
+end
data/test/support/test2.scrape
ADDED
@@ -0,0 +1 @@
+site "http://example.com"
data/test/test_helper.rb
ADDED
@@ -0,0 +1,19 @@
+$: << File.expand_path('../../lib', __FILE__)
+
+require "minitest/autorun"
+require "webmock/minitest"
+
+require "bundler/setup"
+Bundler.setup(:default, :test)
+
+require "scrape"
+
+class Scrape::TestCase < MiniTest::Unit::TestCase
+  class << self
+    def test name, &block
+      method_name = name.gsub /[^a-z0-9_]+/i, '_'
+      define_method "test_#{method_name}", &block
+    end
+    private :test
+  end
+end
data/test/unit/application_test.rb
ADDED
@@ -0,0 +1,72 @@
+require "test_helper"
+
+class ApplicationTest < Scrape::TestCase
+  SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+  test "#load_scrapefile should parse the specified file" do
+    filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+    app = Scrape::Application.new(filepath)
+    assert app.load_scrapefile, "scrape file failed to load"
+    assert_equal ["http://example.com"], app.sites.keys
+  end
+
+  test "#load_scrapefile should return nil when already loaded" do
+    filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+    app = Scrape::Application.new(filepath)
+    assert app.load_scrapefile, "scrape file failed to load"
+    refute app.load_scrapefile, "scrape file should not have loaded again"
+  end
+
+  test "#[] should return the site that matches the given url" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    assert_equal site1, app["http://example.com"]
+  end
+
+  test "#[] should return the site that is relative to the given url" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    assert_equal site1, app["http://example.com/test"]
+  end
+
+  test "#[] should return nil when no site matches the given url" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    assert_nil app["http://example.net"]
+  end
+
+  test "#reset should enqueue the sites that have been defined" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    app.reset
+    assert_equal ["http://example.com", "http://example.org"], app.queue
+  end
+
+  test "#run should load the specified file" do
+    filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+    test_loader = MiniTest::Mock.new
+    test_loader.expect :load, nil, [filepath]
+    Scrape::Application.new(filepath, test_loader).run
+    assert test_loader.verify, "loader did not receive file"
+  end
+
+  test "#enqueue should add the given url to the queue" do
+    app = Scrape::Application.new(".")
+    app.enqueue "http://example.com"
+    assert_equal ["http://example.com"], app.queue
+  end
+
+  test "#enqueue should not add the given url to the queue when it is already added" do
+    app = Scrape::Application.new(".")
+    3.times{ app.enqueue "http://example.com" }
+    assert_equal ["http://example.com"], app.queue
+  end
+end
data/test/unit/default_loader_test.rb
ADDED
@@ -0,0 +1,25 @@
+require "test_helper"
+
+class DefaultLoaderTest < Scrape::TestCase
+  SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+  test "#load should return sites parsed from the specified file" do
+    loader = Scrape::DefaultLoader.new
+    sites = loader.load File.join(SUPPORT_FILES, "test1.scrape")
+    assert_equal ["http://example.com"], sites.keys
+    assert_instance_of Scrape::Site, sites.values[0]
+  end
+
+  test "#load should return an empty hash when no matches have been defined" do
+    loader = Scrape::DefaultLoader.new
+    sites = loader.load File.join(SUPPORT_FILES, "test2.scrape")
+    assert_equal Hash.new, sites
+  end
+
+  test "#load should raise an error when no site is defined" do
+    loader = Scrape::DefaultLoader.new
+    assert_raises ArgumentError do
+      loader.load File.join(SUPPORT_FILES, "test3.scrape")
+    end
+  end
+end
data/test/unit/match_test.rb
ADDED
@@ -0,0 +1,54 @@
+require "test_helper"
+
+class MatchTest < Scrape::TestCase
+  test "#initialize should raise error when proc's arity isn't one" do
+    assert_raises ArgumentError do
+      Scrape::Match.new("test"){ "no arguments" }
+    end
+  end
+
+  test "#invoke should call the proc" do
+    ok = false
+    match = Scrape::Match.new("test"){|doc| ok = true }
+    match.invoke nil
+    assert ok, "Proc was not called"
+  end
+
+  test "#invoke should pass the document to the proc" do
+    doc = "yay"
+    ok = false
+    match = Scrape::Match.new("test"){|d| ok = (doc == d) }
+    match.invoke doc
+    assert ok, "Document was not passed into the proc"
+  end
+
+  test "#=~ should return true when the url contains the string" do
+    match = Scrape::Match.new("bar"){|doc|}
+    assert match =~ "foobar", "Expected true"
+  end
+
+  test "#=~ should return false when the url doesn't contain the string" do
+    match = Scrape::Match.new("bar"){|doc|}
+    refute match =~ "ponies", "Expected false"
+  end
+
+  test "#=~ should return true when the url matches the regexp" do
+    match = Scrape::Match.new(/bar/){|doc|}
+    assert match =~ "foobar", "Expected true"
+  end
+
+  test "#=~ should return false when the url doesn't match the regexp" do
+    match = Scrape::Match.new(/bar/){|doc|}
+    refute match =~ "ponies", "Expected false"
+  end
+
+  test "#=~ should return true when the proc is truthy" do
+    match = Scrape::Match.new(lambda{|url| true }){|doc|}
+    assert match =~ "ponies", "Expected true"
+  end
+
+  test "#=~ should return false when the proc is falsy" do
+    match = Scrape::Match.new(lambda{|url| false }){|doc|}
+    refute match =~ "ponies", "Expected false"
+  end
+end
data/test/unit/site_test.rb
ADDED
@@ -0,0 +1,75 @@
+require "test_helper"
+
+class SiteTest < Scrape::TestCase
+  test "#add_match should create a Match object and add it to the collection" do
+    site = Scrape::Site.new "http://www.example.com"
+    match = site.add_match("/test") { |doc| }
+    assert_instance_of Scrape::Match, match
+  end
+
+  test "#parse should return absolute urls that match the site's url" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="http://www.example.com/link1.html">link 1</a>
+          <a href="http://example.com/link2.html">link 2</a>
+          <a href="http://example.org/link3.html">link 3</a>
+        </body>
+        </html>
+      HTML
+
+    site = Scrape::Site.new "http://www.example.com"
+    assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+  end
+
+  test "#parse should return relative urls to the site" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="link1.html">link 1</a>
+        </body>
+        </html>
+      HTML
+
+    site = Scrape::Site.new "http://www.example.com"
+    assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+  end
+
+  test "#parse should return no urls" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="/link1.html">link 1</a>
+        </body>
+        </html>
+      HTML
+
+    site = Scrape::Site.new "http://www.example.com/test"
+    assert_equal [], site.parse("/test")
+  end
+
+  test "#parse should invoke Match when hit" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="link1.html">link 1</a>
+        </body>
+        </html>
+      HTML
+
+    ok = false
+    site = Scrape::Site.new "http://www.example.com"
+    site.add_match(/test/){|doc| ok = true }
+    site.parse "/test"
+
+    assert ok, "Match was not invoked"
+  end
+end
data/test/unit/uri_test.rb
ADDED
@@ -0,0 +1,52 @@
+require "test_helper"
+
+class URITest < Scrape::TestCase
+  {fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
+    test "##{method_name} should return value" do
+      uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
+      assert_equal value, uri.send(method_name)
+    end
+  end
+
+  test "#open should return the contents at the url" do
+    stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
+
+    uri = Scrape::URI.new "http://www.example.com"
+    assert_equal "Howdie", uri.open
+  end
+
+  test "#+ should return a URI with the specified path" do
+    uri1 = Scrape::URI.new "http://www.example.com"
+    uri2 = uri1 + "/bar"
+    assert_equal "http://www.example.com/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI overwriting with the specified path" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "/bar"
+    assert_equal "http://www.example.com/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI with the specified path appended" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "bar"
+    assert_equal "http://www.example.com/foo/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI from the absolute url" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "http://www.example.com/bar"
+    assert_equal "http://www.example.com/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI appended from the absolute url" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "http://www.example.com/foo/bar"
+    assert_equal "http://www.example.com/foo/bar", uri2.to_s
+  end
+
+  test "#< should return true when the specified url is greater" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    assert uri1 < "http://www.example.com/foo/bar"
+  end
+end
metadata
ADDED
@@ -0,0 +1,90 @@
+--- !ruby/object:Gem::Specification
+name: scrape
+version: !ruby/object:Gem::Version
+  version: '0.1'
+  prerelease:
+platform: ruby
+authors:
+- Marty Zalega
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-07-10 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+description: An easy to use utility to scrape websites using a DSL similar to rake.
+email:
+- evilmarty@gmail.com
+executables:
+- scrape
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- Gemfile.lock
+- LICENSE
+- README.md
+- Rakefile
+- bin/scrape
+- examples/github.scrape
+- examples/google.scrape
+- lib/scrape.rb
+- lib/scrape/application.rb
+- lib/scrape/cli.rb
+- lib/scrape/default_loader.rb
+- lib/scrape/dsl.rb
+- lib/scrape/match.rb
+- lib/scrape/site.rb
+- lib/scrape/uri.rb
+- lib/scrape/version.rb
+- scrape.gemspec
+- test/support/test1.scrape
+- test/support/test2.scrape
+- test/support/test3.scrape
+- test/test_helper.rb
+- test/unit/application_test.rb
+- test/unit/default_loader_test.rb
+- test/unit/match_test.rb
+- test/unit/site_test.rb
+- test/unit/uri_test.rb
+homepage: http://github.com/evilmarty/scrape
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: scrape
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: A really simple web scraper
+test_files: []