scrape 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +22 -0
- data/LICENSE +22 -0
- data/README.md +44 -0
- data/Rakefile +10 -0
- data/bin/scrape +8 -0
- data/examples/github.scrape +7 -0
- data/examples/google.scrape +5 -0
- data/lib/scrape/application.rb +57 -0
- data/lib/scrape/cli.rb +39 -0
- data/lib/scrape/default_loader.rb +19 -0
- data/lib/scrape/dsl.rb +13 -0
- data/lib/scrape/match.rb +23 -0
- data/lib/scrape/site.rb +31 -0
- data/lib/scrape/uri.rb +58 -0
- data/lib/scrape/version.rb +3 -0
- data/lib/scrape.rb +36 -0
- data/scrape.gemspec +22 -0
- data/test/support/test1.scrape +5 -0
- data/test/support/test2.scrape +1 -0
- data/test/support/test3.scrape +3 -0
- data/test/test_helper.rb +19 -0
- data/test/unit/application_test.rb +72 -0
- data/test/unit/default_loader_test.rb +25 -0
- data/test/unit/match_test.rb +54 -0
- data/test/unit/site_test.rb +75 -0
- data/test/unit/uri_test.rb +52 -0
- metadata +90 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
+PATH
+  remote: .
+  specs:
+    scrape (0.1)
+
+GEM
+  remote: http://rubygems.org/
+  specs:
+    addressable (2.2.8)
+    crack (0.3.1)
+    nokogiri (1.5.5)
+    webmock (1.8.7)
+      addressable (>= 2.2.7)
+      crack (>= 0.1.7)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  nokogiri (~> 1.5.5)
+  scrape!
+  webmock (~> 1.8.7)
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2012 Marty Zalega
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,44 @@
+# Scrape
+
+A really simple web scraper.
+
+```ruby
+site "https://github.com/explore" # The site to scrape. Will be used as the base address.
+
+match /evilmarty/ do |doc| # A regexp/string/proc to match against the current url.
+
+  doc.search('a[href]') # The nokogiri document of the contents of the current url.
+
+end
+
+site "http://www.tumblr.com" # Can define multiple sites
+
+match "/tagged" do |doc|
+  # Do whatever we want with the document.
+end
+```
+
+## Usage
+
+After creating a `Scrapefile` simply run:
+
+```
+scrape -f [FILE]
+```
+
+If no scrapefile is specified then `Scrapefile` is used by default.
+
+## Installation
+
+Simply install the gem:
+
+```
+gem install scrape
+```
+
+## TODO
+
+* Fix bugs
+* Add support for robots.txt
+* Depth limiting
+* Better docs
data/Rakefile
ADDED
data/bin/scrape
ADDED
data/lib/scrape/application.rb
ADDED
@@ -0,0 +1,57 @@
+class Scrape::Application
+  attr_reader :scrapefile, :loader, :sites, :history
+
+  def initialize scrapefile, loader = Scrape::DefaultLoader.new
+    @scrapefile = File.expand_path scrapefile
+    @loader = loader
+    @sites = {}
+    @queue = []
+    @history = []
+  end
+
+  def run
+    load_scrapefile
+
+    while url = @queue.shift
+      Scrape.logger.info "Loading: #{url}..."
+      @history << url
+      if site = self[url]
+        if urls = site.parse(url)
+          enqueue *urls
+          Scrape.logger.info "Found #{urls.length} urls."
+        else
+          Scrape.logger.info "Done."
+        end
+      else
+        Scrape.logger.info "Not defined."
+      end
+    end
+  end
+
+  def reset
+    @history = []
+    @queue = sites.values.map{|site| site.url.to_s }
+  end
+
+  def queue
+    @queue.dup
+  end
+
+  def enqueue *urls
+    urls.flatten.each do |url|
+      @queue << url unless @history.include?(url) || @queue.include?(url)
+    end
+  end
+
+  def [] url
+    @sites.values.detect{|site| site.url < url }
+  end
+
+  def load_scrapefile
+    return if @scrapefile_loaded
+    result = loader.load(scrapefile)
+    @sites.update result if result.is_a? Hash
+    reset
+    @scrapefile_loaded = true
+  end
+end
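For context, here is a minimal sketch of driving the crawl loop above without the CLI, assuming a `Scrapefile` exists in the working directory (the path and seed url are illustrative):

```ruby
require "scrape"

app = Scrape::Application.new "Scrapefile"
app.load_scrapefile                     # evaluates the file via the loader and seeds the queue
app.enqueue "http://example.com/extra"  # manually queued urls survive because the file is already loaded
app.run                                 # shifts urls off the queue; #history prevents re-visits
```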
data/lib/scrape/cli.rb
ADDED
@@ -0,0 +1,39 @@
+require "optparse"
+
+class Scrape::CLI
+  attr_reader :command, :pwd
+
+  def initialize command, pwd = Dir.pwd
+    @command, @pwd = command, pwd
+  end
+
+  def run argv
+    options = {:file => File.join(pwd, 'Scrapefile')}
+    opts = OptionParser.new do |opts|
+      opts.banner = "Scrape #{Scrape::VERSION} - Usage: #{command} [options]"
+      opts.separator ""
+      opts.separator "Specific options:"
+
+      opts.on "-f", "--scrapefile [FILE]", "Use FILE as scrapefile" do |file|
+        options[:file] = File.expand_path file
+      end
+      opts.on_tail "-h", "--help", "Show this message" do
+        puts opts
+        exit
+      end
+      opts.on_tail "-v", "--version", "Show version" do
+        puts Scrape::VERSION
+        exit
+      end
+    end
+    opts.parse argv
+
+    if File.exists? options[:file]
+      Scrape::Application.new(options[:file]).run
+    else
+      puts "#{command} aborted!"
+      puts "No Scrapefile found"
+      exit -1
+    end
+  end
+end
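The CLI can also be exercised from Ruby; the following sketch is equivalent to running `scrape -f my.scrape` in a shell (`my.scrape` is a hypothetical file name):

```ruby
require "scrape"
require "scrape/cli"  # note: CLI is not in the autoload list in lib/scrape.rb

Scrape::CLI.new("scrape").run ["-f", "my.scrape"]
```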
data/lib/scrape/default_loader.rb
ADDED
@@ -0,0 +1,19 @@
+class Scrape::DefaultLoader
+  def load path
+    path = File.expand_path path
+    sites = {}
+
+    sandbox = Sandbox.new sites
+    sandbox.instance_eval File.read(path), path
+
+    sites
+  end
+
+  class Sandbox
+    include Scrape::DSL
+
+    def initialize sites
+      @sites = sites
+    end
+  end
+end
data/lib/scrape/dsl.rb
ADDED
@@ -0,0 +1,13 @@
+module Scrape::DSL
+  def site *urls
+    @_sites ||= {}
+    @sites ||= {}
+    @current_sites = urls.flatten.map{|url| @_sites[url] ||= Scrape::Site.new(url) }
+  end
+
+  def match matcher, &proc
+    raise ArgumentError.new("site must be set") unless defined? @current_sites
+    matches = @current_sites.map{|site| @sites[site.url.to_s] = site; site.add_match matcher, &proc }
+    matches.size == 1 ? matches.first : matches
+  end
+end
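Put together, a Scrapefile evaluated against this DSL might look like the following sketch (urls illustrative); note that `site` accepts several urls, so one `match` block can be registered on all of them at once:

```ruby
site "http://example.com", "http://example.org"

match /articles/ do |doc|
  puts doc.css("h1").map(&:text)  # doc is the Nokogiri document for the matched page
end
```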
data/lib/scrape/match.rb
ADDED
@@ -0,0 +1,23 @@
+class Scrape::Match
+  attr_reader :matcher
+
+  def initialize matcher, &proc
+    @matcher, @proc = matcher, proc
+    raise ArgumentError.new("Match block expects one argument") if proc.arity != 1
+  end
+
+  def invoke doc
+    @proc.call doc
+  end
+
+  def =~ url
+    case @matcher
+    when String
+      url.to_s.include? @matcher
+    when Regexp
+      url.to_s =~ @matcher
+    when Proc
+      @matcher.call url
+    end
+  end
+end
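A quick sketch of the three matcher kinds `#=~` accepts; the results follow `String#include?`, the Regexp match offset, and the proc's own return value respectively (urls illustrative):

```ruby
Scrape::Match.new("/tagged") {|doc| } =~ "http://example.com/tagged"  # => true (substring)
Scrape::Match.new(/tagged/)  {|doc| } =~ "http://example.com/tagged"  # => 19 (truthy match offset)
Scrape::Match.new(->(url) { url.end_with? "ed" }) {|doc| } =~ "http://example.com/tagged"  # => true
```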
data/lib/scrape/site.rb
ADDED
@@ -0,0 +1,31 @@
+require 'nokogiri'
+
+class Scrape::Site
+  attr_reader :url, :matches
+
+  def initialize url
+    @url = Scrape::URI.new url
+    @url.query = nil
+    @url.fragment = nil
+    @matches = []
+  end
+
+  def add_match matcher, &proc
+    match = Scrape::Match.new(matcher, &proc)
+    @matches << match
+    match
+  end
+
+  def parse url
+    url = self.url + url
+    doc = Nokogiri::HTML url.open
+
+    @matches.each{|match| match.invoke doc if match =~ url }
+
+    urls = doc.css("a[href]").map do |node|
+      href = self.url + node['href']
+      self.url < href ? href : nil
+    end
+    urls.compact
+  end
+end
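In other words, `#parse` fetches a page, fires any matching blocks, and returns only the links that stay under the site's base url. A sketch (urls illustrative; the call performs a real HTTP request):

```ruby
site = Scrape::Site.new "http://example.com/blog"
site.add_match(/post/) {|doc| puts doc.title }
next_urls = site.parse "/blog/post-1.html"  # => only urls under http://example.com/blog
```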
data/lib/scrape/uri.rb
ADDED
@@ -0,0 +1,58 @@
+require 'uri'
+require 'open-uri'
+
+class Scrape::URI
+  def initialize uri = nil
+    @uri = case uri
+      when URI then uri.clone
+      when NilClass then URI.new
+      else URI.parse uri.to_s
+    end
+  end
+
+  %w[fragment host hostname password path port query scheme user to_s relative? absolute?].each do |method_name|
+    class_eval <<-EOT, __FILE__, __LINE__ + 1
+      def #{method_name}
+        @uri.#{method_name}
+      end
+    EOT
+  end
+
+  %w[fragment host hostname password path port query scheme user].each do |method_name|
+    class_eval <<-EOT, __FILE__, __LINE__ + 1
+      def #{method_name}= value
+        @uri.#{method_name} = value
+      end
+    EOT
+  end
+
+  def + url
+    return clone if self == url
+    relative = (url.to_s =~ /^(?!.+:\/\/|\/)/)
+    uri = self.class.new @uri.merge(url)
+    uri.path = "#{@uri.path}#{uri.path}" if relative
+    uri
+  end
+
+  def < url
+    url[0, length] == to_s
+  end
+
+  def [] *args
+    to_s[*args]
+  end
+
+  def == url
+    to_s == url.to_s
+  end
+
+  def length
+    to_s.length
+  end
+  alias_method :size, :length
+
+  def open headers = {}, &block
+    headers = {"User-Agent" => Scrape.user_agent}.merge(headers)
+    super(to_s, headers, &block).read
+  end
+end
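The `+` and `<` operators carry the crawling semantics: absolute paths replace, bare relative paths append, and `<` tests whether a url lives under the receiver. A few worked cases, with values that follow from the merge rules above (urls illustrative):

```ruby
base = Scrape::URI.new "http://example.com/blog"
(base + "/about").to_s     # => "http://example.com/about"           (leading slash replaces the path)
(base + "post.html").to_s  # => "http://example.com/blog/post.html"  (relative path is appended)
base < "http://example.com/blog/post.html"  # => true: the url starts with the base
```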
data/lib/scrape.rb
ADDED
@@ -0,0 +1,36 @@
+require "rubygems"
+require "logger"
+require "bundler/setup"
+
+module Scrape
+  require 'scrape/version'
+
+  autoload 'Application', 'scrape/application'
+  autoload 'Site', 'scrape/site'
+  autoload 'Match', 'scrape/match'
+  autoload 'DefaultLoader', 'scrape/default_loader'
+  autoload 'DSL', 'scrape/dsl'
+  autoload 'URI', 'scrape/uri'
+
+  class ScrapeFileNotFound < Exception; end
+
+  class << self
+    attr_writer :user_agent
+
+    def user_agent
+      @user_agent || "Scrape/#{Scrape::VERSION}"
+    end
+
+    def logger
+      @logger ||= Logger.new STDOUT
+    end
+
+    def logger= log
+      @logger = log
+    end
+
+    def load_scrapefile path
+      Application.new path
+    end
+  end
+end
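Module-level configuration is just writers on `Scrape` itself; a sketch with illustrative values:

```ruby
require "scrape"

Scrape.user_agent = "MyCrawler/1.0"         # overrides the default "Scrape/0.1" header
Scrape.logger     = Logger.new "crawl.log"  # redirect the progress messages
Scrape.load_scrapefile("Scrapefile").run    # returns an Application; #run starts the crawl
```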
data/scrape.gemspec
ADDED
@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path("../lib/scrape/version", __FILE__)
+
+Gem::Specification.new do |s|
+  s.name        = "scrape"
+  s.version     = Scrape::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Marty Zalega"]
+  s.email       = ["evilmarty@gmail.com"]
+  s.homepage    = "http://github.com/evilmarty/scrape"
+  s.summary     = %q{A really simple web scraper}
+  s.description = %q{An easy to use utility to scrape websites using a DSL similar to rake.}
+
+  s.rubyforge_project = "scrape"
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{|f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.add_development_dependency "nokogiri", "~> 1.5.5"
+end
data/test/support/test2.scrape
ADDED
@@ -0,0 +1 @@
+site "http://example.com"
data/test/test_helper.rb
ADDED
@@ -0,0 +1,19 @@
+$: << File.expand_path('../../lib', __FILE__)
+
+require "minitest/autorun"
+require "webmock/minitest"
+
+require "bundler/setup"
+Bundler.setup(:default, :test)
+
+require "scrape"
+
+class Scrape::TestCase < MiniTest::Unit::TestCase
+  class << self
+    def test name, &block
+      method_name = name.gsub /[^a-z0-9_]+/i, '_'
+      define_method "test_#{method_name}", &block
+    end
+    private :test
+  end
+end
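The class-level `test` macro above turns a free-form description into a regular MiniTest method by replacing each run of non-alphanumeric characters with an underscore; inside a `Scrape::TestCase` subclass one would write, e.g.:

```ruby
class ExampleTest < Scrape::TestCase
  test "#parse handles weird input" do  # defines test__parse_handles_weird_input
    assert true
  end
end
```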
data/test/unit/application_test.rb
ADDED
@@ -0,0 +1,72 @@
+require "test_helper"
+
+class ApplicationTest < Scrape::TestCase
+  SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+  test "#load_scrapefile should parse the specified file" do
+    filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+    app = Scrape::Application.new(filepath)
+    assert app.load_scrapefile, "scrape file failed to load"
+    assert_equal ["http://example.com"], app.sites.keys
+  end
+
+  test "#load_scrapefile should return nil when already loaded" do
+    filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+    app = Scrape::Application.new(filepath)
+    assert app.load_scrapefile, "scrape file failed to load"
+    refute app.load_scrapefile, "scrape file should not have loaded again"
+  end
+
+  test "#[] should return the site that matches the given url" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    assert_equal site1, app["http://example.com"]
+  end
+
+  test "#[] should return the site that is relative to the given url" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    assert_equal site1, app["http://example.com/test"]
+  end
+
+  test "#[] should return nil when no site matches the given url" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    assert_nil app["http://example.net"]
+  end
+
+  test "#reset should enqueue the sites that have been defined" do
+    site1 = Scrape::Site.new "http://example.com"
+    site2 = Scrape::Site.new "http://example.org"
+    app = Scrape::Application.new(".")
+    app.sites.update site1.to_s => site1, site2.to_s => site2
+    app.reset
+    assert_equal ["http://example.com", "http://example.org"], app.queue
+  end
+
+  test "#run should load the specified file" do
+    filepath = File.join(SUPPORT_FILES, 'test1.scrape')
+    test_loader = MiniTest::Mock.new
+    test_loader.expect :load, nil, [filepath]
+    Scrape::Application.new(filepath, test_loader).run
+    assert test_loader.verify, "loader did not receive file"
+  end
+
+  test "#enqueue should add the given url to the queue" do
+    app = Scrape::Application.new(".")
+    app.enqueue "http://example.com"
+    assert_equal ["http://example.com"], app.queue
+  end
+
+  test "#enqueue should not add the given to the queue when it already is added" do
+    app = Scrape::Application.new(".")
+    3.times{ app.enqueue "http://example.com" }
+    assert_equal ["http://example.com"], app.queue
+  end
+end
data/test/unit/default_loader_test.rb
ADDED
@@ -0,0 +1,25 @@
+require "test_helper"
+
+class DefaultLoaderTest < Scrape::TestCase
+  SUPPORT_FILES = File.expand_path File.join(File.dirname(__FILE__), '..', 'support')
+
+  test "#load should return sites parsed from the specified file" do
+    loader = Scrape::DefaultLoader.new
+    sites = loader.load File.join(SUPPORT_FILES, "test1.scrape")
+    assert_equal ["http://example.com"], sites.keys
+    assert_instance_of Scrape::Site, sites.values[0]
+  end
+
+  test "#load should return an empty hash when no matches have been defined" do
+    loader = Scrape::DefaultLoader.new
+    sites = loader.load File.join(SUPPORT_FILES, "test2.scrape")
+    assert_equal Hash.new, sites
+  end
+
+  test "#load should raise an error when no site is defined" do
+    loader = Scrape::DefaultLoader.new
+    assert_raises ArgumentError do
+      loader.load File.join(SUPPORT_FILES, "test3.scrape")
+    end
+  end
+end
data/test/unit/match_test.rb
ADDED
@@ -0,0 +1,54 @@
+require "test_helper"
+
+class MatchTest < Scrape::TestCase
+  test "#initialize should raise error when proc's arity isn't one" do
+    assert_raises ArgumentError do
+      Scrape::Match.new("test"){ "no arguments" }
+    end
+  end
+
+  test "#invoke should call the proc" do
+    ok = false
+    match = Scrape::Match.new("test"){|doc| ok = true }
+    match.invoke nil
+    assert ok, "Proc was not called"
+  end
+
+  test "#invoke should pass the document to the proc" do
+    doc = "yay"
+    ok = false
+    match = Scrape::Match.new("test"){|d| ok = (doc == d) }
+    match.invoke doc
+    assert ok, "Document was not passed into the proc"
+  end
+
+  test "#=~ should return true when contains string" do
+    match = Scrape::Match.new("bar"){|doc|}
+    assert match =~ "foobar", "Expected true"
+  end
+
+  test "#=~ should return false when doesn't contains string" do
+    match = Scrape::Match.new("bar"){|doc|}
+    refute match =~ "ponies", "Expected false"
+  end
+
+  test "#=~ should return true when matches regexp" do
+    match = Scrape::Match.new(/bar/){|doc|}
+    assert match =~ "foobar", "Expected true"
+  end
+
+  test "#=~ should return false when doesn't match regexp" do
+    match = Scrape::Match.new(/bar/){|doc|}
+    refute match =~ "ponies", "Expected false"
+  end
+
+  test "#=~ should return true when proc is truthy" do
+    match = Scrape::Match.new(lambda{|url| true }){|doc|}
+    assert match =~ "ponies", "Expected true"
+  end
+
+  test "#=~ should return false when proc is falsy" do
+    match = Scrape::Match.new(lambda{|url| false }){|doc|}
+    refute match =~ "ponies", "Expected false"
+  end
+end
data/test/unit/site_test.rb
ADDED
@@ -0,0 +1,75 @@
+require "test_helper"
+
+class SiteTest < Scrape::TestCase
+  test "#add_match should create a Match object and be added to the collection" do
+    site = Scrape::Site.new "http://www.example.com"
+    match = site.add_match("/test") { |doc| }
+    assert_instance_of Scrape::Match, match
+  end
+
+  test "#parse should return absolute urls that match the site's url" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="http://www.example.com/link1.html">link 1</a>
+          <a href="http://example.com/link2.html">link 2</a>
+          <a href="http://example.org/link3.html">link 3</a>
+        </body>
+        </html>
+      HTML
+
+    site = Scrape::Site.new "http://www.example.com"
+    assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+  end
+
+  test "#parse should return relative urls to the site" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="link1.html">link 1</a>
+        </body>
+        </html>
+      HTML
+
+    site = Scrape::Site.new "http://www.example.com"
+    assert_equal [Scrape::URI.new("http://www.example.com/link1.html")], site.parse("/test")
+  end
+
+  test "#parse should return no urls" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="/link1.html">link 1</a>
+        </body>
+        </html>
+      HTML
+
+    site = Scrape::Site.new "http://www.example.com/test"
+    assert_equal [], site.parse("/test")
+  end
+
+  test "#parse should invoke Match when hit" do
+    stub_request(:get, "http://www.example.com/test").
+      with(:headers => {"User-Agent" => Scrape.user_agent}).
+      to_return(:status => 200, :body => <<-HTML)
+        <html>
+        <body>
+          <a href="link1.html">link 1</a>
+        </body>
+        </html>
+      HTML
+
+    ok = false
+    site = Scrape::Site.new "http://www.example.com"
+    site.add_match(/test/){|doc| ok = true }
+    site.parse "/test"
+
+    assert ok, "Match was not invoked"
+  end
+end
data/test/unit/uri_test.rb
ADDED
@@ -0,0 +1,52 @@
+require "test_helper"
+
+class URITest < Scrape::TestCase
+  {fragment: "blah", host: "www.example.com", password: "secret", path: "/dot", query: "foo=bar", scheme: "http", user: "chuck", relative?: false, absolute?: true}.each do |method_name, value|
+    test "##{method_name} should return value" do
+      uri = Scrape::URI.new "http://chuck:secret@www.example.com/dot?foo=bar#blah"
+      assert_equal value, uri.send(method_name)
+    end
+  end
+
+  test "#open should return the contents at the url" do
+    stub_request(:get, "http://www.example.com/").with(headers: {"User-Agent" => Scrape.user_agent}).to_return(status: 200, body: "Howdie")
+
+    uri = Scrape::URI.new "http://www.example.com"
+    assert_equal "Howdie", uri.open
+  end
+
+  test "#+ should return a URI with the specified path" do
+    uri1 = Scrape::URI.new "http://www.example.com"
+    uri2 = uri1 + "/bar"
+    assert_equal "http://www.example.com/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI overwriting with the specified path" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "/bar"
+    assert_equal "http://www.example.com/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI with the specified path appended" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "bar"
+    assert_equal "http://www.example.com/foo/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI from the absolute url" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "http://www.example.com/bar"
+    assert_equal "http://www.example.com/bar", uri2.to_s
+  end
+
+  test "#+ should return a URI appended from the absolute url" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    uri2 = uri1 + "http://www.example.com/foo/bar"
+    assert_equal "http://www.example.com/foo/bar", uri2.to_s
+  end
+
+  test "#< should return true when specified url is greater" do
+    uri1 = Scrape::URI.new "http://www.example.com/foo"
+    assert uri1 < "http://www.example.com/foo/bar"
+  end
+end
metadata
ADDED
@@ -0,0 +1,90 @@
+--- !ruby/object:Gem::Specification
+name: scrape
+version: !ruby/object:Gem::Version
+  version: '0.1'
+prerelease:
+platform: ruby
+authors:
+- Marty Zalega
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-07-10 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+description: An easy to use utility to scrape websites using a DSL similar to rake.
+email:
+- evilmarty@gmail.com
+executables:
+- scrape
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- Gemfile.lock
+- LICENSE
+- README.md
+- Rakefile
+- bin/scrape
+- examples/github.scrape
+- examples/google.scrape
+- lib/scrape.rb
+- lib/scrape/application.rb
+- lib/scrape/cli.rb
+- lib/scrape/default_loader.rb
+- lib/scrape/dsl.rb
+- lib/scrape/match.rb
+- lib/scrape/site.rb
+- lib/scrape/uri.rb
+- lib/scrape/version.rb
+- scrape.gemspec
+- test/support/test1.scrape
+- test/support/test2.scrape
+- test/support/test3.scrape
+- test/test_helper.rb
+- test/unit/application_test.rb
+- test/unit/default_loader_test.rb
+- test/unit/match_test.rb
+- test/unit/site_test.rb
+- test/unit/uri_test.rb
+homepage: http://github.com/evilmarty/scrape
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: scrape
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: A really simple web scraper
+test_files: []