eagleclaw 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +4 -0
- data/bin/eagleclaw +5 -0
- data/lib/eagleclaw/browser.rb +24 -0
- data/lib/eagleclaw/callbacks.rb +64 -0
- data/lib/eagleclaw/constmatch.rb +30 -0
- data/lib/eagleclaw/formatters/csv.rb +19 -0
- data/lib/eagleclaw/formatters/json.rb +13 -0
- data/lib/eagleclaw/formatters.rb +18 -0
- data/lib/eagleclaw/runner.rb +50 -0
- data/lib/eagleclaw/scrapers.rb +9 -0
- data/lib/eagleclaw/string.rb +15 -0
- data/lib/eagleclaw/xml.rb +38 -0
- data/lib/eagleclaw.rb +139 -0
- metadata +130 -0
data/README.md
ADDED
data/bin/eagleclaw
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module EagleClaw
  ##
  # Add browsing capabilities to a class.
  #
  # Including this module gives an object a lazily created `Mechanize` agent
  # and forwards any method the object does not define itself to that agent.
  module Browser
    ##
    # This browser's `Mechanize` agent.
    #
    # The agent is created on first access and memoized in `@agent`.
    #
    # @return [Mechanize]
    #
    # @see http://mechanize.rubyforge.org/mechanize/
    def agent
      @agent ||= Mechanize.new
    end

    ##
    # If a non-existent method is called, pass it along to the `Mechanize`
    # agent.
    #
    # Calls the agent cannot handle either are handed to `super`, so the
    # resulting `NoMethodError` names the including object, not the agent.
    def method_missing(name, *args, &block)
      # `true` keeps private agent methods reachable, as plain `send` allowed.
      if agent.respond_to?(name, true)
        agent.send(name, *args, &block)
      else
        super
      end
    end

    ##
    # Make `respond_to?` report the methods that {#method_missing} forwards
    # to the agent.
    #
    # @param [Symbol] name the method being queried.
    # @param [Boolean] include_private whether private methods count.
    # @return [Boolean]
    def respond_to_missing?(name, include_private = false)
      agent.respond_to?(name, include_private) || super
    end
  end # module Browser
end # module EagleClaw
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module EagleClaw
  ##
  # Mixin providing a small registry of callbacks keyed by an arbitrary
  # context object. Callbacks are stored in the `@callbacks` instance variable
  # of whatever object the methods are called on.
  module Callbacks
    ##
    # Register a callback for a given context.
    #
    # @param [Object] context
    #   an object (such as a symbol or list of symbols) which refers to a
    #   certain callback context.
    # @param [optional, Symbol] meth the name of the callback method.
    # @raise [ArgumentError] if neither a method name nor a block is given.
    #
    # @overload register(context, :method_name)
    #   Register a method as a callback.
    # @overload register(context, &block)
    #   Register a block as a callback.
    #
    # @example Registering a method as a callback
    #   class MyCLS
    #     extend EagleClaw::Callbacks
    #
    #     register(:preprocessing, :setup_db)
    #
    #     def setup_db
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    # @example Registering a block as a callback
    #   class MyCLS
    #     extend EagleClaw::Callbacks
    #
    #     register :preprocessing do
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    def register(context, meth = nil, &block)
      # A block takes precedence over a method name when both are supplied.
      callback = block || meth
      # Fail at registration time rather than later, when the callbacks run.
      raise ArgumentError, "a method name or a block is required" if callback.nil?

      ((@callbacks ||= {})[context] ||= []) << callback
    end

    ##
    # Run the callbacks for a given context, in registration order.
    #
    # Does nothing when no callback was ever registered, or none for `context`.
    #
    # @param context
    # @param [optional, Object] recipient the object to run methods/procs on.
    # @return [nil]
    def run_callbacks(context, recipient = self)
      # `@callbacks` stays nil until the first `register` call.
      registered = (@callbacks || {})[context] || []
      registered.each { |callback| run_proc(callback, recipient) }
      nil
    end

    private

    ##
    # Run a given method or `Proc` on an instance.
    #
    # Symbols are sent to `recipient` as method names; anything else is
    # evaluated as a block in the context of `recipient`.
    def run_proc(meth, recipient = self)
      if meth.is_a? Symbol
        recipient.send(meth)
      else
        recipient.instance_eval(&meth)
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EagleClaw
  ##
  # Mixin to allow an object to act as a case-insensitive hash against its own
  # constants. This should be used with singleton metaclasses, like so:
  #
  #     class MyClass
  #       class << self
  #         include EagleClaw::ConstantMatcher
  #       end
  #     end
  #
  module ConstantMatcher
    ##
    # Retrieve the constant for the given symbol. Matching will be performed
    # case-insensitively.
    #
    # @param [Symbol, String] constant
    #   A symbol (or string) referring to the constant to get.
    # @return the value of the first constant whose name matches.
    # @raise [NameError] if no constant matches.
    #
    # @example Get the JSON EagleClaw formatter
    #   EagleClaw::Formatters[:json] => EagleClaw::Formatters::JSON
    #
    def [](constant)
      wanted = constant.to_s.downcase
      # `constants` yields Strings on Ruby 1.8 but Symbols on 1.9+, so each
      # name is normalised to a String before comparing.
      match = constants.find { |const| const.to_s.downcase == wanted }
      raise NameError, "Constant #{constant.inspect} not found" if match.nil?

      const_get(match)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module EagleClaw::Formatters
  ##
  # Formatter which renders scraped records as CSV text.
  module CSV
    include ::EagleClaw::Formatter

    ##
    # Render `data` as CSV: a header row followed by one row per record.
    #
    # @param [Array] data
    #   records responding to `keys` and `[]` (e.g. Hashes); the keys of the
    #   first record define the columns and their order.
    # @param problems
    #   not used by this formatter.
    # @return [String]
    #   the CSV document, or an empty string when `data` is empty.
    def self.format(data, problems)
      # Without a first record there are no columns to emit.
      return '' if data.empty?

      keys = data.first.keys
      rows = [keys] + data.map { |datum| keys.map { |key| datum[key] } }

      if defined?(::CSV::Writer)
        # Ruby 1.8's csv library: rows are written through CSV::Writer.
        buffer = StringIO.new
        ::CSV::Writer.generate(buffer) do |out|
          rows.each { |row| out << row }
        end
        buffer.string
      else
        # Later csv libraries have no CSV::Writer; CSV.generate builds and
        # returns the String itself.
        ::CSV.generate do |out|
          rows.each { |row| out << row }
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'eagleclaw/constmatch'
|
2
|
+
|
3
|
+
module EagleClaw
  ##
  # Marker mixin for formatter implementations. It defines no methods yet;
  # include it in your formatter so that it picks up any shared behaviour
  # added in the future.
  module Formatter; end

  ##
  # Namespace holding the bundled formatters.
  module Formatters
    # Same effect as including the matcher into the singleton class: the
    # matcher's methods become callable on `Formatters` itself.
    extend EagleClaw::ConstantMatcher

    # Formatter implementations are only loaded when first referenced.
    autoload :JSON, 'eagleclaw/formatters/json'
    autoload :CSV, 'eagleclaw/formatters/csv'
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require 'ostruct'
|
3
|
+
|
4
|
+
module EagleClaw
  ##
  # Command-line front end: parses options, instantiates the named scraper,
  # runs it and prints the formatted result.
  class Runner
    ##
    # Parsed command-line options: `require` (libraries to load) and `format`
    # (name of the output formatter).
    #
    # @return [OpenStruct]
    attr_accessor :options

    ##
    # Create a runner with the default options: nothing to require, JSON output.
    def initialize
      @options = OpenStruct.new(:require => [], :format => :json)
    end

    ##
    # The (memoized) option parser for the command line.
    #
    # @return [OptionParser]
    def parser
      @parser ||= OptionParser.new do |opts|
        opts.banner = "Usage: #{$0} [options] ScraperName"

        # ---- MAIN ----

        opts.on('-r', '--require LIBRARY',
                "Require LIBRARY before loading scrapers") do |lib|
          options.require << lib
        end

        opts.on('-o', '--output FORMAT',
                "Output data in format FORMAT (default: JSON)") do |format|
          options.format = format.downcase.to_sym
        end

        # ---- TAIL ----

        opts.on_tail('-h', '--help', "Show this help message") do
          # `puts` returns nil, so `puts opts and exit` never reached `exit`.
          puts opts
          exit
        end
      end
    end

    ##
    # Parse `args` destructively, recording recognised options in {#options}.
    #
    # @param [Array<String>] args command-line arguments.
    # @return the result of `OptionParser#parse!`.
    def parse!(args = ARGV)
      parser.parse!(args)
    end

    ##
    # Parse the command line, run the named scraper and print its output.
    #
    # Aborts with the usage text when no scraper name is given.
    #
    # @param [Array<String>] args command-line arguments.
    def run!(args = ARGV)
      parse!(args)

      # The first argument left over after option parsing names the scraper.
      scraper_name = args.shift
      abort(parser.to_s) if scraper_name.nil?

      @options.require.each { |lib| require(lib) }
      formatter = EagleClaw::Formatters[@options.format]

      scraper = EagleClaw::Scrapers[scraper_name.downcase.to_sym].new
      scraper.run
      data, problems = scraper.data, scraper.problems
      puts formatter.format(data, problems)
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
class String
|
4
|
+
def starts_with?(string)
|
5
|
+
!! (self =~ Regexp.new('^' + Regexp.escape(string)))
|
6
|
+
end
|
7
|
+
|
8
|
+
def ends_with?(string)
|
9
|
+
!! (self =~ Regexp.new(Regexp.escape(string) + '$'))
|
10
|
+
end
|
11
|
+
|
12
|
+
def sha1
|
13
|
+
(Digest::SHA1.new << self).hexdigest
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
class Nokogiri::XML::Element

  ##
  # Keep consuming elements until the block returns `true`.
  #
  # Iteration starts with this element itself and stops early when `method`
  # returns `nil`, i.e. when there is no further element.
  #
  # @param [Symbol] method
  #   The method to call on the current element to get the next one. The default
  #   is to use `:next_element`. Use `:next` to include text elements in the
  #   iteration.
  #
  # @yield [Nokogiri::XML::Element] element
  #   The current element. If the block returns `true` (or any non-false value)
  #   then this is what the method will return.
  #
  # @return
  #   the first element for which the block returned a true value, or `nil`
  #   if the elements ran out first.
  def next_until(method = :next_element)
    current = self
    # Check for nil first so the block never receives nil and `send` is never
    # called on it.
    until current.nil? || yield(current)
      current = current.send(method)
    end
    current
  end

  ##
  # Keep consuming elements until the block returns `false`.
  #
  # The behaviour of this method is identical to {#next_until}, only it will
  # keep iterating until the block yields a *false* value instead of a true one.
  #
  # @return
  #   the first element for which the block returned a false value, or `nil`
  #   if the elements ran out first.
  #
  # @see #next_until
  def next_while(method = :next_element)
    current = self
    # Same nil guard as in #next_until.
    while !current.nil? && yield(current)
      current = current.send(method)
    end
    current
  end
end
|
data/lib/eagleclaw.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'eagleclaw/browser'
|
2
|
+
require 'eagleclaw/callbacks'
|
3
|
+
require 'eagleclaw/formatters'
|
4
|
+
require 'eagleclaw/scrapers'
|
5
|
+
require 'eagleclaw/string'
|
6
|
+
require 'eagleclaw/xml'
|
7
|
+
|
8
|
+
module EagleClaw
  ##
  # Base class for scrapers. Subclasses declare properties and before/after
  # hooks at class level; {#run} executes them against an instance.
  class Scraper
    class << self
      include ::EagleClaw::Callbacks

      # Allows the list of property names to be replaced wholesale.
      attr_writer :properties

      ##
      # Names of the properties declared with {prop}, in declaration order.
      #
      # @return [Array<Symbol>] empty when no property has been declared.
      def properties
        @properties ||= []
      end

      ##
      # Define a pre-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return [nil]
      #
      # @overload before(:each, :method_name)
      #   Run the given method before each component of the run.
      # @overload before(:all, :method_name)
      #   Run the given method before the run itself.
      # @overload before(:each, &block)
      #   Run the given block (using `instance_eval`) before each component of
      #   the run.
      # @overload before(:all, &block)
      #   Run the given block (using `instance_eval`) before the run itself.
      #
      # @example Fetch a page before the run
      #   before(:all) do
      #     agent.get("http://google.com/")
      #   end
      #
      # @example Reset the page before each component of the run
      #   before(:each) do
      #     agent.get("http://google.com/")
      #   end
      #
      def before(context, meth = nil, &block)
        register([:before, context], meth, &block)
        nil
      end

      ##
      # Define a post-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return [nil]
      #
      # @overload after(:each, :method_name)
      #   Run the given method after each component of the run.
      # @overload after(:all, :method_name)
      #   Run the given method after the run itself.
      # @overload after(:each, &block)
      #   Run the given block (using `instance_eval`) after each component of
      #   the run.
      # @overload after(:all, &block)
      #   Run the given block (using `instance_eval`) after the entire run.
      #
      # @see before
      #
      def after(context, meth = nil, &block)
        register([:after, context], meth, &block)
        nil
      end

      ##
      # Declare a property and register the method or block which collects it.
      #
      # Declaring the same property again adds another callback for it; the
      # property itself is still visited only once per run.
      #
      # @param [Symbol, String] prop_name name of the property.
      # @param [optional, Symbol] meth name of method to call.
      def prop(prop_name, meth = nil, &block)
        name = prop_name.to_sym
        # A duplicate entry would make #run execute this property's callbacks
        # once per entry.
        properties << name unless properties.include?(name)
        register([:property, name], meth, &block)
      end
    end # class << self (Scraper)

    include Browser

    ##
    # An `Array` which holds data collected during a run.
    #
    # @see #initialize
    # @see #reset
    attr_accessor :data

    ##
    # An `Array` which collects problems recorded during a run.
    attr_accessor :problems

    ##
    # Create a new {Scraper} instance.
    #
    # By default, just sets {#data @data} and {#problems @problems} to empty
    # `Array`s.
    #
    def initialize
      @data = []
      @problems = []
    end

    ##
    # Reset this scraper instance's state.
    #
    # The default version of this method just clears {#data @data} and
    # {#problems @problems}.
    #
    # @return [nil]
    # @abstract Subclass and extend to reset the scraper state.
    #
    def reset
      data.clear
      problems.clear
      nil
    end

    ##
    # Run the scraper.
    #
    # Operating procedure:
    #
    # 1. Run {Scraper.before before(:all)} blocks.
    # 2. For each property (defined with {Scraper.prop prop(:prop_name)}):
    #    1. Run {Scraper.before before(:each)} blocks.
    #    2. Run the property itself.
    #    3. Run {Scraper.after after(:each)} blocks.
    # 3. Run {Scraper.after after(:all)} blocks.
    # 4. Return {#data data}.
    #
    # @see #reset
    # @see #data
    def run
      scraper_class = self.class

      scraper_class.run_callbacks([:before, :all], self)
      # `|| []` covers a `properties=` caller that assigned nil.
      (scraper_class.properties || []).each do |property|
        scraper_class.run_callbacks([:before, :each], self)
        scraper_class.run_callbacks([:property, property], self)
        scraper_class.run_callbacks([:after, :each], self)
      end
      scraper_class.run_callbacks([:after, :all], self)
      data
    end
  end # class Scraper
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: eagleclaw
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Zachary Voase
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-05-10 00:00:00 +00:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: mechanize
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 0
|
30
|
+
- 0
|
31
|
+
version: 1.0.0
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
name: nokogiri
|
36
|
+
prerelease: false
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 1
|
43
|
+
- 4
|
44
|
+
- 1
|
45
|
+
version: 1.4.1
|
46
|
+
type: :runtime
|
47
|
+
version_requirements: *id002
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: rspec
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
segments:
|
56
|
+
- 1
|
57
|
+
- 3
|
58
|
+
- 0
|
59
|
+
version: 1.3.0
|
60
|
+
type: :development
|
61
|
+
version_requirements: *id003
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
prerelease: false
|
65
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 0
|
71
|
+
- 5
|
72
|
+
- 4
|
73
|
+
version: 0.5.4
|
74
|
+
type: :development
|
75
|
+
version_requirements: *id004
|
76
|
+
description: eagleclaw is a small library to help build screen-scrapers
|
77
|
+
email: z@zacharyvoase.com
|
78
|
+
executables:
|
79
|
+
- eagleclaw
|
80
|
+
extensions: []
|
81
|
+
|
82
|
+
extra_rdoc_files: []
|
83
|
+
|
84
|
+
files:
|
85
|
+
- README.md
|
86
|
+
- lib/eagleclaw/browser.rb
|
87
|
+
- lib/eagleclaw/callbacks.rb
|
88
|
+
- lib/eagleclaw/constmatch.rb
|
89
|
+
- lib/eagleclaw/formatters/csv.rb
|
90
|
+
- lib/eagleclaw/formatters/json.rb
|
91
|
+
- lib/eagleclaw/formatters.rb
|
92
|
+
- lib/eagleclaw/runner.rb
|
93
|
+
- lib/eagleclaw/scrapers.rb
|
94
|
+
- lib/eagleclaw/string.rb
|
95
|
+
- lib/eagleclaw/xml.rb
|
96
|
+
- lib/eagleclaw.rb
|
97
|
+
has_rdoc: false
|
98
|
+
homepage: http://github.com/zacharyvoase/eagleclaw
|
99
|
+
licenses:
|
100
|
+
- Public Domain
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options: []
|
103
|
+
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
segments:
|
111
|
+
- 1
|
112
|
+
- 8
|
113
|
+
- 6
|
114
|
+
version: 1.8.6
|
115
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirements: []
|
123
|
+
|
124
|
+
rubyforge_project:
|
125
|
+
rubygems_version: 1.3.6
|
126
|
+
signing_key:
|
127
|
+
specification_version: 3
|
128
|
+
summary: A small screen-scraping library
|
129
|
+
test_files: []
|
130
|
+
|