eagleclaw 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +4 -0
- data/bin/eagleclaw +5 -0
- data/lib/eagleclaw/browser.rb +24 -0
- data/lib/eagleclaw/callbacks.rb +64 -0
- data/lib/eagleclaw/constmatch.rb +30 -0
- data/lib/eagleclaw/formatters/csv.rb +19 -0
- data/lib/eagleclaw/formatters/json.rb +13 -0
- data/lib/eagleclaw/formatters.rb +18 -0
- data/lib/eagleclaw/runner.rb +50 -0
- data/lib/eagleclaw/scrapers.rb +9 -0
- data/lib/eagleclaw/string.rb +15 -0
- data/lib/eagleclaw/xml.rb +38 -0
- data/lib/eagleclaw.rb +139 -0
- metadata +130 -0
data/README.md
ADDED
data/bin/eagleclaw
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module EagleClaw
  ##
  # Add browsing capabilities to a class.
  #
  # Including classes get a lazily-created `Mechanize` agent, and any method
  # the agent understands can be called directly on the including object.
  module Browser
    ##
    # This browser's `Mechanize` agent, created on first use and then reused.
    #
    # @return [Mechanize]
    #
    # @see http://mechanize.rubyforge.org/mechanize/
    def agent
      @agent ||= Mechanize.new
    end

    ##
    # If a non-existent method is called, pass it along to the `Mechanize`
    # agent when the agent can handle it.
    #
    # Anything the agent does not understand goes to `super`, so the resulting
    # `NoMethodError` names this object rather than the agent.
    def method_missing(name, *args, &block)
      # `true` includes private methods, matching the reach of `send` below.
      if agent.respond_to?(name, true)
        agent.send(name, *args, &block)
      else
        super
      end
    end

    ##
    # Keep `respond_to?` consistent with the delegation in {#method_missing}.
    #
    # @param [Symbol] name the method being queried.
    # @param [Boolean] include_private whether private methods count.
    # @return [Boolean]
    def respond_to_missing?(name, include_private = false)
      agent.respond_to?(name, include_private) || super
    end
  end # module Browser
end # module EagleClaw
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module EagleClaw
  ##
  # Mixin providing a small registry of callbacks, grouped by an arbitrary
  # "context" key, together with a way to run them against an object.
  module Callbacks
    ##
    # Register a callback for a given context.
    #
    # @param [Object] context
    #   an object (such as a symbol or list of symbols) which refers to a
    #   certain callback context.
    # @param [optional, Symbol] meth the name of the callback method.
    # @return [Array] every callback registered so far for `context`.
    # @raise [ArgumentError] if neither a method name nor a block is given.
    #
    # @overload register(context, :method_name)
    #   Register a method as a callback.
    # @overload register(context, &block)
    #   Register a block as a callback.
    #
    # @example Registering a method as a callback
    #   class MyCLS
    #     include Callbacks
    #
    #     register(:preprocessing, :setup_db)
    #
    #     def setup_db
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    # @example Registering a block as a callback
    #   class MyCLS
    #     include Callbacks
    #
    #     register :preprocessing do
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    def register(context, meth = nil, &block)
      # A block takes precedence over a method name, as before.
      callback = block || meth
      # Fail here rather than storing nil and blowing up later in run_callbacks.
      raise ArgumentError, "a method name or a block is required" if callback.nil?

      ((@callbacks ||= {})[context] ||= []) << callback
    end

    ##
    # Run the callbacks for a given context, in registration order.
    #
    # Does nothing when no callback has been registered for the context, or
    # when nothing has been registered at all.
    #
    # @param context
    # @param [optional, Object] recipient the object to run methods/procs on.
    # @return [nil]
    def run_callbacks(context, recipient = self)
      # @callbacks is only created by #register, so it may still be nil here.
      registered = (@callbacks || {})[context] || []
      registered.each { |callback| run_proc(callback, recipient) }
      nil
    end

    private

    ##
    # Run a given method or `Proc` on an instance.
    #
    # Symbols are sent to the recipient as method names; anything else is
    # treated as a block and evaluated with the recipient as `self`.
    def run_proc(meth, recipient = self)
      if meth.is_a? Symbol
        recipient.send(meth)
      else
        recipient.instance_eval(&meth)
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EagleClaw
  ##
  # Mixin to allow an object to act as a case-insensitive hash against its own
  # constants. This should be used with singleton metaclasses, like so:
  #
  #     class MyClass
  #       class << self
  #         include EagleClaw::ConstantMatcher
  #       end
  #     end
  #
  module ConstantMatcher
    ##
    # Retrieve the constant for the given symbol. Matching will be performed
    # case-insensitively.
    #
    # @param [Symbol, String] constant
    #   A symbol (or string) referring to the constant to get.
    # @return [Object] the value of the first constant whose name matches.
    # @raise [NameError] if no constant matches.
    #
    # @example Get the JSON EagleClaw formatter
    #   EagleClaw::Formatters[:json] => EagleClaw::Formatters::JSON
    #
    def [](constant)
      wanted = constant.to_s.downcase
      # `constants` yields Strings on Ruby 1.8 and Symbols on 1.9+; normalise
      # both sides to Strings so the comparison works on either.
      match = constants.find { |const| const.to_s.downcase == wanted }
      raise NameError, "Constant #{constant.inspect} not found" if match.nil?

      const_get(match)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module EagleClaw::Formatters
  ##
  # Formatter which renders scraped records as CSV text.
  module CSV
    include ::EagleClaw::Formatter

    ##
    # Render `data` as CSV: a header row followed by one row per record.
    #
    # The header is taken from the keys of the first record, and each record
    # is read through `[]` using those same keys.
    #
    # @param [Array] data records to write; each must respond to `[]`, and
    #   the first must respond to `keys`.
    # @param problems accepted for interface compatibility; not used here.
    # @return [String] the CSV text, or an empty string when `data` is empty.
    def self.format(data, problems)
      # Without a first record there is no header to derive.
      return '' if data.nil? || data.empty?

      keys = data.first.keys
      rows = [keys] + data.map { |datum| keys.map { |key| datum[key] } }

      if defined?(::CSV::Writer)
        # Ruby 1.8 CSV library: row-oriented writer over an IO-like object.
        s = StringIO.new
        ::CSV::Writer.generate(s) do |out|
          rows.each { |row| out << row }
        end
        s.string
      else
        # Ruby 1.9+ CSV library has no Writer class; generate returns the String.
        ::CSV.generate do |out|
          rows.each { |row| out << row }
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'eagleclaw/constmatch'
|
2
|
+
|
3
|
+
module EagleClaw
  ##
  # Marker module for formatter implementations. It carries no behaviour yet;
  # include it in your formatter so that it picks up any helpers added later.
  module Formatter; end

  ##
  # Namespace holding the bundled formatters. Thanks to
  # {EagleClaw::ConstantMatcher} a formatter can be looked up by name with
  # `Formatters[:json]`; each one is only loaded when first referenced.
  module Formatters
    # Same effect as including the matcher into the singleton class.
    extend EagleClaw::ConstantMatcher

    autoload(:JSON, 'eagleclaw/formatters/json')
    autoload(:CSV, 'eagleclaw/formatters/csv')
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require 'ostruct'
|
3
|
+
|
4
|
+
module EagleClaw
  ##
  # Command-line front end: parses options, instantiates the named scraper,
  # runs it and prints the formatted result.
  class Runner
    ##
    # Parsed options: `require` (list of libraries to load before running)
    # and `format` (formatter name as a lower-case symbol, `:json` by default).
    attr_accessor :options

    def initialize
      @options = OpenStruct.new(:require => [], :format => :json)
    end

    ##
    # The option parser, built on first use.
    #
    # @return [OptionParser]
    def parser
      @parser ||= OptionParser.new do |opts|
        opts.banner = "Usage: #{$0} [options] ScraperName"

        # ---- MAIN ----

        opts.on('-r', '--require LIBRARY',
                "Require LIBRARY before loading scrapers") do |lib|
          options.require << lib
        end

        opts.on('-o', '--output FORMAT',
                "Output data in format FORMAT (default: JSON)") do |format|
          options.format = format.downcase.to_sym
        end

        # ---- TAIL ----

        opts.on_tail('-h', '--help', "Show this help message") do
          # `puts opts and exit` never exited, because `puts` returns nil.
          puts opts
          exit
        end
      end
    end

    ##
    # Parse options out of `args` in place, leaving positional arguments.
    #
    # @param [Array<String>] args command-line arguments (default: `ARGV`).
    # @return [Array<String>] the remaining, non-option arguments.
    def parse!(args = ARGV)
      parser.parse!(args)
    end

    ##
    # Parse `args`, run the scraper named by the first positional argument and
    # print its formatted output to standard output.
    #
    # Exits with the usage text on standard error when no scraper name is
    # given.
    #
    # @param [Array<String>] args command-line arguments (default: `ARGV`).
    def run!(args = ARGV)
      parse!(args)

      scraper_name = args.shift
      # Previously a missing name surfaced as NoMethodError on nil.
      abort(parser.to_s) if scraper_name.nil?

      @options.require.each { |lib| require(lib) }
      formatter = EagleClaw::Formatters[@options.format]

      scraper = EagleClaw::Scrapers[scraper_name.downcase.to_sym].new
      scraper.run
      data, problems = scraper.data, scraper.problems
      output = formatter.format(data, problems)
      puts output
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
##
# Small convenience extensions to the core `String` class.
class String
  ##
  # Whether this string begins with `string`.
  #
  # @param [String] string the prefix to look for (matched literally).
  # @return [Boolean]
  def starts_with?(string)
    # \A anchors at the start of the string; ^ would match after any newline.
    !!(self =~ Regexp.new('\A' + Regexp.escape(string)))
  end

  ##
  # Whether this string ends with `string`.
  #
  # @param [String] string the suffix to look for (matched literally).
  # @return [Boolean]
  def ends_with?(string)
    # \z anchors at the very end; $ would match before any newline.
    !!(self =~ Regexp.new(Regexp.escape(string) + '\z'))
  end

  ##
  # The SHA-1 digest of this string.
  #
  # @return [String] 40-character lower-case hexadecimal digest.
  def sha1
    Digest::SHA1.hexdigest(self)
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
class Nokogiri::XML::Element

  ##
  # Keep consuming elements until the block returns `true`.
  #
  # @param [Symbol] method
  #   The method to call on the current element to get the next one. The default
  #   is to use `:next_element`. Use `:next` to include text elements in the
  #   iteration.
  #
  # @yield [Nokogiri::XML::Element] element
  #   The current element. If the block returns `true` (or any non-false value)
  #   then this is what the method will return.
  #
  # @return the first element for which the block is true, or `nil` if the
  #   traversal runs out of elements first.
  def next_until(method = :next_element)
    current = self
    # Stop on nil so that running off the end neither yields nil to the block
    # nor calls the traversal method on nil.
    current = current.send(method) until current.nil? || yield(current)
    current
  end

  ##
  # Keep consuming elements until the block returns `false`.
  #
  # The behaviour of this method is identical to {#next_until}, only it will
  # keep iterating until the block yields a *false* value instead of a true one.
  #
  # @return the first element for which the block is false, or `nil` if the
  #   traversal runs out of elements first.
  #
  # @see #next_until
  def next_while(method = :next_element)
    current = self
    # Same end-of-traversal guard as in #next_until.
    current = current.send(method) while !current.nil? && yield(current)
    current
  end
end
|
data/lib/eagleclaw.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'eagleclaw/browser'
|
2
|
+
require 'eagleclaw/callbacks'
|
3
|
+
require 'eagleclaw/formatters'
|
4
|
+
require 'eagleclaw/scrapers'
|
5
|
+
require 'eagleclaw/string'
|
6
|
+
require 'eagleclaw/xml'
|
7
|
+
|
8
|
+
module EagleClaw
  ##
  # Base class for scrapers. Subclasses declare properties and before/after
  # hooks at class level; {#run} then executes them against an instance.
  class Scraper
    class << self
      include ::EagleClaw::Callbacks

      ##
      # Replace the list of declared property names.
      attr_writer :properties

      ##
      # The property names declared with {prop}, in declaration order.
      #
      # @return [Array<Symbol>] empty when no property has been declared.
      def properties
        @properties ||= []
      end

      ##
      # Define a pre-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return [Array] the callbacks registered so far for this context.
      #
      # @overload before(:each, :method_name)
      #   Run the given method before each component of the run.
      # @overload before(:all, :method_name)
      #   Run the given method before the run itself.
      # @overload before(:each, &block)
      #   Run the given block (using `instance_eval`) before each component of
      #   the run.
      # @overload before(:all, &block)
      #   Run the given block (using `instance_eval`) before the run itself.
      #
      # @example Fetch a page before the run
      #   before(:all) do
      #     agent.get("http://google.com/")
      #   end
      #
      # @example Reset the page before each component of the run
      #   before(:each) do
      #     agent.get("http://google.com/")
      #   end
      #
      def before(context, meth = nil, &block)
        register([:before, context], meth, &block)
      end

      ##
      # Define a post-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return [Array] the callbacks registered so far for this context.
      #
      # @overload after(:each, :method_name)
      #   Run the given method after each component of the run.
      # @overload after(:all, :method_name)
      #   Run the given method after the run itself.
      # @overload after(:each, &block)
      #   Run the given block (using `instance_eval`) after each component of
      #   the run.
      # @overload after(:all, &block)
      #   Run the given block (using `instance_eval`) after the entire run.
      #
      # @see before
      #
      def after(context, meth = nil, &block)
        register([:after, context], meth, &block)
      end

      ##
      # Declare a property and the callback which collects it.
      #
      # @param [Symbol, String] prop_name name of the property.
      # @param [optional, Symbol] meth name of method to call.
      # @return [Array] the callbacks registered so far for this property.
      def prop(prop_name, meth = nil, &block)
        name = prop_name.to_sym
        # List each property once, so that declaring a second callback for it
        # does not make #run execute all of its callbacks twice.
        properties << name unless properties.include?(name)
        register([:property, name], meth, &block)
      end
    end # class << self (Scraper)

    include Browser

    ##
    # An `Array` which holds data collected during a run.
    #
    # @see #initialize
    # @see #reset
    attr_accessor :data

    ##
    # An `Array` which collects problems encountered during a run.
    attr_accessor :problems

    ##
    # Create a new {Scraper} instance.
    #
    # By default, just sets {#data @data} and {#problems @problems} to empty
    # `Array`s.
    #
    def initialize
      @data = []
      @problems = []
    end

    ##
    # Reset this scraper instance's state.
    #
    # The default version of this method just clears {#data @data} and
    # {#problems @problems}.
    #
    # @return [nil]
    # @abstract Subclass and extend to reset the scraper state.
    #
    def reset
      data.clear
      problems.clear
      nil
    end

    ##
    # Run the scraper.
    #
    # Operating procedure:
    #
    # 1. Run {Scraper.before before(:all)} blocks.
    # 2. For each property (defined with {Scraper.prop prop(:prop_name)}):
    #    1. Run {Scraper.before before(:each)} blocks.
    #    2. Run the property itself.
    #    3. Run {Scraper.after after(:each)} blocks.
    # 3. Run {Scraper.after after(:all)} blocks.
    # 4. Return {#data data}.
    #
    # @return [Array] the collected {#data data}.
    #
    # @see #reset
    # @see #data
    def run
      hooks = self.class
      hooks.run_callbacks([:before, :all], self)
      # `|| []` covers a `properties` reader that may still return nil.
      (hooks.properties || []).each do |property|
        hooks.run_callbacks([:before, :each], self)
        hooks.run_callbacks([:property, property], self)
        hooks.run_callbacks([:after, :each], self)
      end
      hooks.run_callbacks([:after, :all], self)
      data
    end
  end # class Scraper
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: eagleclaw
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Zachary Voase
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-05-10 00:00:00 +00:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: mechanize
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 0
|
30
|
+
- 0
|
31
|
+
version: 1.0.0
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
name: nokogiri
|
36
|
+
prerelease: false
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 1
|
43
|
+
- 4
|
44
|
+
- 1
|
45
|
+
version: 1.4.1
|
46
|
+
type: :runtime
|
47
|
+
version_requirements: *id002
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: rspec
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
segments:
|
56
|
+
- 1
|
57
|
+
- 3
|
58
|
+
- 0
|
59
|
+
version: 1.3.0
|
60
|
+
type: :development
|
61
|
+
version_requirements: *id003
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
prerelease: false
|
65
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 0
|
71
|
+
- 5
|
72
|
+
- 4
|
73
|
+
version: 0.5.4
|
74
|
+
type: :development
|
75
|
+
version_requirements: *id004
|
76
|
+
description: eagleclaw is a small library to help build screen-scrapers
|
77
|
+
email: z@zacharyvoase.com
|
78
|
+
executables:
|
79
|
+
- eagleclaw
|
80
|
+
extensions: []
|
81
|
+
|
82
|
+
extra_rdoc_files: []
|
83
|
+
|
84
|
+
files:
|
85
|
+
- README.md
|
86
|
+
- lib/eagleclaw/browser.rb
|
87
|
+
- lib/eagleclaw/callbacks.rb
|
88
|
+
- lib/eagleclaw/constmatch.rb
|
89
|
+
- lib/eagleclaw/formatters/csv.rb
|
90
|
+
- lib/eagleclaw/formatters/json.rb
|
91
|
+
- lib/eagleclaw/formatters.rb
|
92
|
+
- lib/eagleclaw/runner.rb
|
93
|
+
- lib/eagleclaw/scrapers.rb
|
94
|
+
- lib/eagleclaw/string.rb
|
95
|
+
- lib/eagleclaw/xml.rb
|
96
|
+
- lib/eagleclaw.rb
|
97
|
+
has_rdoc: false
|
98
|
+
homepage: http://github.com/zacharyvoase/eagleclaw
|
99
|
+
licenses:
|
100
|
+
- Public Domain
|
101
|
+
post_install_message:
|
102
|
+
rdoc_options: []
|
103
|
+
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
segments:
|
111
|
+
- 1
|
112
|
+
- 8
|
113
|
+
- 6
|
114
|
+
version: 1.8.6
|
115
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirements: []
|
123
|
+
|
124
|
+
rubyforge_project:
|
125
|
+
rubygems_version: 1.3.6
|
126
|
+
signing_key:
|
127
|
+
specification_version: 3
|
128
|
+
summary: A small screen-scraping library
|
129
|
+
test_files: []
|
130
|
+
|