eagleclaw 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,4 @@
1
+ # EagleClaw
2
+
3
+ EagleClaw is a Ruby library for building screen scrapers. It’s very much a
4
+ work-in-progress.
data/bin/eagleclaw ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'eagleclaw/runner'
4
+
5
+ EagleClaw::Runner.new.run!
@@ -0,0 +1,24 @@
1
+ require 'mechanize'
2
+
3
module EagleClaw
  ##
  # Add browsing capabilities to a class.
  #
  # Any method the including object does not define itself is forwarded to
  # the underlying `Mechanize` agent (see {#method_missing}).
  module Browser
    ##
    # This browser's `Mechanize` agent, created lazily on first use and then
    # reused.
    #
    # @return [Mechanize]
    #
    # @see http://mechanize.rubyforge.org/mechanize/
    def agent
      @agent ||= Mechanize.new
    end

    ##
    # If a non-existent method is called, pass it along to the `Mechanize`
    # agent.
    #
    # Methods the agent does not know either are handed to `super`, so the
    # resulting `NoMethodError` names the object the caller actually used
    # rather than the internal agent.
    def method_missing(name, *args, &block)
      # `true` keeps private agent methods reachable, as `send` always did.
      if agent.respond_to?(name, true)
        agent.send(name, *args, &block)
      else
        super
      end
    end

    ##
    # Keep `respond_to?` and `method` consistent with {#method_missing}.
    #
    # @param [Symbol] name the method being queried.
    # @param [Boolean] include_private whether private methods count.
    # @return [Boolean]
    def respond_to_missing?(name, include_private = false)
      agent.respond_to?(name, include_private) || super
    end
  end # module Browser
end # module EagleClaw
@@ -0,0 +1,64 @@
1
module EagleClaw
  ##
  # Mixin which lets an object keep named lists of callbacks and run them
  # later against an arbitrary recipient.
  module Callbacks
    ##
    # Register a callback for a given context.
    #
    # @param [Object] context
    #   an object (such as a symbol or list of symbols) which refers to a
    #   certain callback context.
    # @param [optional, Symbol] meth the name of the callback method.
    # @return [Array] all callbacks registered so far for `context`.
    # @raise [ArgumentError] if neither a method name nor a block is given.
    #
    # @overload register(context, :method_name)
    #   Register a method as a callback.
    # @overload register(context, &block)
    #   Register a block as a callback. If both a method name and a block
    #   are given, the block wins.
    #
    # @example Registering a method as a callback
    #   class MyCLS
    #     include Callbacks
    #
    #     register(:preprocessing, :setup_db)
    #
    #     def setup_db
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    # @example Registering a block as a callback
    #   class MyCLS
    #     include Callbacks
    #
    #     register :preprocessing do
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    def register(context, meth = nil, &block)
      callback = block_given? ? block : meth
      # Fail at registration time instead of storing a nil that would only
      # blow up later, inside run_callbacks.
      raise ArgumentError, "register requires a method name or a block" if callback.nil?
      ((@callbacks ||= {})[context] ||= []) << callback
    end

    ##
    # Run the callbacks for a given context, in registration order.
    #
    # Does nothing if no callback was ever registered (for this context or
    # at all).
    #
    # @param context
    # @param [optional, Object] recipient the object to run methods/procs on.
    # @return [nil]
    def run_callbacks(context, recipient = self)
      # @callbacks stays nil until the first #register call.
      registered = (@callbacks || {})[context] || []
      registered.each { |callback| run_proc(callback, recipient) }
      nil
    end

    private

    ##
    # Run a given method or `Proc` on an instance.
    #
    # Method names (symbols or strings) are sent to the recipient; anything
    # else is treated as a block and evaluated with `instance_eval`.
    def run_proc(meth, recipient = self)
      case meth
      when Symbol, String
        recipient.send(meth)
      else
        recipient.instance_eval(&meth)
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module EagleClaw
  ##
  # Mixin to allow an object to act as a case-insensitive hash against its own
  # constants. This should be used with singleton metaclasses, like so:
  #
  #     class MyClass
  #       class << self
  #         include EagleClaw::ConstantMatcher
  #       end
  #     end
  #
  module ConstantMatcher
    ##
    # Retrieve the constant for the given symbol. Matching will be performed
    # case-insensitively.
    #
    # @param [Symbol, String] constant
    #   A symbol (or string) referring to the constant to get.
    # @return [Object] the value of the first matching constant.
    # @raise [NameError] if no constant matches.
    #
    # @example Get the JSON EagleClaw formatter
    #   EagleClaw::Formatters[:json] => EagleClaw::Formatters::JSON
    #
    def [](constant)
      wanted = constant.to_s.downcase
      # `constants` yields Strings on Ruby 1.8 but Symbols on 1.9+, so
      # normalise each name to a String before comparing.
      match = constants.find { |const| const.to_s.downcase == wanted }
      raise NameError, "Constant #{constant.inspect} not found" if match.nil?
      const_get(match)
    end
  end
end
@@ -0,0 +1,19 @@
1
+ require 'csv'
2
+
3
module EagleClaw::Formatters
  ##
  # Formatter which renders scraped data as CSV text.
  #
  # Note that inside this module `CSV` means the formatter itself; the
  # standard library is always referred to as `::CSV`.
  module CSV
    include ::EagleClaw::Formatter

    ##
    # Render `data` as CSV: one header row followed by one row per datum.
    #
    # The column set and order are taken from the keys of the first datum;
    # every datum is then looked up with those keys.
    #
    # @param [Array<Hash>] data the records to render.
    # @param [Array] problems accepted for interface compatibility with the
    #   other formatters; not included in the output.
    # @return [String] the CSV text, or an empty string when `data` is empty.
    def self.format(data, problems)
      # There is no first record to take the header from.
      return '' if data.empty?

      keys = data.first.keys
      rows = [keys] + data.map { |datum| keys.map { |key| datum[key] } }

      if defined?(::CSV::Writer)
        # Older `csv` library that still ships the Writer class.
        require 'stringio'
        buffer = StringIO.new
        ::CSV::Writer.generate(buffer) do |out|
          rows.each { |row| out << row }
        end
        buffer.string
      else
        # Current `csv` library: Writer is gone, generate builds the String.
        ::CSV.generate do |out|
          rows.each { |row| out << row }
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ gem 'json', '>= 1.4.3'
3
+ require 'json'
4
+
5
module EagleClaw::Formatters
  ##
  # Formatter which serialises a scraper's results as a JSON document.
  module JSON
    include ::EagleClaw::Formatter

    ##
    # Render the results as a single JSON object holding the collected
    # records under "data" and the recorded problems under "problems".
    #
    # @param data the collected records.
    # @param problems the recorded problems.
    # @return [String] the JSON text.
    def self.format(data, problems)
      payload = {"data" => data, "problems" => problems}
      payload.to_json
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'eagleclaw/constmatch'
2
+
3
module EagleClaw
  ##
  # Marker module for formatters. It is empty for now; include it in your
  # formatter anyway so that it picks up any helpers added here later.
  module Formatter
  end

  ##
  # Namespace for the bundled formatters. The namespace itself can be
  # indexed case-insensitively by constant name via
  # {EagleClaw::ConstantMatcher}.
  module Formatters
    extend EagleClaw::ConstantMatcher

    # Each formatter is only loaded the first time its constant is used.
    autoload(:JSON, 'eagleclaw/formatters/json')
    autoload(:CSV, 'eagleclaw/formatters/csv')
  end
end
@@ -0,0 +1,50 @@
1
+ require 'optparse'
2
+ require 'ostruct'
3
+
4
module EagleClaw
  ##
  # Command-line front end: parses options, loads the requested libraries,
  # runs one scraper and prints its formatted output.
  class Runner
    ##
    # Parsed command-line options: `require` (libraries to load, an Array)
    # and `format` (the output format, a Symbol; `:json` by default).
    attr_accessor :options

    def initialize
      @options = OpenStruct.new(:require => [], :format => :json)
    end

    ##
    # The option parser, built once and reused.
    #
    # @return [OptionParser]
    def parser
      @parser ||= OptionParser.new do |opts|
        opts.banner = "Usage: #{$0} [options] ScraperName"

        # ---- MAIN ----

        opts.on('-r', '--require LIBRARY',
                "Require LIBRARY before loading scrapers") do |lib|
          options.require << lib
        end

        opts.on('-o', '--output FORMAT',
                "Output data in format FORMAT (default: JSON)") do |format|
          options.format = format.downcase.to_sym
        end

        # ---- TAIL ----

        opts.on_tail('-h', '--help', "Show this help message") do
          # Two statements on purpose: `puts` returns nil, so chaining with
          # `and` would never reach `exit`.
          puts opts
          exit
        end
      end
    end

    ##
    # Parse the options out of `args`, destructively: recognised switches
    # are removed and only the positional arguments remain.
    #
    # @param [Array<String>] args the command line (defaults to `ARGV`).
    def parse!(args = ARGV)
      parser.parse!(args)
    end

    ##
    # Parse `args`, load the requested libraries, run the named scraper and
    # print its formatted output to standard output.
    #
    # Prints the usage text to standard error and exits with status 1 when
    # no scraper name is given.
    #
    # @param [Array<String>] args the command line (defaults to `ARGV`).
    def run!(args = ARGV)
      parse!(args)

      # The first positional argument names the scraper to run.
      scraper_name = args.shift
      abort(parser.to_s) if scraper_name.nil?

      @options.require.each { |lib| require(lib) }
      formatter = EagleClaw::Formatters[@options.format]

      scraper = EagleClaw::Scrapers[scraper_name.downcase.to_sym].new
      scraper.run
      puts formatter.format(scraper.data, scraper.problems)
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require 'eagleclaw/constmatch'
2
+
3
module EagleClaw
  ##
  # Namespace for scraper classes. The namespace itself can be indexed
  # case-insensitively by constant name via {EagleClaw::ConstantMatcher}.
  module Scrapers
    extend EagleClaw::ConstantMatcher
  end
end
@@ -0,0 +1,15 @@
1
+ require 'digest/sha1'
2
+
3
##
# Convenience extensions to the core `String` class.
class String
  ##
  # Whether this string begins with `string`.
  #
  # The match is anchored with `\A` (start of the whole string), not `^`
  # (start of any line), so text after an embedded newline never counts.
  #
  # @param [String] string the literal prefix to look for.
  # @return [Boolean]
  def starts_with?(string)
    !!(self =~ Regexp.new('\A' + Regexp.escape(string)))
  end

  ##
  # Whether this string ends with `string`.
  #
  # The match is anchored with `\z` (very end of the string), not `$`
  # (end of any line), so a trailing or embedded newline is not ignored.
  #
  # @param [String] string the literal suffix to look for.
  # @return [Boolean]
  def ends_with?(string)
    !!(self =~ Regexp.new(Regexp.escape(string) + '\z'))
  end

  ##
  # The SHA-1 digest of this string.
  #
  # @return [String] the digest as lowercase hexadecimal.
  def sha1
    Digest::SHA1.hexdigest(self)
  end
end
@@ -0,0 +1,38 @@
1
+ require 'nokogiri'
2
+
3
##
# Sibling-walking helpers added to Nokogiri elements.
class Nokogiri::XML::Element

  ##
  # Keep consuming elements until the block returns `true`.
  #
  # @param [Symbol] method
  #   The method to call on the current element to get the next one. The default
  #   is to use `:next_element`. Use `:next` to include text elements in the
  #   iteration.
  #
  # @yield [Nokogiri::XML::Element] element
  #   The current element. If the block returns `true` (or any non-false value)
  #   then this is what the method will return.
  #
  # @return
  #   The first element (starting with the receiver itself) for which the
  #   block returns a true value, or `nil` if `method` returns `nil` before
  #   any element satisfies the block.
  def next_until(method = :next_element)
    current = self
    # Stop on nil rather than calling `method` on it.
    until current.nil? || yield(current)
      current = current.send(method)
    end
    current
  end

  ##
  # Keep consuming elements until the block returns `false`.
  #
  # The behaviour of this method is identical to {#next_until}, only it will
  # keep iterating until the block yields a *false* value instead of a true one.
  #
  # @return
  #   The first element (starting with the receiver itself) for which the
  #   block returns a false value, or `nil` if `method` returns `nil` first.
  #
  # @see #next_until
  def next_while(method = :next_element)
    current = self
    # Stop on nil rather than calling `method` on it.
    while current && yield(current)
      current = current.send(method)
    end
    current
  end
end
data/lib/eagleclaw.rb ADDED
@@ -0,0 +1,139 @@
1
+ require 'eagleclaw/browser'
2
+ require 'eagleclaw/callbacks'
3
+ require 'eagleclaw/formatters'
4
+ require 'eagleclaw/scrapers'
5
+ require 'eagleclaw/string'
6
+ require 'eagleclaw/xml'
7
+
8
module EagleClaw
  ##
  # Base class for scrapers.
  #
  # Subclasses declare properties with {Scraper.prop} and optional hooks
  # with {Scraper.before} / {Scraper.after}; {#run} then executes them in
  # order against an instance.
  class Scraper
    class << self
      include ::EagleClaw::Callbacks

      attr_writer :properties

      ##
      # The names of the properties declared with {prop}, in declaration
      # order.
      #
      # @return [Array<Symbol>] an empty array if none have been declared.
      def properties
        @properties ||= []
      end

      ##
      # Define a pre-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return whatever `register` returns.
      #
      # @overload before(:each, :method_name)
      #   Run the given method before each component of the run.
      # @overload before(:all, :method_name)
      #   Run the given method before the run itself.
      # @overload before(:each, &block)
      #   Run the given block (using `instance_eval`) before each component of
      #   the run.
      # @overload before(:all, &block)
      #   Run the given block (using `instance_eval`) before the run itself.
      #
      # @example Fetch a page before the run
      #   before(:all) do
      #     agent.get("http://google.com/")
      #   end
      #
      # @example Reset the page before each component of the run
      #   before(:each) do
      #     agent.get("http://google.com/")
      #   end
      #
      def before(context, meth = nil, &block)
        register([:before, context], meth, &block)
      end

      ##
      # Define a post-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return whatever `register` returns.
      #
      # @overload after(:each, :method_name)
      #   Run the given method after each component of the run.
      # @overload after(:all, :method_name)
      #   Run the given method after the run itself.
      # @overload after(:each, &block)
      #   Run the given block (using `instance_eval`) after each component of
      #   the run.
      # @overload after(:all, &block)
      #   Run the given block (using `instance_eval`) after the entire run.
      #
      # @see before
      #
      def after(context, meth = nil, &block)
        register([:after, context], meth, &block)
      end

      ##
      # Declare a property and register the method or block that computes
      # it.
      #
      # Declaring the same property again adds another callback to it; the
      # property is still listed (and therefore run) only once.
      #
      # @param [Symbol, String] prop_name the property's name.
      # @param [optional, Symbol] meth name of method to call.
      # @return whatever `register` returns.
      def prop(prop_name, meth = nil, &block)
        name = prop_name.to_sym
        # A duplicate entry would make #run execute every callback of this
        # property once per declaration.
        properties << name unless properties.include?(name)
        register([:property, name], meth, &block)
      end
    end # class << self (Scraper)

    include Browser

    ##
    # An `Array` which holds data collected during a run.
    #
    # @see #initialize
    # @see #reset
    attr_accessor :data

    ##
    # An `Array` for recording problems encountered during a run.
    #
    # @see #initialize
    # @see #reset
    attr_accessor :problems

    ##
    # Create a new {Scraper} instance.
    #
    # By default, just sets {#data @data} and {#problems @problems} to empty
    # `Array`s.
    #
    def initialize
      @data = []
      @problems = []
    end

    ##
    # Reset this scraper instance's state.
    #
    # The default version of this method just clears {#data @data} and
    # {#problems @problems} in place.
    #
    # @return [Array] the (now empty) {#problems} array.
    # @abstract Subclass and extend to reset the scraper state.
    #
    def reset
      data.clear
      problems.clear
    end

    ##
    # Run the scraper.
    #
    # Operating procedure:
    #
    # 1. Run {Scraper.before before(:all)} blocks.
    # 2. For each property (defined with {Scraper.prop prop(:prop_name)}):
    #    1. Run {Scraper.before before(:each)} blocks.
    #    2. Run the property itself.
    #    3. Run {Scraper.after after(:each)} blocks.
    # 3. Run {Scraper.after after(:all)} blocks.
    # 4. Return {#data data}.
    #
    # A scraper with no declared properties just runs the `:all` hooks.
    #
    # @return [Array] {#data}
    #
    # @see #reset
    # @see #data
    def run
      scraper_class = self.class

      scraper_class.run_callbacks([:before, :all], self)
      scraper_class.properties.each do |property|
        scraper_class.run_callbacks([:before, :each], self)
        scraper_class.run_callbacks([:property, property], self)
        scraper_class.run_callbacks([:after, :each], self)
      end
      scraper_class.run_callbacks([:after, :all], self)
      data
    end
  end # class Scraper
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: eagleclaw
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 1
9
+ version: 0.1.1
10
+ platform: ruby
11
+ authors:
12
+ - Zachary Voase
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-05-10 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: mechanize
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 0
30
+ - 0
31
+ version: 1.0.0
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: nokogiri
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 1
43
+ - 4
44
+ - 1
45
+ version: 1.4.1
46
+ type: :runtime
47
+ version_requirements: *id002
48
+ - !ruby/object:Gem::Dependency
49
+ name: rspec
50
+ prerelease: false
51
+ requirement: &id003 !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ segments:
56
+ - 1
57
+ - 3
58
+ - 0
59
+ version: 1.3.0
60
+ type: :development
61
+ version_requirements: *id003
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ prerelease: false
65
+ requirement: &id004 !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 0
71
+ - 5
72
+ - 4
73
+ version: 0.5.4
74
+ type: :development
75
+ version_requirements: *id004
76
+ description: eagleclaw is a small library to help build screen-scrapers
77
+ email: z@zacharyvoase.com
78
+ executables:
79
+ - eagleclaw
80
+ extensions: []
81
+
82
+ extra_rdoc_files: []
83
+
84
+ files:
85
+ - README.md
86
+ - lib/eagleclaw/browser.rb
87
+ - lib/eagleclaw/callbacks.rb
88
+ - lib/eagleclaw/constmatch.rb
89
+ - lib/eagleclaw/formatters/csv.rb
90
+ - lib/eagleclaw/formatters/json.rb
91
+ - lib/eagleclaw/formatters.rb
92
+ - lib/eagleclaw/runner.rb
93
+ - lib/eagleclaw/scrapers.rb
94
+ - lib/eagleclaw/string.rb
95
+ - lib/eagleclaw/xml.rb
96
+ - lib/eagleclaw.rb
97
+ has_rdoc: false
98
+ homepage: http://github.com/zacharyvoase/eagleclaw
99
+ licenses:
100
+ - Public Domain
101
+ post_install_message:
102
+ rdoc_options: []
103
+
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ segments:
111
+ - 1
112
+ - 8
113
+ - 6
114
+ version: 1.8.6
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ requirements: []
123
+
124
+ rubyforge_project:
125
+ rubygems_version: 1.3.6
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: A small screen-scraping library
129
+ test_files: []
130
+