eagleclaw 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,4 @@
1
+ # EagleClaw
2
+
3
+ EagleClaw is a Ruby library for building screen scrapers. It's very much a
4
+ work-in-progress.
data/bin/eagleclaw ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point: build a runner and let it process ARGV.
require 'eagleclaw/runner'

runner = EagleClaw::Runner.new
runner.run!
@@ -0,0 +1,24 @@
1
+ require 'mechanize'
2
+
3
module EagleClaw
  ##
  # Add browsing capabilities to a class.
  #
  # Including classes gain a lazily-created `Mechanize` agent. Any method the
  # including object does not define itself is forwarded to that agent,
  # provided the agent responds to it.
  module Browser
    ##
    # This browser's `Mechanize` agent. It is created on first access and the
    # same instance is returned afterwards.
    #
    # @return [Mechanize]
    #
    # @see http://mechanize.rubyforge.org/mechanize/
    def agent
      @agent ||= Mechanize.new
    end

    ##
    # If a non-existent method is called, pass it along to the `Mechanize`
    # agent. Calls the agent cannot answer either fall through to `super`, so
    # the resulting `NoMethodError` names this object rather than the agent.
    #
    # @param [Symbol] name the missing method's name.
    # @param [Array] args arguments to forward.
    # @return [Object] whatever the agent's method returns.
    # @raise [NoMethodError] if the agent does not respond to `name`.
    def method_missing(name, *args, &block)
      if agent.respond_to?(name)
        agent.send(name, *args, &block)
      else
        super
      end
    end
    # Keep keyword arguments intact when forwarding on Rubies that separate
    # them from positional arguments; a no-op where the hook is unavailable.
    ruby2_keywords(:method_missing) if respond_to?(:ruby2_keywords, true)

    ##
    # Report forwarded agent methods through `respond_to?` so that it agrees
    # with what {#method_missing} will actually handle.
    #
    # @param [Symbol] name the method being queried.
    # @param [Boolean] include_private whether private methods count.
    # @return [Boolean]
    def respond_to_missing?(name, include_private = false)
      agent.respond_to?(name, include_private) || super
    end
  end # module Browser
end # module EagleClaw
@@ -0,0 +1,64 @@
1
module EagleClaw
  ##
  # Mixin providing a small registry of callbacks, grouped by an arbitrary
  # "context" key and stored on the object that mixes this module in.
  module Callbacks
    ##
    # Register a callback for a given context.
    #
    # @param [Object] context
    #   an object (such as a symbol or list of symbols) which refers to a
    #   certain callback context.
    # @param [optional, Symbol, String] meth the name of the callback method.
    # @return [Array] the callbacks registered so far for `context`.
    # @raise [ArgumentError] if neither a method name nor a block is given.
    #
    # @overload register(context, :method_name)
    #   Register a method as a callback.
    # @overload register(context, &block)
    #   Register a block as a callback. A block wins over `meth` if both are
    #   supplied.
    #
    # @example Registering a method as a callback
    #   class MyCLS
    #     include Callbacks
    #
    #     register(:preprocessing, :setup_db)
    #
    #     def setup_db
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    # @example Registering a block as a callback
    #   class MyCLS
    #     include Callbacks
    #
    #     register :preprocessing do
    #       @database = DB.connect("username", "password")
    #     end
    #   end
    #
    def register(context, meth = nil, &block)
      callback = block || meth
      # Reject empty registrations here; otherwise the nil would only blow up
      # later, inside run_callbacks, far from the faulty call.
      raise ArgumentError, "a method name or a block is required" if callback.nil?

      ((@callbacks ||= {})[context] ||= []) << callback
    end

    ##
    # Run the callbacks for a given context, in registration order. Does
    # nothing if no callback was ever registered.
    #
    # @param context
    # @param [optional, Object] recipient the object to run methods/procs on.
    # @return [nil]
    def run_callbacks(context, recipient = self)
      registered = (@callbacks ||= {})[context] || []
      registered.each { |callback| run_proc(callback, recipient) }
      nil
    end

    private

    ##
    # Run a given method or `Proc` on an instance: method names are sent to
    # the recipient, anything else is `instance_eval`ed against it.
    def run_proc(meth, recipient = self)
      if meth.is_a?(Symbol) || meth.is_a?(String)
        recipient.send(meth)
      else
        recipient.instance_eval(&meth)
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module EagleClaw
  ##
  # Mixin to allow an object to act as a case-insensitive hash against its own
  # constants. This should be used with singleton metaclasses, like so:
  #
  #     class MyClass
  #       class << self
  #         include EagleClaw::ConstantMatcher
  #       end
  #     end
  #
  module ConstantMatcher
    ##
    # Retrieve the constant for the given symbol. Matching will be performed
    # case-insensitively; the first matching constant wins.
    #
    # @param [Symbol, String] constant
    #   A symbol (or string) referring to the constant to get.
    # @return [Object] the value of the matching constant.
    # @raise [NameError] if no constant matches.
    #
    # @example Get the JSON EagleClaw formatter
    #   EagleClaw::Formatters[:json] => EagleClaw::Formatters::JSON
    #
    def [](constant)
      wanted = constant.to_s.downcase
      # `constants` yields Strings on Ruby 1.8 and Symbols on later versions,
      # so normalise each name to a String before comparing.
      match = constants.find { |const| const.to_s.downcase == wanted }
      raise NameError, "Constant #{constant.inspect} not found" if match.nil?

      const_get(match)
    end
  end
end
@@ -0,0 +1,19 @@
1
+ require 'csv'
2
+
3
module EagleClaw::Formatters
  ##
  # Formatter which renders scraped data as CSV text.
  module CSV
    include ::EagleClaw::Formatter

    ##
    # Render `data` as CSV: one header row followed by one row per datum.
    #
    # The header is taken from the keys of the first datum, and every row is
    # built by looking those same keys up in each datum.
    #
    # @param [Array] data
    #   the rows to render; each element must respond to `[]`, and the first
    #   must respond to `keys` (presumably Hashes -- confirm against callers).
    # @param problems unused by this formatter; accepted so that all
    #   formatters share one signature.
    # @return [String] the CSV document, or an empty string for empty data.
    def self.format(data, problems)
      # Without a first row there is nothing to derive a header from.
      return '' if data.empty?

      keys = data.first.keys
      rows = [keys] + data.map { |datum| keys.map { |key| datum[key] } }

      if defined?(::CSV::Writer)
        # Ruby 1.8's csv library: rows are written through CSV::Writer.
        buffer = StringIO.new
        ::CSV::Writer.generate(buffer) do |out|
          rows.each { |row| out << row }
        end
        buffer.string
      else
        # Ruby 1.9+ (FasterCSV-derived library): CSV::Writer no longer
        # exists; CSV.generate builds and returns the String directly.
        ::CSV.generate do |out|
          rows.each { |row| out << row }
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ gem 'json', '>= 1.4.3'
3
+ require 'json'
4
+
5
module EagleClaw::Formatters
  ##
  # Formatter which renders a run's results as a single JSON document.
  module JSON
    include ::EagleClaw::Formatter

    ##
    # Serialise the collected data and problems together.
    #
    # @param data the scraped data, stored under the `"data"` key.
    # @param problems the recorded problems, stored under `"problems"`.
    # @return [String] the JSON text.
    def self.format(data, problems)
      payload = { "data" => data, "problems" => problems }
      payload.to_json
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'eagleclaw/constmatch'
2
+
3
module EagleClaw
  ##
  # Marker module for formatters. It defines nothing yet; formatters include
  # it so that they pick up any shared behaviour added here later.
  module Formatter
  end

  ##
  # Namespace for the bundled formatters. They are loaded on first use, and
  # can be looked up case-insensitively, e.g. `Formatters[:json]`.
  module Formatters
    extend EagleClaw::ConstantMatcher

    autoload :JSON, 'eagleclaw/formatters/json'
    autoload :CSV,  'eagleclaw/formatters/csv'
  end
end
@@ -0,0 +1,50 @@
1
+ require 'optparse'
2
+ require 'ostruct'
3
+
4
module EagleClaw
  ##
  # Command-line front end: parses options, loads any requested libraries,
  # runs the named scraper and prints its formatted results.
  class Runner
    ##
    # Parsed settings: `require` (libraries to load) and `format` (output
    # formatter name as a lower-case symbol).
    attr_accessor :options

    ##
    # Start with no extra libraries and JSON output.
    def initialize
      @options = OpenStruct.new(:require => [], :format => :json)
    end

    ##
    # The option parser, built once and then reused.
    #
    # @return [OptionParser]
    def parser
      @parser ||= OptionParser.new do |opts|
        opts.banner = "Usage: #{$0} [options] ScraperName"

        # ---- MAIN ----

        opts.on('-r', '--require LIBRARY',
                "Require LIBRARY before loading scrapers") do |lib|
          options.require << lib
        end

        opts.on('-o', '--output FORMAT',
                "Output data in format FORMAT (default: JSON)") do |format|
          options.format = format.downcase.to_sym
        end

        # ---- TAIL ----

        opts.on_tail('-h', '--help', "Show this help message") do
          # `puts` returns nil, so these must be separate statements:
          # chaining them with `and` would never reach `exit`.
          puts opts
          exit
        end
      end
    end

    ##
    # Parse command-line switches, removing them from `args` in place.
    #
    # @param [Array<String>] args the argument list (defaults to `ARGV`).
    # @return [Array<String>] the remaining, non-option arguments.
    def parse!(args = ARGV)
      parser.parse!(args)
    end

    ##
    # Parse `args`, run the scraper named by the first remaining argument
    # and print the formatted result to standard output.
    #
    # Aborts with the usage text (non-zero exit status) when no scraper name
    # is given.
    #
    # @param [Array<String>] args the argument list (defaults to `ARGV`).
    # @raise [NameError] if the formatter or scraper cannot be found.
    def run!(args = ARGV)
      parse!(args)

      scraper_name = args.shift
      abort(parser.to_s) if scraper_name.nil?

      @options.require.each { |lib| require(lib) }
      formatter = EagleClaw::Formatters[@options.format]

      scraper = EagleClaw::Scrapers[scraper_name.downcase.to_sym].new
      scraper.run
      puts formatter.format(scraper.data, scraper.problems)
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require 'eagleclaw/constmatch'
2
+
3
module EagleClaw
  ##
  # Namespace in which scrapers are looked up by name. Constants defined
  # here can be fetched case-insensitively, e.g. `Scrapers[:my_scraper]`.
  module Scrapers
    extend EagleClaw::ConstantMatcher
  end
end
@@ -0,0 +1,15 @@
1
+ require 'digest/sha1'
2
+
3
##
# Convenience extensions to the core `String` class.
class String
  ##
  # Does this string begin with `string`?
  #
  # The match is anchored to the very start of the receiver (`\A`), not to
  # the start of any line within it.
  #
  # @param [String] string the prefix to look for (matched literally).
  # @return [Boolean]
  def starts_with?(string)
    !!(self =~ Regexp.new('\A' + Regexp.escape(string)))
  end

  ##
  # Does this string end with `string`?
  #
  # The match is anchored to the very end of the receiver (`\z`), so a
  # trailing newline or an earlier line ending does not count.
  #
  # @param [String] string the suffix to look for (matched literally).
  # @return [Boolean]
  def ends_with?(string)
    !!(self =~ Regexp.new(Regexp.escape(string) + '\z'))
  end

  ##
  # The SHA-1 digest of this string.
  #
  # @return [String] the digest as lower-case hexadecimal.
  def sha1
    Digest::SHA1.hexdigest(self)
  end
end
@@ -0,0 +1,38 @@
1
+ require 'nokogiri'
2
+
3
class Nokogiri::XML::Element

  ##
  # Keep consuming elements until the block returns `true`.
  #
  # Iteration starts with this element itself. If the chain of elements runs
  # out before the block is satisfied, `nil` is returned; the block is never
  # called with `nil`.
  #
  # @param [Symbol] method
  #   The method to call on the current element to get the next one. The default
  #   is to use `:next_element`. Use `:next` to include text elements in the
  #   iteration.
  #
  # @yield [Nokogiri::XML::Element] element
  #   The current element. If the block returns `true` (or any non-false value)
  #   then this is what the method will return.
  #
  # @return [Nokogiri::XML::Element, nil]
  #   the first element accepted by the block, or `nil` if there is none.
  def next_until(method = :next_element)
    current = self
    # Stop at the end of the chain instead of calling `send` on nil.
    until current.nil? || yield(current)
      current = current.send(method)
    end
    current
  end

  ##
  # Keep consuming elements until the block returns `false`.
  #
  # The behaviour of this method is identical to {#next_until}, only it will
  # keep iterating until the block yields a *false* value instead of a true one.
  #
  # @return [Nokogiri::XML::Element, nil]
  #   the first element rejected by the block, or `nil` if every element up
  #   to the end of the chain was accepted.
  #
  # @see #next_until
  def next_while(method = :next_element)
    current = self
    # Stop at the end of the chain instead of calling `send` on nil.
    while current && yield(current)
      current = current.send(method)
    end
    current
  end
end
data/lib/eagleclaw.rb ADDED
@@ -0,0 +1,139 @@
1
+ require 'eagleclaw/browser'
2
+ require 'eagleclaw/callbacks'
3
+ require 'eagleclaw/formatters'
4
+ require 'eagleclaw/scrapers'
5
+ require 'eagleclaw/string'
6
+ require 'eagleclaw/xml'
7
+
8
module EagleClaw
  ##
  # Base class for scrapers. Subclasses declare properties and before/after
  # hooks at class level; {#run} then executes them against an instance.
  class Scraper
    class << self
      include ::EagleClaw::Callbacks

      attr_writer :properties

      ##
      # Names of the properties declared with {prop}, in declaration order.
      #
      # @return [Array<Symbol>] empty until a property has been declared.
      def properties
        @properties ||= []
      end

      ##
      # Define a pre-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return [Array] the callbacks registered so far for this context.
      #
      # @overload before(:each, :method_name)
      #   Run the given method before each component of the run.
      # @overload before(:all, :method_name)
      #   Run the given method before the run itself.
      # @overload before(:each, &block)
      #   Run the given block (using `instance_eval`) before each component of
      #   the run.
      # @overload before(:all, &block)
      #   Run the given block (using `instance_eval`) before the run itself.
      #
      # @example Fetch a page before the run
      #   before(:all) do
      #     agent.get("http://google.com/")
      #   end
      #
      # @example Reset the page before each component of the run
      #   before(:each) do
      #     agent.get("http://google.com/")
      #   end
      #
      def before(context, meth = nil, &block)
        register([:before, context], meth, &block)
      end

      ##
      # Define a post-processor to run in a certain context.
      #
      # @param [Symbol] context either `:each` or `:all`.
      # @param [optional, Symbol] meth name of method to call.
      # @return [Array] the callbacks registered so far for this context.
      #
      # @overload after(:each, :method_name)
      #   Run the given method after each component of the run.
      # @overload after(:all, :method_name)
      #   Run the given method after the run itself.
      # @overload after(:each, &block)
      #   Run the given block (using `instance_eval`) after each component of
      #   the run.
      # @overload after(:all, &block)
      #   Run the given block (using `instance_eval`) after the entire run.
      #
      # @see before
      #
      def after(context, meth = nil, &block)
        register([:after, context], meth, &block)
      end

      ##
      # Declare a property: a named component of the run, implemented by a
      # method or a block.
      #
      # Declaring the same name again adds another callback for that
      # property; the property is still visited only once per run.
      #
      # @param [Symbol, String] prop_name the property's name.
      # @param [optional, Symbol] meth name of method to call.
      # @return [Array] the callbacks registered so far for this property.
      def prop(prop_name, meth = nil, &block)
        name = prop_name.to_sym
        # Listing a name twice would make #run execute all of its callbacks
        # twice, so each name is recorded only once.
        properties << name unless properties.include?(name)
        register([:property, name], meth, &block)
      end
    end # class << self (Scraper)

    include Browser

    ##
    # An `Array` which holds data collected during a run.
    #
    # @see #initialize
    # @see #reset
    attr_accessor :data

    ##
    # An `Array` for recording problems encountered during a run. Nothing in
    # this class appends to it; presumably property callbacks do.
    attr_accessor :problems

    ##
    # Create a new {Scraper} instance.
    #
    # By default, just sets {#data @data} and {#problems @problems} to empty
    # `Array`s.
    #
    def initialize
      @data = []
      @problems = []
    end

    ##
    # Reset this scraper instance's state.
    #
    # The default version of this method just clears {#data @data} and
    # {#problems @problems}.
    #
    # @return [nil]
    # @abstract Subclass and extend to reset the scraper state.
    #
    def reset
      data.clear
      problems.clear
      nil
    end

    ##
    # Run the scraper.
    #
    # Operating procedure:
    #
    # 1. Run {Scraper.before before(:all)} blocks.
    # 2. For each property (defined with {Scraper.prop prop(:prop_name)}):
    #     1. Run {Scraper.before before(:each)} blocks.
    #     2. Run the property itself.
    #     3. Run {Scraper.after after(:each)} blocks.
    # 3. Run {Scraper.after after(:all)} blocks.
    # 4. Return {#data data}.
    #
    # A scraper with no declared properties simply runs its `:all` hooks.
    #
    # @return [Array] the collected {#data data}.
    #
    # @see #reset
    # @see #data
    def run
      scraper_class = self.class

      scraper_class.run_callbacks([:before, :all], self)
      scraper_class.properties.each do |property|
        scraper_class.run_callbacks([:before, :each], self)
        scraper_class.run_callbacks([:property, property], self)
        scraper_class.run_callbacks([:after, :each], self)
      end
      scraper_class.run_callbacks([:after, :all], self)
      data
    end
  end # class Scraper
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: eagleclaw
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 1
9
+ version: 0.1.1
10
+ platform: ruby
11
+ authors:
12
+ - Zachary Voase
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-05-10 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: mechanize
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 0
30
+ - 0
31
+ version: 1.0.0
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: nokogiri
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 1
43
+ - 4
44
+ - 1
45
+ version: 1.4.1
46
+ type: :runtime
47
+ version_requirements: *id002
48
+ - !ruby/object:Gem::Dependency
49
+ name: rspec
50
+ prerelease: false
51
+ requirement: &id003 !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ segments:
56
+ - 1
57
+ - 3
58
+ - 0
59
+ version: 1.3.0
60
+ type: :development
61
+ version_requirements: *id003
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ prerelease: false
65
+ requirement: &id004 !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 0
71
+ - 5
72
+ - 4
73
+ version: 0.5.4
74
+ type: :development
75
+ version_requirements: *id004
76
+ description: eagleclaw is a small library to help build screen-scrapers
77
+ email: z@zacharyvoase.com
78
+ executables:
79
+ - eagleclaw
80
+ extensions: []
81
+
82
+ extra_rdoc_files: []
83
+
84
+ files:
85
+ - README.md
86
+ - lib/eagleclaw/browser.rb
87
+ - lib/eagleclaw/callbacks.rb
88
+ - lib/eagleclaw/constmatch.rb
89
+ - lib/eagleclaw/formatters/csv.rb
90
+ - lib/eagleclaw/formatters/json.rb
91
+ - lib/eagleclaw/formatters.rb
92
+ - lib/eagleclaw/runner.rb
93
+ - lib/eagleclaw/scrapers.rb
94
+ - lib/eagleclaw/string.rb
95
+ - lib/eagleclaw/xml.rb
96
+ - lib/eagleclaw.rb
97
+ has_rdoc: false
98
+ homepage: http://github.com/zacharyvoase/eagleclaw
99
+ licenses:
100
+ - Public Domain
101
+ post_install_message:
102
+ rdoc_options: []
103
+
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ segments:
111
+ - 1
112
+ - 8
113
+ - 6
114
+ version: 1.8.6
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ requirements: []
123
+
124
+ rubyforge_project:
125
+ rubygems_version: 1.3.6
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: A small screen-scraping library
129
+ test_files: []
130
+