imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
@@ -26,6 +26,10 @@ module IMW
26
26
  add_inputs inputs
27
27
  end
28
28
 
29
+ # FIXME Instead of requiring +new_inputs+ to be either an Array
30
+ # or Hash just iterate through whatever it is using +each+ and
31
+ # see if each element can be interpreted as a mapping between
32
+ # strings.
29
33
  def add_inputs new_inputs
30
34
  @inputs ||= {}
31
35
  if new_inputs.is_a?(Array)
@@ -70,6 +74,10 @@ module IMW
70
74
  @dir ||= File.join(tmp_dir, name.to_s)
71
75
  end
72
76
 
77
+ # FIXME This needs to be made idempotent -- calling prepare
78
+ # twice should not do any work the second time (unless the user
79
+ # is insistent and passes a :force option -- or maybe use bang
80
+ # and not-bang versions of the method for this distinction).
73
81
  def prepare!
74
82
  FileUtils.mkdir_p dir unless File.exist?(dir)
75
83
  inputs.each_pair do |path, basename|
@@ -87,7 +95,17 @@ module IMW
87
95
  end
88
96
  end
89
97
  end
90
-
98
+
99
+ # Package the contents of the temporary directory to an archive
100
+ # at +output+ but return exceptions instead of raising them.
101
+ def package output, options={}
102
+ begin
103
+ package! output, options={}
104
+ rescue RuntimeError => e
105
+ return e
106
+ end
107
+ end
108
+
91
109
  # Package the contents of the temporary directory to an archive
92
110
  # at +output+.
93
111
  def package! output, options={}
@@ -19,6 +19,14 @@ module IMW
19
19
  last_response && last_response.response.class == Net::HTTPOK
20
20
  end
21
21
 
22
+ def upload local_path, remote_path
23
+ begin
24
+ upload! local_path, remote_path
25
+ rescue RuntimeError => e
26
+ return e
27
+ end
28
+ end
29
+
22
30
  def upload! local_path, remote_path
23
31
  @last_response = AWS::S3::S3Object.store(remote_path, open(local_path), bucket_name)
24
32
  end
@@ -1,305 +1,288 @@
1
-
2
-
3
-
4
- #
5
- # h2. lib/imw/parsers/html_parser/matcher.rb -- utility classes for html parser
6
- #
7
- # == About
8
- #
9
- # This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt>
10
- # abstract class and some concrete subclasses which perform specific
11
- # kinds of matches against HTML documents using the
12
- # Hpricot[https://code.whytheluckystiff.net/hpricot/] library.
13
- #
14
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
15
- # Copyright:: Copyright (c) 2008 infochimps.org
16
- # License:: GPL 3.0
17
- # Website:: http://infinitemonkeywrench.org/
18
- #
19
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
20
-
21
1
  require 'imw/utils/extensions/hpricot'
22
2
 
23
3
  module IMW
24
- module HTMLParserMatcher
4
+ module Parsers
5
+ module HtmlMatchers
25
6
 
26
- # An abstract class from which to subclass specific HTML matchers.
27
- #
28
- # A subclass is initialized with a +selector+ and an optional
29
- # +matcher+. The +selector+ is an HTML path specification used to
30
- # collect elements from the document. If initialized with a
31
- # +matcher+, the +matcher+ is used to return match information
32
- # from the elements; else the inner HTML is returned. Subclasses
33
- # decide how the +selector+ will collect elements.
34
- class Matcher
35
-
36
- attr_accessor :selector
37
- attr_accessor :matcher
38
- attr_accessor :options
39
-
40
- def initialize selector, matcher=nil, options={}
41
- self.selector = selector
42
- self.matcher = matcher
43
- self.options = options
44
- end
7
+ # An abstract class from which to subclass specific HTML matchers.
8
+ #
9
+ # A subclass is initialized with a +selector+ and an optional
10
+ # +matcher+. The +selector+ is an HTML path specification used to
11
+ # collect elements from the document. If initialized with a
12
+ # +matcher+, the +matcher+ is used to return match information
13
+ # from the elements; else the inner HTML is returned. Subclasses
14
+ # decide how the +selector+ will collect elements.
15
+ class Matcher
16
+
17
+ attr_accessor :selector
18
+ attr_accessor :matcher
19
+ attr_accessor :options
20
+
21
+ def initialize selector, matcher=nil, options={}
22
+ self.selector = selector
23
+ self.matcher = matcher
24
+ self.options = options
25
+ end
45
26
 
46
- def match doc
47
- raise "Abstract class #{self.class}"
27
+ def match doc
28
+ raise "Abstract class #{self.class}"
29
+ end
30
+
48
31
  end
49
-
50
- end
51
32
 
52
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
53
- # for matching against the first element of a document matching a
54
- # selector.
55
- class MatchFirstElement < Matcher
56
- # Grab the first element from +doc+ matching the +selector+ this
57
- # class was initialized with. If initialized with a +matcher+,
58
- # then return the +matcher+'s match against the first element,
59
- # else just return the inner HTML of the first element.
60
- #
61
- # m = MatchFirstElement.new('span#bio/a.homepage')
62
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
63
- # # => 'My Homepage'
64
- def match doc
65
- doc = Hpricot(doc) if doc.is_a?(String)
66
- el = doc.at(selector) or return nil
67
- if matcher
68
- matcher.match(el)
69
- else
70
- options[:html] ? el.inner_html : el.inner_text.strip
33
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
34
+ # for matching against the first element of a document matching a
35
+ # selector.
36
+ class MatchFirstElement < Matcher
37
+ # Grab the first element from +doc+ matching the +selector+ this
38
+ # class was initialized with. If initialized with a +matcher+,
39
+ # then return the +matcher+'s match against the first element,
40
+ # else just return the stripped inner text of the first element.
41
+ #
42
+ # m = MatchFirstElement.new('span#bio/a.homepage')
43
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
44
+ # # => 'My Homepage'
45
+ def match doc
46
+ doc = Hpricot(doc) if doc.is_a?(String)
47
+ el = doc.at(selector) or return nil
48
+ if matcher
49
+ matcher.match(el)
50
+ else
51
+ options[:html] ? el : el.inner_text.strip
52
+ end
71
53
  end
72
54
  end
73
- end
74
55
 
75
- # FIXME is there really a need for this separate class? why can't
76
- # MatchFirstElement.match accept a block?
77
- class MatchProc < MatchFirstElement
78
- attr_accessor :proc
79
- attr_accessor :options
80
- def initialize selector, proc, matcher=nil, options={}
81
- super selector, matcher
82
- self.options = options
83
- self.proc = proc
84
- end
85
- def match doc
86
- val = super doc
87
- val ? self.proc.call(val) : self.proc.call(doc)
88
- end
89
- end
56
+ # FIXME is there really a need for this separate class? why can't
57
+ # MatchFirstElement.match accept a block?
58
+ class MatchProc < MatchFirstElement
59
+ attr_accessor :proc
60
+ attr_accessor :options
61
+ def initialize selector, proc, matcher=nil, options={}
62
+ super selector, matcher
63
+ self.options = options
64
+ self.proc = proc
65
+ end
66
+ def match doc
67
+ val = super doc
68
+ val ? self.proc.call(val) : self.proc.call(doc)
69
+ end
70
+ end
90
71
 
91
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
92
- # for matching each element of a document matching a selector.
93
- class MatchArray < Matcher
94
- # Grab each element from +doc+ matching the +selector+ this
95
- # class was initialized with. If initialized with a +matcher+,
96
- # then return an array consisting of the +matcher+'s match
97
- # against each element, else just return an array consisting of
98
- # the inner HTML of each element.
99
- #
100
- # m = MatchArray.new('span#bio/a.homepage')
101
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
102
- # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
103
- # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
104
- # # => ["My Homepage", "Your Homepage", "Their Homepage"]
105
- def match doc
106
- doc = Hpricot(doc) if doc.is_a?(String)
107
- subdoc = (doc/selector) or return nil
108
- if matcher
109
- subdoc.map{|el| matcher.match(el)}
110
- else
111
- if options[:html]
112
- subdoc.map{|el| el.inner_html }
72
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
73
+ # for matching each element of a document matching a selector.
74
+ class MatchArray < Matcher
75
+ # Grab each element from +doc+ matching the +selector+ this
76
+ # class was initialized with. If initialized with a +matcher+,
77
+ # then return an array consisting of the +matcher+'s match
78
+ # against each element, else just return an array consisting of
79
+ # the stripped inner text of each element.
80
+ #
81
+ # m = MatchArray.new('span#bio/a.homepage')
82
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
83
+ # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
84
+ # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
85
+ # # => ["My Homepage", "Your Homepage", "Their Homepage"]
86
+ def match doc
87
+ doc = Hpricot(doc) if doc.is_a?(String)
88
+ subdoc = (doc/selector) or return nil
89
+ if matcher
90
+ subdoc.map{|el| matcher.match(el)}
113
91
  else
114
- subdoc.map{|el| el.inner_text.strip }
92
+ if options[:html]
93
+ subdoc.map{|el| el }
94
+ else
95
+ subdoc.map{|el| el.inner_text.strip }
96
+ end
115
97
  end
116
98
  end
117
99
  end
118
- end
119
100
 
120
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
121
- # for matching an attribute of the first element of a document
122
- # matching a selector.
123
- class MatchAttribute < Matcher
101
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
102
+ # for matching an attribute of the first element of a document
103
+ # matching a selector.
104
+ class MatchAttribute < Matcher
124
105
 
125
- attr_accessor :attribute
106
+ attr_accessor :attribute
126
107
 
127
- # Unlike <tt>IMW::HTMLParserMatcher::Matcher</tt>,
128
- # <tt>IMW::HTMLParserMatcher::MatchAttribute</tt> is initialized
129
- # with three arguments: the +selector+ which collects elements
130
- # from an HTML document, an +attribute+ to extract, and
131
- # (optionally) a +matcher+ to perform the matching.
132
- def initialize selector, attribute, matcher=nil
133
- super selector, matcher
134
- self.attribute = attribute.to_s
135
- end
136
-
137
- # Grab the first element from +doc+ matching the +selector+ this
138
- # class was initialized with. If initialized with a +matcher+,
139
- # then return the +matcher+'s match against the value of the
140
- # +attribute+ this class was initialized with, else just return
141
- # the value of the +attribute+.
142
- #
143
- # m = MatchAttribute.new('span#bio/a.homepage', 'href')
144
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
145
- # # => 'http://foo.bar'
146
- def match doc
147
- doc = Hpricot(doc) if doc.is_a?(String)
148
- val = doc.path_attr(selector, attribute)
149
- matcher ? matcher.match(val) : val
108
+ # Unlike <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>,
109
+ # <tt>IMW::Parsers::HtmlMatchers::MatchAttribute</tt> is initialized
110
+ # with three arguments: the +selector+ which collects elements
111
+ # from an HTML document, an +attribute+ to extract, and
112
+ # (optionally) a +matcher+ to perform the matching.
113
+ def initialize selector, attribute, matcher=nil
114
+ super selector, matcher
115
+ self.attribute = attribute.to_s
116
+ end
117
+
118
+ # Grab the first element from +doc+ matching the +selector+ this
119
+ # class was initialized with. If initialized with a +matcher+,
120
+ # then return the +matcher+'s match against the value of the
121
+ # +attribute+ this class was initialized with, else just return
122
+ # the value of the +attribute+.
123
+ #
124
+ # m = MatchAttribute.new('span#bio/a.homepage', 'href')
125
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
126
+ # # => 'http://foo.bar'
127
+ def match doc
128
+ doc = Hpricot(doc) if doc.is_a?(String)
129
+ val = doc.path_attr(selector, attribute)
130
+ matcher ? matcher.match(val) : val
131
+ end
150
132
  end
151
- end
152
133
 
153
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
154
- # for using a regular expression to match against text in an HTML
155
- # document.
156
- class MatchRegexp < Matcher
157
-
158
- attr_accessor :re
159
- attr_accessor :options
134
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
135
+ # for using a regular expression to match against text in an HTML
136
+ # document.
137
+ class MatchRegexp < Matcher
138
+
139
+ attr_accessor :re
140
+ attr_accessor :options
160
141
 
161
- # Use the regular expression +re+ to return captures from the
162
- # elements collected by +selector+ (treated as text) used on an
163
- # HTML document (if +selector+ is +nil+ then match against the
164
- # full text of the document). If the keyword argument
165
- # <tt>:capture</tt> is specified then return the corresponding
166
- # group (indexing is that of regular expressions; "1" is the
167
- # first capture), else return an array of all captures. If
168
- # +matcher+, then use it on the capture(s) before returning.
169
- #
170
- # FIXME Shouldn't the matcher come BEFORE the regexp capture,
171
- # not after?
172
- def initialize selector, re, matcher=nil, options={}
173
- super selector, matcher
174
- self.options = options
175
- self.re = re
176
- end
142
+ # Use the regular expression +re+ to return captures from the
143
+ # elements collected by +selector+ (treated as text) used on an
144
+ # HTML document (if +selector+ is +nil+ then match against the
145
+ # full text of the document). If the keyword argument
146
+ # <tt>:capture</tt> is specified then return the corresponding
147
+ # group (indexing is that of regular expressions; "1" is the
148
+ # first capture), else return an array of all captures. If
149
+ # +matcher+ is given, then use it on the capture(s) before returning.
150
+ #
151
+ # FIXME Shouldn't the matcher come BEFORE the regexp capture,
152
+ # not after?
153
+ def initialize selector, re, matcher=nil, options={}
154
+ super selector, matcher
155
+ self.options = options
156
+ self.re = re
157
+ end
177
158
 
178
- # Grab the first element from +doc+ matching the +selector+ this
179
- # object was initialized with. Use the +re+ and the (optional)
180
- # capture group this object was initialized with to capture a
181
- # string (or array of strings if no capture group was specified)
182
- # from the collected element (treated as text). If initialized
183
- # with a +matcher+, then return the +matcher+'s match against
184
- # the value of the capture(s), else just return the capture(s).
185
- #
186
- # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
187
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
188
- # # => "John Chimpo"
189
- def match doc
190
- doc = Hpricot(doc) if doc.is_a?(String)
191
- el = selector ? doc.contents_of(selector) : doc
192
- m = re.match(el.to_s)
193
- val = case
194
- when m.nil? then nil
195
- when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
196
- else m.captures
197
- end
198
- # pass to matcher, if any
199
- matcher ? matcher.match(val) : val
159
+ # Grab the first element from +doc+ matching the +selector+ this
160
+ # object was initialized with. Use the +re+ and the (optional)
161
+ # capture group this object was initialized with to capture a
162
+ # string (or array of strings if no capture group was specified)
163
+ # from the collected element (treated as text). If initialized
164
+ # with a +matcher+, then return the +matcher+'s match against
165
+ # the value of the capture(s), else just return the capture(s).
166
+ #
167
+ # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
168
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
169
+ # # => "John Chimpo"
170
+ def match doc
171
+ doc = Hpricot(doc) if doc.is_a?(String)
172
+ el = selector ? doc.contents_of(selector) : doc
173
+ m = re.match(el.to_s)
174
+ val = case
175
+ when m.nil? then nil
176
+ when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
177
+ else m.captures
178
+ end
179
+ # pass to matcher, if any
180
+ matcher ? matcher.match(val) : val
181
+ end
200
182
  end
201
- end
202
183
 
203
-
204
- class MatchRegexpRepeatedly < Matcher
205
- attr_accessor :re
206
- def initialize selector, re, matcher=nil
207
- super selector, matcher
208
- self.re = re
209
- end
210
- def match doc
211
- doc = Hpricot(doc) if doc.is_a?(String)
212
- # apply selector, if any
213
- el = selector ? doc.contents_of(selector) : doc
214
- return unless el
215
- # get all matches
216
- val = el.to_s.scan(re)
217
- # if there's only one capture group, flatten the array
218
- val = val.flatten if val.first && val.first.length == 1
219
- # pass to matcher, if any
220
- matcher ? matcher.match(val) : val
184
+
185
+ class MatchRegexpRepeatedly < Matcher
186
+ attr_accessor :re
187
+ def initialize selector, re, matcher=nil
188
+ super selector, matcher
189
+ self.re = re
190
+ end
191
+ def match doc
192
+ doc = Hpricot(doc) if doc.is_a?(String)
193
+ # apply selector, if any
194
+ el = selector ? doc.contents_of(selector) : doc
195
+ return unless el
196
+ # get all matches
197
+ val = el.to_s.scan(re)
198
+ # if there's only one capture group, flatten the array
199
+ val = val.flatten if val.first && val.first.length == 1
200
+ # pass to matcher, if any
201
+ matcher ? matcher.match(val) : val
202
+ end
221
203
  end
222
- end
223
-
224
- # Class for building a hash of values by using appropriate
225
- # matchers against an HTML document.
226
- class MatchHash
204
+
205
+ # Class for building a hash of values by using appropriate
206
+ # matchers against an HTML document.
207
+ class MatchHash
208
+
209
+ attr_accessor :match_hash
227
210
 
228
- attr_accessor :match_hash
211
+ # The +match_hash+ must be a +Hash+ of symbols matched to HTML
212
+ # matchers (subclasses of
213
+ # <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>).
214
+ def initialize match_hash
215
+ # Kludge? maybe.
216
+ raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
217
+ self.match_hash = match_hash
218
+ end
229
219
 
230
- # The +match_hash+ must be a +Hash+ of symbols matched to HTML
231
- # matchers (subclasses of
232
- # <tt>IMW::HTMLParserMatcher::Matcher</tt>).
233
- def initialize match_hash
234
- # Kludge? maybe.
235
- raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
236
- self.match_hash = match_hash
220
+ # Use the +match_hash+ this +MatchHash+ was initialized with to
221
+ # select elements from +doc+ and extract information from them:
222
+ #
223
+ # m = MatchHash.new({
224
+ # :name => MatchFirstElement.new('li/span.customer'),
225
+ # :order_status => MatchAttribute.new('li/ul[@status]','status'),
226
+ # :products => MatchArray.new('li/ul/li')
227
+ # })
228
+ # m.match('<li><span class="customer">John Chimpo</span>
229
+ # <ul status="shipped">
230
+ # <li>bananas</li>
231
+ # <li>mangos</li>
232
+ # <li>banangos</li>
233
+ # </ul></li>')
234
+ # # => {
235
+ # :name => "John Chimpo",
236
+ # :order_status => "shipped",
237
+ # :products => ["bananas", "mangos", "banangos"]
238
+ # }
239
+ def match doc
240
+ doc = Hpricot(doc) if doc.is_a?(String)
241
+ hsh = { }
242
+ match_hash.each do |attr, m|
243
+ val = m.match(doc)
244
+ case attr
245
+ when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
246
+ else hsh[attr] = val end
247
+ end
248
+ self.class.scrub!(hsh)
249
+ end
250
+
251
+ # kill off keys with nil values
252
+ def self.scrub! hsh
253
+ hsh # .reject{|k,v| v.nil? }
254
+ end
237
255
  end
238
256
 
239
- # Use the +match_hash+ this +MatchHash+ was initialized with to
240
- # select elements from +doc+ and extract information from them:
241
257
  #
242
- # m = MatchHash.new({
243
- # :name => MatchFirstElement.new('li/span.customer'),
244
- # :order_status => MatchAttribute.new('li/ul[@status]','status'),
245
- # :products => MatchArray.new('li/ul/li')
246
- # })
247
- # m.match('<li><span class="customer">John Chimpo</span>
248
- # <ul status="shipped">
249
- # <li>bananas</li>
250
- # <li>mangos</li>
251
- # <li>banangos</li>
252
- # </ul></li>')
253
- # # => {
254
- # :name => "John Chimpo",
255
- # :order_status => "shipped",
256
- # :products => ["bananas", "mangos", "banangos"]
257
- # }
258
- def match doc
259
- doc = Hpricot(doc) if doc.is_a?(String)
258
+ # construct the downstream part of a hash matcher
259
+ #
260
+ def self.build_match_hash spec_hash
260
261
  hsh = { }
261
- match_hash.each do |attr, m|
262
- val = m.match(doc)
263
- case attr
264
- when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
265
- else hsh[attr] = val end
262
+ spec_hash.each do |attr, spec|
263
+ hsh[attr] = build_parse_tree(spec)
266
264
  end
267
- self.class.scrub!(hsh)
268
- end
269
-
270
- # kill off keys with nil values
271
- def self.scrub! hsh
272
- hsh # .reject{|k,v| v.nil? }
265
+ hsh
273
266
  end
274
- end
275
-
276
- #
277
- # construct the downstream part of a hash matcher
278
- #
279
- def self.build_match_hash spec_hash
280
- hsh = { }
281
- spec_hash.each do |attr, spec|
282
- hsh[attr] = build_parse_tree(spec)
283
- end
284
- hsh
285
- end
286
267
 
287
- #
288
- # recursively build a tree of matchers
289
- #
290
- def self.build_parse_tree spec
291
- case spec
292
- when nil then nil
293
- when Matcher then spec
294
- when Hash then MatchHash.new(build_match_hash(spec))
295
- when Array then
296
- return nil if spec.empty?
297
- raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
298
- MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
299
- when String then MatchFirstElement.new(spec)
300
- when Proc then MatchProc.new(nil, spec)
301
- when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
302
- else raise "Don't know how to parse #{spec.inspect}"
268
+ #
269
+ # recursively build a tree of matchers
270
+ #
271
+ def self.build_parse_tree spec
272
+ case spec
273
+ when nil then nil
274
+ when Matcher then spec
275
+ when Hash then MatchHash.new(build_match_hash(spec))
276
+ when Array then
277
+ return nil if spec.empty?
278
+ raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
279
+ MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
280
+ when String then MatchFirstElement.new(spec)
281
+ when Proc then MatchProc.new(nil, spec)
282
+ when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
283
+ when Symbol then MatchAttribute.new(nil, spec, nil)
284
+ else raise "Don't know how to parse #{spec.inspect}"
285
+ end
303
286
  end
304
287
  end
305
288
  end