imw 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
@@ -26,6 +26,10 @@ module IMW
26
26
  add_inputs inputs
27
27
  end
28
28
 
29
+ # FIXME Instead of requiring +new_inputs+ to be either an Array
30
+ # or Hash just iterate through whatever it is using +each+ and
31
+ # see if each item it yields can be interpreted as a mapping between
32
+ # strings.
29
33
  def add_inputs new_inputs
30
34
  @inputs ||= {}
31
35
  if new_inputs.is_a?(Array)
@@ -70,6 +74,10 @@ module IMW
70
74
  @dir ||= File.join(tmp_dir, name.to_s)
71
75
  end
72
76
 
77
+ # FIXME This needs to be made idempotent -- calling prepare
78
+ # twice should not do any work the second time (unless the user
79
+ # is insistent and passes a :force option -- or maybe use bang
80
+ # and not-bang versions of the method for this distinction).
73
81
  def prepare!
74
82
  FileUtils.mkdir_p dir unless File.exist?(dir)
75
83
  inputs.each_pair do |path, basename|
@@ -87,7 +95,17 @@ module IMW
87
95
  end
88
96
  end
89
97
  end
90
-
98
+
99
+ # Package the contents of the temporary directory to an archive
100
+ # at +output+ but return exceptions instead of raising them.
101
+ def package output, options={}
102
+ begin
103
+ package! output, options={}
104
+ rescue RuntimeError => e
105
+ return e
106
+ end
107
+ end
108
+
91
109
  # Package the contents of the temporary directory to an archive
92
110
  # at +output+.
93
111
  def package! output, options={}
@@ -19,6 +19,14 @@ module IMW
19
19
  last_response && last_response.response.class == Net::HTTPOK
20
20
  end
21
21
 
22
+ def upload local_path, remote_path
23
+ begin
24
+ upload! local_path, remote_path
25
+ rescue RuntimeError => e
26
+ return e
27
+ end
28
+ end
29
+
22
30
  def upload! local_path, remote_path
23
31
  @last_response = AWS::S3::S3Object.store(remote_path, open(local_path), bucket_name)
24
32
  end
@@ -1,305 +1,288 @@
1
-
2
-
3
-
4
- #
5
- # h2. lib/imw/parsers/html_parser/matcher.rb -- utility classes for html parser
6
- #
7
- # == About
8
- #
9
- # This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt>
10
- # abstract class and some concrete subclasses which perform specific
11
- # kinds of matches against HTML documents using the
12
- # Hpricot[https://code.whytheluckystiff.net/hpricot/] library.
13
- #
14
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
15
- # Copyright:: Copyright (c) 2008 infochimps.org
16
- # License:: GPL 3.0
17
- # Website:: http://infinitemonkeywrench.org/
18
- #
19
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
20
-
21
1
  require 'imw/utils/extensions/hpricot'
22
2
 
23
3
  module IMW
24
- module HTMLParserMatcher
4
+ module Parsers
5
+ module HtmlMatchers
25
6
 
26
- # An abstract class from which to subclass specific HTML matchers.
27
- #
28
- # A subclass is initialized with a +selector+ and an optional
29
- # +matcher+. The +selector+ is an HTML path specification used to
30
- # collect elements from the document. If initialized with a
31
- # +matcher+, the +matcher+ is used to return match information
32
- # from the elements; else the inner HTML is returned. Subclasses
33
- # decide how the +selector+ will collect elements.
34
- class Matcher
35
-
36
- attr_accessor :selector
37
- attr_accessor :matcher
38
- attr_accessor :options
39
-
40
- def initialize selector, matcher=nil, options={}
41
- self.selector = selector
42
- self.matcher = matcher
43
- self.options = options
44
- end
7
+ # An abstract class from which to subclass specific HTML matchers.
8
+ #
9
+ # A subclass is initialized with a +selector+ and an optional
10
+ # +matcher+. The +selector+ is an HTML path specification used to
11
+ # collect elements from the document. If initialized with a
12
+ # +matcher+, the +matcher+ is used to return match information
13
+ # from the elements; else the inner HTML is returned. Subclasses
14
+ # decide how the +selector+ will collect elements.
15
+ class Matcher
16
+
17
+ attr_accessor :selector
18
+ attr_accessor :matcher
19
+ attr_accessor :options
20
+
21
+ def initialize selector, matcher=nil, options={}
22
+ self.selector = selector
23
+ self.matcher = matcher
24
+ self.options = options
25
+ end
45
26
 
46
- def match doc
47
- raise "Abstract class #{self.class}"
27
+ def match doc
28
+ raise "Abstract class #{self.class}"
29
+ end
30
+
48
31
  end
49
-
50
- end
51
32
 
52
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
53
- # for matching against the first element of a document matching a
54
- # selector.
55
- class MatchFirstElement < Matcher
56
- # Grab the first element from +doc+ matching the +selector+ this
57
- # class was initialized with. If initialized with a +matcher+,
58
- # then return the +matcher+'s match against the first element,
59
- # else just return the inner HTML of the first element.
60
- #
61
- # m = MatchFirstElement.new('span#bio/a.homepage')
62
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
63
- # # => 'My Homepage'
64
- def match doc
65
- doc = Hpricot(doc) if doc.is_a?(String)
66
- el = doc.at(selector) or return nil
67
- if matcher
68
- matcher.match(el)
69
- else
70
- options[:html] ? el.inner_html : el.inner_text.strip
33
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
34
+ # for matching against the first element of a document matching a
35
+ # selector.
36
+ class MatchFirstElement < Matcher
37
+ # Grab the first element from +doc+ matching the +selector+ this
38
+ # class was initialized with. If initialized with a +matcher+,
39
+ # then return the +matcher+'s match against the first element,
40
+ # else return the first element's stripped inner text (or the element itself if options[:html] is set).
41
+ #
42
+ # m = MatchFirstElement.new('span#bio/a.homepage')
43
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
44
+ # # => 'My Homepage'
45
+ def match doc
46
+ doc = Hpricot(doc) if doc.is_a?(String)
47
+ el = doc.at(selector) or return nil
48
+ if matcher
49
+ matcher.match(el)
50
+ else
51
+ options[:html] ? el : el.inner_text.strip
52
+ end
71
53
  end
72
54
  end
73
- end
74
55
 
75
- # FIXME is there really a need for this separate class? why can't
76
- # MatchFirstElement.match accept a block?
77
- class MatchProc < MatchFirstElement
78
- attr_accessor :proc
79
- attr_accessor :options
80
- def initialize selector, proc, matcher=nil, options={}
81
- super selector, matcher
82
- self.options = options
83
- self.proc = proc
84
- end
85
- def match doc
86
- val = super doc
87
- val ? self.proc.call(val) : self.proc.call(doc)
88
- end
89
- end
56
+ # FIXME is there really a need for this separate class? why can't
57
+ # MatchFirstElement.match accept a block?
58
+ class MatchProc < MatchFirstElement
59
+ attr_accessor :proc
60
+ attr_accessor :options
61
+ def initialize selector, proc, matcher=nil, options={}
62
+ super selector, matcher
63
+ self.options = options
64
+ self.proc = proc
65
+ end
66
+ def match doc
67
+ val = super doc
68
+ val ? self.proc.call(val) : self.proc.call(doc)
69
+ end
70
+ end
90
71
 
91
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
92
- # for matching each element of a document matching a selector.
93
- class MatchArray < Matcher
94
- # Grab each element from +doc+ matching the +selector+ this
95
- # class was initialized with. If initialized with a +matcher+,
96
- # then return an array consisting of the +matcher+'s match
97
- # against each element, else just return an array consisting of
98
- # the inner HTML of each element.
99
- #
100
- # m = MatchArray.new('span#bio/a.homepage')
101
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
102
- # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
103
- # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
104
- # # => ["My Homepage", "Your Homepage", "Their Homepage"]
105
- def match doc
106
- doc = Hpricot(doc) if doc.is_a?(String)
107
- subdoc = (doc/selector) or return nil
108
- if matcher
109
- subdoc.map{|el| matcher.match(el)}
110
- else
111
- if options[:html]
112
- subdoc.map{|el| el.inner_html }
72
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
73
+ # for matching each element of a document matching a selector.
74
+ class MatchArray < Matcher
75
+ # Grab each element from +doc+ matching the +selector+ this
76
+ # class was initialized with. If initialized with a +matcher+,
77
+ # then return an array consisting of the +matcher+'s match
78
+ # against each element, else just return an array consisting of
79
+ # the stripped inner text of each element (or the elements themselves if options[:html] is set).
80
+ #
81
+ # m = MatchArray.new('span#bio/a.homepage')
82
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
83
+ # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
84
+ # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
85
+ # # => ["My Homepage", "Your Homepage", "Their Homepage"]
86
+ def match doc
87
+ doc = Hpricot(doc) if doc.is_a?(String)
88
+ subdoc = (doc/selector) or return nil
89
+ if matcher
90
+ subdoc.map{|el| matcher.match(el)}
113
91
  else
114
- subdoc.map{|el| el.inner_text.strip }
92
+ if options[:html]
93
+ subdoc.map{|el| el }
94
+ else
95
+ subdoc.map{|el| el.inner_text.strip }
96
+ end
115
97
  end
116
98
  end
117
99
  end
118
- end
119
100
 
120
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
121
- # for matching an attribute of the first element of a document
122
- # matching a selector.
123
- class MatchAttribute < Matcher
101
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
102
+ # for matching an attribute of the first element of a document
103
+ # matching a selector.
104
+ class MatchAttribute < Matcher
124
105
 
125
- attr_accessor :attribute
106
+ attr_accessor :attribute
126
107
 
127
- # Unlike <tt>IMW::HTMLParserMatcher::Matcher</tt>,
128
- # <tt>IMW::HTMLParserMatcher::MatchAttribute</tt> is initialized
129
- # with three arguments: the +selector+ which collects elements
130
- # from an HTML document, an +attribute+ to extract, and
131
- # (optionally) a +matcher+ to perform the matching.
132
- def initialize selector, attribute, matcher=nil
133
- super selector, matcher
134
- self.attribute = attribute.to_s
135
- end
136
-
137
- # Grab the first element from +doc+ matching the +selector+ this
138
- # class was initialized with. If initialized with a +matcher+,
139
- # then return the +matcher+'s match against the value of the
140
- # +attribute+ this class was initialized with, else just return
141
- # the value of the +attribute+.
142
- #
143
- # m = MatchAttribute.new('span#bio/a.homepage', 'href')
144
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
145
- # # => 'http://foo.bar'
146
- def match doc
147
- doc = Hpricot(doc) if doc.is_a?(String)
148
- val = doc.path_attr(selector, attribute)
149
- matcher ? matcher.match(val) : val
108
+ # Unlike <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>,
109
+ # <tt>IMW::Parsers::HtmlMatchers::MatchAttribute</tt> is initialized
110
+ # with three arguments: the +selector+ which collects elements
111
+ # from an HTML document, an +attribute+ to extract, and
112
+ # (optionally) a +matcher+ to perform the matching.
113
+ def initialize selector, attribute, matcher=nil
114
+ super selector, matcher
115
+ self.attribute = attribute.to_s
116
+ end
117
+
118
+ # Grab the first element from +doc+ matching the +selector+ this
119
+ # class was initialized with. If initialized with a +matcher+,
120
+ # then return the +matcher+'s match against the value of the
121
+ # +attribute+ this class was initialized with, else just return
122
+ # the value of the +attribute+.
123
+ #
124
+ # m = MatchAttribute.new('span#bio/a.homepage', 'href')
125
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
126
+ # # => 'http://foo.bar'
127
+ def match doc
128
+ doc = Hpricot(doc) if doc.is_a?(String)
129
+ val = doc.path_attr(selector, attribute)
130
+ matcher ? matcher.match(val) : val
131
+ end
150
132
  end
151
- end
152
133
 
153
- # Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
154
- # for using a regular expression to match against text in an HTML
155
- # document.
156
- class MatchRegexp < Matcher
157
-
158
- attr_accessor :re
159
- attr_accessor :options
134
+ # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
135
+ # for using a regular expression to match against text in an HTML
136
+ # document.
137
+ class MatchRegexp < Matcher
138
+
139
+ attr_accessor :re
140
+ attr_accessor :options
160
141
 
161
- # Use the regular expression +re+ to return captures from the
162
- # elements collected by +selector+ (treated as text) used on an
163
- # HTML document (if +selector+ is +nil+ then match against the
164
- # full text of the document). If the keyword argument
165
- # <tt>:capture</tt> is specified then return the corresponding
166
- # group (indexing is that of regular expressions; "1" is the
167
- # first capture), else return an array of all captures. If
168
- # +matcher+, then use it on the capture(s) before returning.
169
- #
170
- # FIXME Shouldn't the matcher come BEFORE the regexp capture,
171
- # not after?
172
- def initialize selector, re, matcher=nil, options={}
173
- super selector, matcher
174
- self.options = options
175
- self.re = re
176
- end
142
+ # Use the regular expression +re+ to return captures from the
143
+ # elements collected by +selector+ (treated as text) used on an
144
+ # HTML document (if +selector+ is +nil+ then match against the
145
+ # full text of the document). If the keyword argument
146
+ # <tt>:capture</tt> is specified then return the corresponding
147
+ # group (indexing is that of regular expressions; "1" is the
148
+ # first capture), else return an array of all captures. If
149
+ # a +matcher+ was given, use it on the capture(s) before returning.
150
+ #
151
+ # FIXME Shouldn't the matcher come BEFORE the regexp capture,
152
+ # not after?
153
+ def initialize selector, re, matcher=nil, options={}
154
+ super selector, matcher
155
+ self.options = options
156
+ self.re = re
157
+ end
177
158
 
178
- # Grab the first element from +doc+ matching the +selector+ this
179
- # object was initialized with. Use the +re+ and the (optional)
180
- # capture group this object was initialized with to capture a
181
- # string (or array of strings if no capture group was specified)
182
- # from the collected element (treated as text). If initialized
183
- # with a +matcher+, then return the +matcher+'s match against
184
- # the value of the capture(s), else just return the capture(s).
185
- #
186
- # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
187
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
188
- # # => "John Chimpo"
189
- def match doc
190
- doc = Hpricot(doc) if doc.is_a?(String)
191
- el = selector ? doc.contents_of(selector) : doc
192
- m = re.match(el.to_s)
193
- val = case
194
- when m.nil? then nil
195
- when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
196
- else m.captures
197
- end
198
- # pass to matcher, if any
199
- matcher ? matcher.match(val) : val
159
+ # Grab the first element from +doc+ matching the +selector+ this
160
+ # object was initialized with. Use the +re+ and the (optional)
161
+ # capture group this object was initialized with to capture a
162
+ # string (or array of strings if no capture group was specified)
163
+ # from the collected element (treated as text). If initialized
164
+ # with a +matcher+, then return the +matcher+'s match against
165
+ # the value of the capture(s), else just return the capture(s).
166
+ #
167
+ # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
168
+ # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
169
+ # # => "John Chimpo"
170
+ def match doc
171
+ doc = Hpricot(doc) if doc.is_a?(String)
172
+ el = selector ? doc.contents_of(selector) : doc
173
+ m = re.match(el.to_s)
174
+ val = case
175
+ when m.nil? then nil
176
+ when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
177
+ else m.captures
178
+ end
179
+ # pass to matcher, if any
180
+ matcher ? matcher.match(val) : val
181
+ end
200
182
  end
201
- end
202
183
 
203
-
204
- class MatchRegexpRepeatedly < Matcher
205
- attr_accessor :re
206
- def initialize selector, re, matcher=nil
207
- super selector, matcher
208
- self.re = re
209
- end
210
- def match doc
211
- doc = Hpricot(doc) if doc.is_a?(String)
212
- # apply selector, if any
213
- el = selector ? doc.contents_of(selector) : doc
214
- return unless el
215
- # get all matches
216
- val = el.to_s.scan(re)
217
- # if there's only one capture group, flatten the array
218
- val = val.flatten if val.first && val.first.length == 1
219
- # pass to matcher, if any
220
- matcher ? matcher.match(val) : val
184
+
185
+ class MatchRegexpRepeatedly < Matcher
186
+ attr_accessor :re
187
+ def initialize selector, re, matcher=nil
188
+ super selector, matcher
189
+ self.re = re
190
+ end
191
+ def match doc
192
+ doc = Hpricot(doc) if doc.is_a?(String)
193
+ # apply selector, if any
194
+ el = selector ? doc.contents_of(selector) : doc
195
+ return unless el
196
+ # get all matches
197
+ val = el.to_s.scan(re)
198
+ # if there's only one capture group, flatten the array
199
+ val = val.flatten if val.first && val.first.length == 1
200
+ # pass to matcher, if any
201
+ matcher ? matcher.match(val) : val
202
+ end
221
203
  end
222
- end
223
-
224
- # Class for building a hash of values by using appropriate
225
- # matchers against an HTML document.
226
- class MatchHash
204
+
205
+ # Class for building a hash of values by using appropriate
206
+ # matchers against an HTML document.
207
+ class MatchHash
208
+
209
+ attr_accessor :match_hash
227
210
 
228
- attr_accessor :match_hash
211
+ # The +match_hash+ must be a +Hash+ of symbols matched to HTML
212
+ # matchers (subclasses of
213
+ # <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>).
214
+ def initialize match_hash
215
+ # Kludge? maybe.
216
+ raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
217
+ self.match_hash = match_hash
218
+ end
229
219
 
230
- # The +match_hash+ must be a +Hash+ of symbols matched to HTML
231
- # matchers (subclasses of
232
- # <tt>IMW::HTMLParserMatcher::Matcher</tt>).
233
- def initialize match_hash
234
- # Kludge? maybe.
235
- raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
236
- self.match_hash = match_hash
220
+ # Use the +match_hash+ this +MatchHash+ was initialized with to
221
+ # select elements from +doc+ and extract information from them:
222
+ #
223
+ # m = MatchHash.new({
224
+ # :name => MatchFirstElement.new('li/span.customer'),
225
+ # :order_status => MatchAttribute.new('li/ul[@status]','status'),
226
+ # :products => MatchArray.new('li/ul/li')
227
+ # })
228
+ # m.match('<li><span class="customer">John Chimpo</span>
229
+ # <ul status="shipped">
230
+ # <li>bananas</li>
231
+ # <li>mangos</li>
232
+ # <li>banangos</li>
233
+ # </ul></li>')
234
+ # # => {
235
+ # :name => "John Chimpo",
236
+ # :order_status => "shipped",
237
+ # :products => ["bananas", "mangos", "banangos"]
238
+ # }
239
+ def match doc
240
+ doc = Hpricot(doc) if doc.is_a?(String)
241
+ hsh = { }
242
+ match_hash.each do |attr, m|
243
+ val = m.match(doc)
244
+ case attr
245
+ when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
246
+ else hsh[attr] = val end
247
+ end
248
+ self.class.scrub!(hsh)
249
+ end
250
+
251
+ # intended to kill off keys with nil values; currently a no-op that returns +hsh+ unchanged
252
+ def self.scrub! hsh
253
+ hsh # .reject{|k,v| v.nil? }
254
+ end
237
255
  end
238
256
 
239
- # Use the +match_hash+ this +MatchHash+ was initialized with to
240
- # select elements from +doc+ and extract information from them:
241
257
  #
242
- # m = MatchHash.new({
243
- # :name => MatchFirstElement.new('li/span.customer'),
244
- # :order_status => MatchAttribute.new('li/ul[@status]','status'),
245
- # :products => MatchArray.new('li/ul/li')
246
- # })
247
- # m.match('<li><span class="customer">John Chimpo</span>
248
- # <ul status="shipped">
249
- # <li>bananas</li>
250
- # <li>mangos</li>
251
- # <li>banangos</li>
252
- # </ul></li>')
253
- # # => {
254
- # :name => "John Chimpo",
255
- # :order_status => "shipped",
256
- # :products => ["bananas", "mangos", "banangos"]
257
- # }
258
- def match doc
259
- doc = Hpricot(doc) if doc.is_a?(String)
258
+ # construct the downstream part of a hash matcher
259
+ #
260
+ def self.build_match_hash spec_hash
260
261
  hsh = { }
261
- match_hash.each do |attr, m|
262
- val = m.match(doc)
263
- case attr
264
- when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
265
- else hsh[attr] = val end
262
+ spec_hash.each do |attr, spec|
263
+ hsh[attr] = build_parse_tree(spec)
266
264
  end
267
- self.class.scrub!(hsh)
268
- end
269
-
270
- # kill off keys with nil values
271
- def self.scrub! hsh
272
- hsh # .reject{|k,v| v.nil? }
265
+ hsh
273
266
  end
274
- end
275
-
276
- #
277
- # construct the downstream part of a hash matcher
278
- #
279
- def self.build_match_hash spec_hash
280
- hsh = { }
281
- spec_hash.each do |attr, spec|
282
- hsh[attr] = build_parse_tree(spec)
283
- end
284
- hsh
285
- end
286
267
 
287
- #
288
- # recursively build a tree of matchers
289
- #
290
- def self.build_parse_tree spec
291
- case spec
292
- when nil then nil
293
- when Matcher then spec
294
- when Hash then MatchHash.new(build_match_hash(spec))
295
- when Array then
296
- return nil if spec.empty?
297
- raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
298
- MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
299
- when String then MatchFirstElement.new(spec)
300
- when Proc then MatchProc.new(nil, spec)
301
- when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
302
- else raise "Don't know how to parse #{spec.inspect}"
268
+ #
269
+ # recursively build a tree of matchers
270
+ #
271
+ def self.build_parse_tree spec
272
+ case spec
273
+ when nil then nil
274
+ when Matcher then spec
275
+ when Hash then MatchHash.new(build_match_hash(spec))
276
+ when Array then
277
+ return nil if spec.empty?
278
+ raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
279
+ MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
280
+ when String then MatchFirstElement.new(spec)
281
+ when Proc then MatchProc.new(nil, spec)
282
+ when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
283
+ when Symbol then MatchAttribute.new(nil, spec, nil)
284
+ else raise "Don't know how to parse #{spec.inspect}"
285
+ end
303
286
  end
304
287
  end
305
288
  end