imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,306 @@
1
+
2
+
3
+
4
+ #
5
+ # h2. lib/imw/parsers/html_parser/matchers.rb -- utility classes for html parser
6
+ #
7
+ # == About
8
+ #
9
+ # This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt>
10
+ # abstract class and some concrete subclasses which perform specific
11
+ # kinds of matches against HTML documents using the
12
+ # Hpricot[https://code.whytheluckystiff.net/hpricot/] library.
13
+ #
14
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
15
+ # Copyright:: Copyright (c) 2008 infochimps.org
16
+ # License:: GPL 3.0
17
+ # Website:: http://infinitemonkeywrench.org/
18
+ #
19
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
20
+
21
+ require 'imw/utils/extensions/hpricot'
22
+
23
+ module IMW
24
+ module HTMLParserMatcher
25
+
# Abstract base class for the HTML matchers.
#
# Every matcher carries three attributes:
#
# +selector+:: an HTML path specification used to collect elements
#              from a document
# +matcher+::  an optional nested matcher applied to whatever the
#              +selector+ collects
# +options+::  a hash of options (empty by default)
#
# How the +selector+ is applied, and what is returned when there is
# no nested +matcher+, is decided by each subclass.
class Matcher

  attr_accessor :selector, :matcher, :options

  # Store +selector+, the optional +matcher+ and the +options+ hash on
  # the new instance.
  def initialize(selector, matcher = nil, options = {})
    self.selector = selector
    self.matcher  = matcher
    self.options  = options
  end

  # Subclasses are expected to override this.  The base implementation
  # ignores +doc+ and always raises a RuntimeError naming the
  # receiver's class.
  def match(doc)
    raise RuntimeError, "Abstract class #{self.class}"
  end

end
51
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> which
# works on the first element of a document matching the +selector+.
class MatchFirstElement < Matcher
  # Look up the first element of +doc+ matching the +selector+ (a
  # String +doc+ is parsed with Hpricot first).
  #
  # * Returns +nil+ when nothing matches.
  # * With a +matcher+, returns the +matcher+'s match against that
  #   element.
  # * Otherwise returns the element's inner HTML when
  #   <tt>options[:html]</tt> is set, else its stripped inner text.
  #
  #   m = MatchFirstElement.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #   # => 'My Homepage'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    element = doc.at(selector)
    return nil unless element
    return matcher.match(element) if matcher
    if options[:html]
      element.inner_html
    else
      element.inner_text.strip
    end
  end
end
74
+
# Matcher which post-processes the result of a MatchFirstElement
# match with a callable.
#
# FIXME is there really a need for this separate class? why can't
# MatchFirstElement.match accept a block?
class MatchProc < MatchFirstElement
  attr_accessor :proc, :options

  # Takes the +selector+, the callable +proc+, and optionally a
  # nested +matcher+ and an +options+ hash.
  def initialize(selector, proc, matcher = nil, options = {})
    super(selector, matcher)
    self.options = options
    self.proc    = proc
  end

  # Run the inherited first-element match on +doc+ and hand its result
  # to +proc+.  When that result is +nil+ or +false+ the +proc+ is
  # called with the +doc+ argument exactly as it was passed in.
  def match(doc)
    first = super(doc)
    self.proc.call(first ? first : doc)
  end
end
90
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> which
# works on every element of a document matching the +selector+.
class MatchArray < Matcher
  # Collect each element of +doc+ matching the +selector+ (a String
  # +doc+ is parsed with Hpricot first) and return an array with one
  # entry per element:
  #
  # * with a +matcher+, the +matcher+'s match against the element;
  # * otherwise the element's inner HTML when <tt>options[:html]</tt>
  #   is set, else its stripped inner text.
  #
  # Returns +nil+ if the selection itself is +nil+ or +false+.
  #
  #   m = MatchArray.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
  #   # => ["My Homepage", "Your Homepage", "Their Homepage"]
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    elements = doc / selector
    return nil unless elements
    if matcher
      elements.map { |element| matcher.match(element) }
    elsif options[:html]
      elements.map { |element| element.inner_html }
    else
      elements.map { |element| element.inner_text.strip }
    end
  end
end
119
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> which
# extracts an attribute of the first element of a document matching
# the +selector+.
class MatchAttribute < Matcher

  attr_accessor :attribute

  # Takes three arguments rather than the base class's
  # (selector, matcher, options): the +selector+ which collects
  # elements, the +attribute+ to extract (stored as a String), and an
  # optional +matcher+.
  def initialize(selector, attribute, matcher = nil)
    super(selector, matcher)
    self.attribute = attribute.to_s
  end

  # Read the +attribute+ from +doc+ via <tt>path_attr(selector,
  # attribute)</tt> (a String +doc+ is parsed with Hpricot first;
  # +path_attr+ is presumably provided by
  # imw/utils/extensions/hpricot -- confirm).  With a +matcher+ the
  # +matcher+'s match against that value is returned, otherwise the
  # value itself.
  #
  #   m = MatchAttribute.new('span#bio/a.homepage', 'href')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #   # => 'http://foo.bar'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    value = doc.path_attr(selector, attribute)
    return value unless matcher
    matcher.match(value)
  end
end
152
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> which
# uses a regular expression to match against text in an HTML
# document.
class MatchRegexp < Matcher

  attr_accessor :re
  attr_accessor :options

  # Use the regular expression +re+ to return captures from the
  # content collected by +selector+ (treated as text); if +selector+
  # is +nil+ the match runs against the full text of the document.
  #
  # If the option <tt>:capture</tt> is given, only that group is
  # returned.  Indexing is that of regular expressions: 0 is the whole
  # match and 1 is the first capture.  Without <tt>:capture</tt> (or
  # with <tt>:capture => nil</tt>) an array of all captures is
  # returned.  If a +matcher+ is given it is applied to the capture(s)
  # before returning.
  #
  # FIXME Shouldn't the matcher come BEFORE the regexp capture,
  # not after?
  def initialize(selector, re, matcher = nil, options = {})
    super(selector, matcher)
    self.options = options
    self.re      = re
  end

  # Match +re+ against the text of +doc+ (a String +doc+ is parsed
  # with Hpricot first), narrowed by <tt>contents_of(selector)</tt>
  # when a +selector+ is set (+contents_of+ is presumably provided by
  # imw/utils/extensions/hpricot -- confirm).
  #
  # The value is +nil+ when +re+ does not match, the requested group
  # when <tt>:capture</tt> is set, else the array of all captures.
  # With a +matcher+ that value (including +nil+) is passed to
  # <tt>matcher.match</tt> and its result returned; otherwise the
  # value is returned as is.
  #
  #   m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
  #   # => "John Chimpo"
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    el = selector ? doc.contents_of(selector) : doc
    m = re.match(el.to_s)
    group = options[:capture]
    val =
      if m.nil?
        nil
      elsif group.nil?
        m.captures
      else
        # MatchData#[] already uses regexp indexing, so no "- 1"
        # arithmetic is needed and group 0 yields the whole match
        # instead of wrapping around to the last capture.
        m[group]
      end
    # pass to matcher, if any
    matcher ? matcher.match(val) : val
  end
end
202
+
203
+
# Matcher which scans the text of a document (or of the content
# collected by the +selector+) for every occurrence of a regular
# expression.
class MatchRegexpRepeatedly < Matcher
  attr_accessor :re

  # Takes the +selector+ (may be +nil+), the regular expression +re+
  # and an optional +matcher+.
  def initialize(selector, re, matcher = nil)
    super(selector, matcher)
    self.re = re
  end

  # Scan the text of +doc+ with +re+ (a String +doc+ is parsed with
  # Hpricot first).  With a +selector+ the scan is limited to
  # <tt>doc.contents_of(selector)</tt>; if that yields nothing the
  # method returns +nil+ without consulting the +matcher+.
  #
  # The result of the scan is flattened when its first entry has
  # length 1 (i.e. a single capture group), then passed through the
  # +matcher+ if there is one.
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    target = selector ? doc.contents_of(selector) : doc
    return unless target
    hits = target.to_s.scan(re)
    head = hits.first
    hits = hits.flatten if head && head.length == 1
    return hits unless matcher
    matcher.match(hits)
  end
end
223
+
# Builds a hash of values by running a set of matchers against an
# HTML document.
class MatchHash

  attr_accessor :match_hash

  # +match_hash+ must be a +Hash+ whose values are HTML matchers
  # (subclasses of <tt>IMW::HTMLParserMatcher::Matcher</tt>, or
  # anything else responding to +match+).  Keys are normally symbols;
  # an Array key receives special treatment in +match+.
  #
  # Raises a RuntimeError when given anything but a Hash.
  def initialize(match_hash)
    # Kludge? maybe.
    unless match_hash.is_a?(Hash)
      raise "MatchHash requires a hash of :attributes => matchers."
    end
    self.match_hash = match_hash
  end

  # Run every matcher in +match_hash+ against +doc+ (a String +doc+
  # is parsed with Hpricot first) and collect the results:
  #
  #   m = MatchHash.new({
  #     :name         => MatchFirstElement.new('li/span.customer'),
  #     :order_status => MatchAttribute.new('li/ul[@status]','status'),
  #     :products     => MatchArray.new('li/ul/li')
  #   })
  #   m.match('<li><span class="customer">John Chimpo</span>
  #            <ul status="shipped">
  #              <li>bananas</li>
  #              <li>mangos</li>
  #              <li>banangos</li>
  #            </ul></li>')
  #   # => {
  #     :name         => "John Chimpo",
  #     :order_status => "shipped",
  #     :products     => ["bananas", "mangos", "banangos"]
  #   }
  #
  # For an ordinary key the matcher's result is stored under that key,
  # +nil+ included.  For an Array key, a +nil+/+false+ result is
  # skipped; otherwise <tt>Hash.zip(key, result)</tt> is merged into
  # the output minus its +nil+-valued pairs (<tt>Hash.zip</tt> is
  # presumably an IMW core extension pairing keys with values --
  # confirm).
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    collected = {}
    match_hash.each do |key, sub_matcher|
      found = sub_matcher.match(doc)
      if key.is_a?(Array)
        next unless found
        pairs = Hash.zip(key, found)
        collected.merge!(pairs.reject { |_key, value| value.nil? })
      else
        collected[key] = found
      end
    end
    self.class.scrub!(collected)
  end

  # Intended to kill off keys with nil values, but the rejection is
  # commented out, so +hsh+ is currently returned untouched.
  def self.scrub!(hsh)
    hsh # .reject{|k,v| v.nil? }
  end
end
275
+
#
# Construct the downstream part of a hash matcher: returns a new hash
# with the keys of +spec_hash+, each mapped to the parse tree built
# from its spec by +build_parse_tree+.
#
def self.build_match_hash(spec_hash)
  spec_hash.inject({}) do |tree, (attr, spec)|
    tree[attr] = build_parse_tree(spec)
    tree
  end
end
286
+
#
# Recursively build a tree of matchers from +spec+:
#
# +nil+::                  +nil+
# a Matcher or MatchHash:: returned unchanged (already part of a tree)
# a Hash::                 a MatchHash over +build_match_hash+ of it
# an empty Array::         +nil+
# an Array of 1 or 2::     a MatchArray whose selector is the first
#                          entry (as a String) and whose matcher is
#                          the tree built from the second entry
# a String::               a MatchFirstElement with that selector
# a Proc::                 a MatchProc with no selector
# a Regexp::               a MatchRegexp with no selector returning
#                          its first capture
#
# Raises a RuntimeError for an Array longer than two entries and for
# any other kind of +spec+.
#
def self.build_parse_tree(spec)
  case spec
  when nil                then nil
  # MatchHash is not a Matcher subclass but is a valid tree node
  # (this method creates them for Hash specs), so accept it as well.
  when Matcher, MatchHash then spec
  when Hash               then MatchHash.new(build_match_hash(spec))
  when Array
    return nil if spec.empty?
    unless spec.length <= 2
      raise "Array spec must be a single selector or a selector and another match specification"
    end
    MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
  when String             then MatchFirstElement.new(spec)
  when Proc               then MatchProc.new(nil, spec)
  when Regexp             then MatchRegexp.new(nil, spec, nil, :capture => 1)
  else raise "Don't know how to parse #{spec.inspect}"
  end
end
305
+ end
306
+ end
@@ -0,0 +1,87 @@
1
+ module IMW
2
+ module Parsers
3
+
# This is an abstract class for a line-oriented parser intended to
# read and emit lines sequentially from a file.
#
# To leverage the functionality of this class, subclass it and
# define a +parse_line+ method.
class LineParser

  # The number of lines to skip on each file parsed.
  attr_accessor :skip_first

  # The class to parse each line into.  The +new+ method of this
  # class must accept a hash.
  attr_accessor :klass

  # Recognized options:
  #
  # <tt>:skip_first</tt>:: number of lines to skip at the beginning of
  #                        the file when parsing (default 0)
  # <tt>:of</tt> / <tt>:klass</tt>:: class to instantiate from each
  #                                  parsed line
  def initialize(options = {})
    @skip_first = options[:skip_first] || 0
    @klass      = options[:of] || options[:klass]
  end

  # Parse the given file after skipping +skip_first+ lines.  If the
  # option <tt>:lines</tt> is passed in then only that many lines are
  # read (one +readline+ call each); otherwise the file is iterated to
  # its end.
  #
  # If this parser has a +klass+ then each parsed line is first turned
  # into an instance of that class (the class must accept a hash of
  # values in its initializer).
  #
  # Given a block, each result is yielded to it and the return value
  # is whatever the underlying iteration returns.  Without a block an
  # array of the results is returned -- in every case, including the
  # combination of +klass+ and <tt>:lines</tt>.
  #
  # NOTE(review): +readline+ on an IO raises EOFError at end of file,
  # so asking for more lines than remain is presumably an error for
  # real files -- confirm that callers expect this.
  def parse!(file, options = {}, &block)
    skip_lines!(file)
    count = options[:lines]
    if block_given?
      if count
        count.times { yield record_from(file.readline) }
      else
        file.each { |line| yield record_from(line) }
      end
    elsif count
      count.times.map { record_from(file.readline) }
    else
      file.map { |line| record_from(line) }
    end
  end

  # Turn one raw line into a parsed value.  Abstract here: always
  # raises IMW::NotImplementedError; subclasses must redefine it.
  def parse_line(line)
    raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
  end

  protected

  # Consume +skip_first+ lines from +file+ and discard them.
  def skip_lines!(file)
    skip_first.times { file.readline }
  end

  # Parse +line+ and, when a +klass+ is configured, wrap the parsed
  # value in a new instance of it.
  def record_from(line)
    parsed = parse_line(line)
    klass ? klass.new(parsed) : parsed
  end
end
86
+ end
87
+ end
@@ -0,0 +1,72 @@
1
+ require 'imw/parsers/line_parser'
2
+
3
+ module IMW
4
+ module Parsers
5
+
# A RegexpParser is a line-oriented parser which uses a regular
# expression to extract data from a line into either a hash or an
# object obeying hash semantics.
#
# As an example, a flat file with one record per line in the
# following format (this is a simplified version of common
# webserver log formats)
#
#   151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
#   81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
#   81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
#   ...
#
# could be parsed as follows
#
#   file = File.new '/path/to/file.log'
#   parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
#                                           :into_fields => [:ip, :timestamp, :verb, :url, :version]
#   parser.parse! file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
#
# Consecutive captures from the regular expression are stored in a
# hash under the keys given, in order, by the +fields+ property of
# this parser.
#
# If the parser is instantiated with the <tt>:of</tt> keyword then
# the parsed hash from each line is used to instantiate a new
# object of the corresponding class:
#
#   require 'ostruct'
#
#   PageView = Class.new(OpenStruct)
#
#   parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
#                                           :into_fields => [:ip, :timestamp, :verb, :url, :version],
#                                           :of => PageView
#
#   parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
#
# The option <tt>:strictly</tt> (or <tt>:strict</tt>) can also be set
# to force the parser to raise an error if it finds a line which
# doesn't match its regexp.
class RegexpParser < LineParser
  attr_accessor :regexp, :fields, :strict

  # Each setting may be given under either of two option names:
  # <tt>:regexp</tt>/<tt>:by_regexp</tt>,
  # <tt>:fields</tt>/<tt>:into_fields</tt> and
  # <tt>:strict</tt>/<tt>:strictly</tt>.  All options are also passed
  # on to the superclass initializer.
  def initialize(options = {})
    @regexp = options[:regexp] || options[:by_regexp]
    @fields = options[:fields] || options[:into_fields]
    @strict = options[:strict] || options[:strictly]
    super(options)
  end

  # Match the chomped +line+ against +regexp+ and return a hash
  # mapping <tt>fields[i]</tt> to the i-th capture.
  #
  # A line that does not match yields an empty hash, unless +strict+
  # is set, in which case IMW::ParseError is raised.
  def parse_line(line)
    record = {}
    match_data = regexp.match(line.chomp)
    if match_data
      match_data.captures.each_with_index do |capture, index|
        record[fields[index]] = capture
      end
    elsif strict
      raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}")
    end
    record
  end
end
70
+ end
71
+ end
72
+