imw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,306 @@
1
+
2
+
3
+
4
+ #
5
+ # h2. lib/imw/parsers/html_parser/matchers.rb -- utility classes for html parser
6
+ #
7
+ # == About
8
+ #
9
+ # This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt>
10
+ # abstract class and some concrete subclasses which perform specific
11
+ # kinds of matches against HTML documents using the
12
+ # Hpricot[https://code.whytheluckystiff.net/hpricot/] library.
13
+ #
14
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
15
+ # Copyright:: Copyright (c) 2008 infochimps.org
16
+ # License:: GPL 3.0
17
+ # Website:: http://infinitemonkeywrench.org/
18
+ #
19
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
20
+
21
+ require 'imw/utils/extensions/hpricot'
22
+
23
+ module IMW
24
+ module HTMLParserMatcher
25
+
# Abstract base class for the HTML matchers defined in this module.
#
# An instance carries three pieces of state:
#
# +selector+:: an HTML path specification used to collect elements
#              from a document,
# +matcher+::  an optional nested matcher applied to the collected
#              elements to extract match information,
# +options+::  a hash of options (empty by default).
#
# Concrete subclasses decide how the +selector+ collects elements and
# must provide their own +match+.
class Matcher

  attr_accessor :selector, :matcher, :options

  # Remember the +selector+, the optional +matcher+ and the +options+.
  def initialize(selector, matcher = nil, options = {})
    @selector = selector
    @matcher  = matcher
    @options  = options
  end

  # Always raises a RuntimeError naming the receiver's class;
  # subclasses are expected to override this method.
  def match(doc)
    raise "Abstract class #{self.class}"
  end

end
51
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> that
# works on the first element of a document selected by +selector+.
class MatchFirstElement < Matcher
  # Look up the first element of +doc+ matching the +selector+ this
  # object was initialized with.  A String +doc+ is parsed with
  # Hpricot first.
  #
  # Returns +nil+ when no element is found.  When a +matcher+ was
  # given, returns that +matcher+'s match against the element.
  # Otherwise returns the element's inner HTML if
  # <tt>options[:html]</tt> is set, else its inner text with
  # surrounding whitespace stripped.
  #
  #   m = MatchFirstElement.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #   # => 'My Homepage'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    element = doc.at(selector)
    return nil unless element
    return matcher.match(element) if matcher
    return element.inner_html if options[:html]
    element.inner_text.strip
  end
end
74
+
# A MatchFirstElement whose result is post-processed by a callable.
#
# FIXME is there really a need for this separate class?  why can't
# MatchFirstElement.match accept a block?
class MatchProc < MatchFirstElement
  attr_accessor :proc, :options

  # +proc+ is any object responding to +call+; the remaining
  # arguments have the same meaning as for the parent class.
  def initialize(selector, proc, matcher = nil, options = {})
    super(selector, matcher)
    self.proc    = proc
    self.options = options
  end

  # Run the parent's match on +doc+ and hand the result to +proc+.
  # When the parent's match yields nothing (+nil+ or +false+), the
  # original +doc+ argument is handed to +proc+ instead.
  def match(doc)
    found = super(doc)
    self.proc.call(found || doc)
  end
end
90
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> that
# works on every element of a document selected by +selector+.
class MatchArray < Matcher
  # Collect each element of +doc+ matching the +selector+ this object
  # was initialized with and map it to a value.  A String +doc+ is
  # parsed with Hpricot first.
  #
  # Returns +nil+ if the search itself yields nothing.  With a
  # +matcher+, each element is replaced by the +matcher+'s match
  # against it.  Otherwise each element is replaced by its inner HTML
  # if <tt>options[:html]</tt> is set, else by its stripped inner
  # text.
  #
  #   m = MatchArray.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
  #   # => ["My Homepage", "Your Homepage", "Their Homepage"]
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    elements = doc / selector
    return nil unless elements
    elements.map do |element|
      if matcher
        matcher.match(element)
      elsif options[:html]
        element.inner_html
      else
        element.inner_text.strip
      end
    end
  end
end
119
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> that
# extracts an attribute value via a selector.
class MatchAttribute < Matcher

  attr_accessor :attribute

  # Takes three arguments, unlike the base class: the +selector+
  # which collects elements from an HTML document, the +attribute+ to
  # extract (stored as a String), and optionally a +matcher+ to apply
  # to the extracted value.
  def initialize(selector, attribute, matcher = nil)
    super(selector, matcher)
    @attribute = attribute.to_s
  end

  # Ask +doc+ for the value of +attribute+ at +selector+ (a String
  # +doc+ is parsed with Hpricot first).  With a +matcher+, return the
  # +matcher+'s match against that value; otherwise return the value
  # itself.
  #
  #   m = MatchAttribute.new('span#bio/a.homepage', 'href')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #   # => 'http://foo.bar'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    # NOTE(review): path_attr is presumably supplied by
    # imw/utils/extensions/hpricot -- confirm.
    value = doc.path_attr(selector, attribute)
    return value unless matcher
    matcher.match(value)
  end
end
152
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt> that
# applies a regular expression to text taken from an HTML document.
class MatchRegexp < Matcher

  attr_accessor :re, :options

  # +re+ is the regular expression whose captures are returned.  It
  # is applied to whatever +selector+ collects from the document,
  # converted to text; if +selector+ is +nil+ it is applied to the
  # text of the whole document.
  #
  # If the option <tt>:capture</tt> is given, only that capture group
  # is returned (numbered as in regular expressions: 1 is the first
  # capture); otherwise an array of all captures is returned.  An
  # optional +matcher+ is applied to the capture(s) before returning.
  #
  # FIXME Shouldn't the matcher come BEFORE the regexp capture,
  # not after?
  def initialize(selector, re, matcher = nil, options = {})
    super(selector, matcher)
    self.re      = re
    self.options = options
  end

  # Match +re+ against the text selected from +doc+ (a String +doc+
  # is parsed with Hpricot first).  The intermediate value is +nil+
  # when +re+ does not match, the requested group when
  # <tt>:capture</tt> was given, and the array of all captures
  # otherwise.  With a +matcher+, that value is passed through the
  # +matcher+ (even when it is +nil+); without one it is returned
  # directly.
  #
  #   m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
  #   # => "John Chimpo"
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    target = selector ? doc.contents_of(selector) : doc
    match_data = re.match(target.to_s)
    captured =
      if match_data.nil?
        nil
      elsif options.key?(:capture)
        # :capture is 1-based like regexp groups; captures is 0-based
        match_data.captures[options[:capture] - 1]
      else
        match_data.captures
      end
    return captured unless matcher
    matcher.match(captured)
  end
end
202
+
203
+
# Matcher that scans text taken from an HTML document for every
# occurrence of a regular expression.
class MatchRegexpRepeatedly < Matcher
  attr_accessor :re

  # +selector+ narrows the document (or is +nil+ for the whole
  # document), +re+ is the expression to scan with, and the optional
  # +matcher+ post-processes the scan results.
  def initialize(selector, re, matcher = nil)
    super(selector, matcher)
    @re = re
  end

  # Scan the text selected from +doc+ with +re+ (a String +doc+ is
  # parsed with Hpricot first).  Returns +nil+ when the selection is
  # empty.  Otherwise returns the scan results, passed through
  # +matcher+ if there is one.
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    scope = selector ? doc.contents_of(selector) : doc
    return unless scope
    hits = scope.to_s.scan(re)
    # a single capture group yields one-element arrays; unwrap them
    first_hit = hits.first
    hits = hits.flatten if first_hit && first_hit.length == 1
    return hits unless matcher
    matcher.match(hits)
  end
end
223
+
# Builds a hash of values by running a set of named matchers against
# an HTML document.
class MatchHash

  attr_accessor :match_hash

  # +match_hash+ must be a +Hash+ whose values are HTML matchers
  # (objects responding to +match+, such as subclasses of
  # <tt>IMW::HTMLParserMatcher::Matcher</tt>).  Raises a RuntimeError
  # for anything that is not a +Hash+.
  def initialize(match_hash)
    # Kludge? maybe.
    unless match_hash.is_a?(Hash)
      raise "MatchHash requires a hash of :attributes => matchers."
    end
    @match_hash = match_hash
  end

  # Run every matcher in +match_hash+ against +doc+ (a String +doc+
  # is parsed with Hpricot first) and collect the results under the
  # corresponding keys:
  #
  #   m = MatchHash.new({
  #     :name         => MatchFirstElement.new('li/span.customer'),
  #     :order_status => MatchAttribute.new('li/ul[@status]','status'),
  #     :products     => MatchArray.new('li/ul/li')
  #     })
  #   m.match('<li><span class="customer">John Chimpo</span>
  #              <ul status="shipped">
  #                <li>bananas</li>
  #                <li>mangos</li>
  #                <li>banangos</li>
  #              </ul></li>')
  #   # => {
  #     :name         => "John Chimpo",
  #     :order_status => "shipped",
  #     :products     => ["bananas", "mangos", "banangos"]
  #   }
  #
  # A key that is itself an Array is paired element-wise with the
  # matcher's result via <tt>Hash.zip</tt>; pairs with a +nil+ value
  # are dropped, and the key is skipped when the result is +nil+.
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    result = {}
    match_hash.each do |attr, sub_matcher|
      value = sub_matcher.match(doc)
      if attr.is_a?(Array)
        next unless value
        # NOTE(review): Hash.zip is not core Ruby; presumably defined
        # in imw/utils/extensions/hash -- confirm.
        result.merge!(Hash.zip(attr, value).reject { |_key, v| v.nil? })
      else
        result[attr] = value
      end
    end
    self.class.scrub!(result)
  end

  # Post-processing hook for the result hash.  The rejection of keys
  # with nil values is commented out, so +hsh+ is returned unchanged.
  def self.scrub!(hsh)
    hsh # .reject{|k,v| v.nil? }
  end
end
275
+
#
# Construct the downstream part of a hash matcher: returns a new hash
# with the same keys as +spec_hash+ and each value replaced by the
# matcher tree built from it by +build_parse_tree+.
#
def self.build_match_hash(spec_hash)
  spec_hash.inject({}) do |built, (attr, spec)|
    built[attr] = build_parse_tree(spec)
    built
  end
end
286
+
#
# Recursively build a tree of matchers from +spec+:
#
# +nil+::     +nil+
# Matcher::   returned as is
# Hash::      a MatchHash over the recursively built values
# Array::     +nil+ if empty, else a MatchArray over the first entry
#             (as a String selector) with the tree built from the
#             optional second entry as its matcher; more than two
#             entries raise
# String::    a MatchFirstElement with that selector
# Proc::      a MatchProc with no selector
# Regexp::    a MatchRegexp with no selector returning capture 1
#
# Anything else raises a RuntimeError.
#
def self.build_parse_tree(spec)
  return nil if spec.nil?
  return spec if spec.is_a?(Matcher)
  return MatchHash.new(build_match_hash(spec)) if spec.is_a?(Hash)

  if spec.is_a?(Array)
    return nil if spec.empty?
    if spec.length > 2
      raise "Array spec must be a single selector or a selector and another match specification"
    end
    return MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
  end

  if spec.is_a?(String)
    MatchFirstElement.new(spec)
  elsif spec.is_a?(Proc)
    MatchProc.new(nil, spec)
  elsif spec.is_a?(Regexp)
    MatchRegexp.new(nil, spec, nil, :capture => 1)
  else
    raise "Don't know how to parse #{spec.inspect}"
  end
end
305
+ end
306
+ end
@@ -0,0 +1,87 @@
1
+ module IMW
2
+ module Parsers
3
+
4
+ # This is an abstract class for a line-oriented parser intended to
5
+ # read and emit lines sequentially from a file.
6
+ #
7
+ # To leverage the functionality of this class, subclass it and
8
+ # define a +parse_line+ method.
9
+ class LineParser
10
+
11
+ # The number of lines to skip on each file parsed.
12
+ attr_accessor :skip_first
13
+
14
+ # The class to parse each line into. The +new+ method of this
15
+ # class must accept a hash.
16
+ attr_accessor :klass
17
+
18
+ # If called with the option <tt>:skip_first</tt> then skip the
19
+ # corresponding number of lines at the beginning of the file when
20
+ # parsing.
21
+ def initialize options={}
22
+ @skip_first = options[:skip_first] || 0
23
+ @klass = options[:of] || options[:klass]
24
+ end
25
+
26
+ # Parse the given file. If the option <tt>:lines</tt> is passed
27
+ # in then only parse that many lines. If given a block then
28
+ # yield the result of each line to the block; else just return
29
+ # an array of results.
30
+ #
31
+ # If this parser has a +klass+ attribute then each parsed line
32
+ # will first be turned into an instance of that class (the class
33
+ # must accept a hash of values in its initializer).
34
+ def parse! file, options={}, &block
35
+ skip_lines!(file)
36
+ if options[:lines]
37
+ case
38
+ when klass && block_given?
39
+ options[:lines].times do
40
+ yield klass.new(parse_line(file.readline))
41
+ end
42
+ when block_given?
43
+ options[:lines].times do
44
+ yield parse_line(file.readline)
45
+ end
46
+ when klass
47
+ options[:lines].times do
48
+ klass.new(parse_line(file.readline))
49
+ end
50
+ else
51
+ options[:lines].times.map do
52
+ parse_line(file.readline)
53
+ end
54
+ end
55
+ else
56
+ case
57
+ when klass && block_given?
58
+ file.each do |line|
59
+ yield klass.new(parse_line(line))
60
+ end
61
+ when block_given?
62
+ file.each do |line|
63
+ yield parse_line(line)
64
+ end
65
+ when klass
66
+ file.map do |line|
67
+ klass.new(parse_line(line))
68
+ end
69
+ else
70
+ file.map do |line|
71
+ parse_line(line)
72
+ end
73
+ end
74
+ end
75
+ end
76
+
77
+ def parse_line line
78
+ raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
79
+ end
80
+
81
+ protected
82
+ def skip_lines! file
83
+ skip_first.times { file.readline }
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,72 @@
1
+ require 'imw/parsers/line_parser'
2
+
3
+ module IMW
4
+ module Parsers
5
+
# A RegexpParser is a line-oriented parser which uses a regular
# expression to extract data from a line into either a hash or an
# object obeying hash semantics.
#
# As an example, a flat file with one record per line in the
# following format (this is a simplified version of common
# webserver log formats)
#
#   151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
#   81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
#   81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
#   ...
#
# could be parsed as follows
#
#   file   = File.new '/path/to/file.log'
#   parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
#                                           :into_fields => [:ip, :timestamp, :verb, :url, :version]
#   parser.parse! file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
#
# Consecutive captures from the regular expression will be pushed
# into a hash with keys given by the +fields+ property of this
# parser (set with <tt>:fields</tt> or <tt>:into_fields</tt>).
#
# If the parser is instantiated with the <tt>:of</tt> keyword then
# the parsed hash from each line is used to instantiate a new
# object of the corresponding class:
#
#   require 'ostruct'
#
#   PageView = Class.new(OpenStruct)
#
#   parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
#                                           :into_fields => [:ip, :timestamp, :verb, :url, :version],
#                                           :of => PageView
#
#   parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
#
# The option <tt>:strictly</tt> (or <tt>:strict</tt>) can also be
# set to force the parser to raise an error if it finds a line which
# doesn't match its regexp.
class RegexpParser < LineParser
  attr_accessor :regexp, :fields, :strict

  # Accepts <tt>:regexp</tt>/<tt>:by_regexp</tt>,
  # <tt>:fields</tt>/<tt>:into_fields</tt> and
  # <tt>:strict</tt>/<tt>:strictly</tt> (the first spelling of each
  # pair wins), then hands all +options+ on to the LineParser
  # initializer.
  def initialize(options = {})
    @regexp = options[:regexp] || options[:by_regexp]
    @fields = options[:fields] || options[:into_fields]
    @strict = options[:strict] || options[:strictly]
    super options
  end

  # Match +line+ (with its trailing newline removed) against +regexp+
  # and return a hash mapping <tt>fields[i]</tt> to the i-th capture.
  #
  # A line that does not match yields an empty hash, unless +strict+
  # is set, in which case an IMW::ParseError is raised.
  def parse_line(line)
    match_data = regexp.match(line.chomp)
    unless match_data
      raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}") if strict
      return {}
    end
    hsh = {}
    match_data.captures.each_with_index do |capture, index|
      hsh[fields[index]] = capture
    end
    hsh
  end
end
70
+ end
71
+ end
72
+