imw 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
@@ -0,0 +1,306 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
|
4
|
+
#
|
5
|
+
# h2. lib/imw/parsers/html_parser/matchers.rb -- utility classes for html parser
|
6
|
+
#
|
7
|
+
# == About
|
8
|
+
#
|
9
|
+
# This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
10
|
+
# abstract class and some concrete subclasses which perform specific
|
11
|
+
# kinds of matches against HTML documents using the
|
12
|
+
# Hpricot[https://code.whytheluckystiff.net/hpricot/] library.
|
13
|
+
#
|
14
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
15
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
16
|
+
# License:: GPL 3.0
|
17
|
+
# Website:: http://infinitemonkeywrench.org/
|
18
|
+
#
|
19
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
20
|
+
|
21
|
+
require 'imw/utils/extensions/hpricot'
|
22
|
+
|
23
|
+
module IMW
|
24
|
+
module HTMLParserMatcher
|
25
|
+
|
26
|
+
# An abstract class from which to subclass specific HTML matchers.
|
27
|
+
#
|
28
|
+
# A subclass is initialized with a +selector+ and an optional
|
29
|
+
# +matcher+. The +selector+ is an HTML path specification used to
|
30
|
+
# collect elements from the document. If initialized with a
|
31
|
+
# +matcher+, the +matcher+ is used to return match information
|
32
|
+
# from the elements; else the inner HTML is returned. Subclasses
|
33
|
+
# decide how the +selector+ will collect elements.
|
34
|
+
class Matcher

  # HTML path specification used to collect elements from a document.
  attr_accessor :selector
  # Optional nested matcher applied to whatever the selector collects.
  attr_accessor :matcher
  # Hash of options; subclasses in this file read <tt>:html</tt> and
  # <tt>:capture</tt> from it.
  attr_accessor :options

  # Store the +selector+, the optional +matcher+ and the +options+
  # hash (empty by default) on the new instance.
  def initialize(selector, matcher = nil, options = {})
    self.selector = selector
    self.matcher  = matcher
    self.options  = options
  end

  # Abstract: concrete subclasses override this to match against
  # +doc+. The base implementation always raises a RuntimeError
  # naming the receiving class.
  def match(doc)
    raise "Abstract class #{self.class}"
  end

end
|
51
|
+
|
52
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
53
|
+
# for matching against the first element of a document matching a
|
54
|
+
# selector.
|
55
|
+
class MatchFirstElement < Matcher
  # Find the first element of +doc+ that matches this object's
  # +selector+. A String +doc+ is parsed with Hpricot first.
  #
  # Returns +nil+ when nothing matches the selector. Otherwise, when
  # a +matcher+ was given, returns that matcher's result for the
  # element; with no matcher, returns the element's inner HTML if
  # <tt>options[:html]</tt> is set and its stripped inner text if not.
  #
  #   m = MatchFirstElement.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #     # => 'My Homepage'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    el = doc.at(selector)
    return nil unless el
    return matcher.match(el) if matcher
    options[:html] ? el.inner_html : el.inner_text.strip
  end
end
|
74
|
+
|
75
|
+
# FIXME is there really a need for this separate class? why can't
|
76
|
+
# MatchFirstElement.match accept a block?
|
77
|
+
class MatchProc < MatchFirstElement
  # Callable invoked with the matched value (or with the document
  # itself when the first-element match yields nothing).
  attr_accessor :proc

  # Takes the same arguments as the parent class plus the +proc+ to
  # call, inserted after the +selector+. +options+ is forwarded to
  # the parent constructor (the +options+ accessor is inherited).
  def initialize(selector, proc, matcher = nil, options = {})
    super(selector, matcher, options)
    self.proc = proc
  end

  # Run the inherited first-element match against +doc+ and hand the
  # result to +proc+. If that result is +nil+ or +false+ the +doc+
  # argument (exactly as it was passed in) is handed to +proc+
  # instead. Returns whatever +proc+ returns.
  def match(doc)
    val = super(doc)
    self.proc.call(val || doc)
  end
end
|
90
|
+
|
91
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
92
|
+
# for matching each element of a document matching a selector.
|
93
|
+
class MatchArray < Matcher
  # Collect every element of +doc+ that matches this object's
  # +selector+ and map each one to a value. A String +doc+ is parsed
  # with Hpricot first.
  #
  # With a +matcher+, each value is the matcher's result for that
  # element. Without one, each value is the element's inner HTML when
  # <tt>options[:html]</tt> is set, or its stripped inner text
  # otherwise. Returns +nil+ if the selector search itself is falsy.
  #
  #   m = MatchArray.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
  #     # => ["My Homepage", "Your Homepage", "Their Homepage"]
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    subdoc = doc / selector
    return nil unless subdoc
    return subdoc.map { |el| matcher.match(el) } if matcher

    if options[:html]
      subdoc.map { |el| el.inner_html }
    else
      subdoc.map { |el| el.inner_text.strip }
    end
  end
end
|
119
|
+
|
120
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
121
|
+
# for matching an attribute of the first element of a document
|
122
|
+
# matching a selector.
|
123
|
+
class MatchAttribute < Matcher

  # Name of the attribute to extract, stored as a String.
  attr_accessor :attribute

  # Unlike <tt>IMW::HTMLParserMatcher::Matcher</tt>, this class takes
  # the +attribute+ to extract as its second argument, between the
  # +selector+ and the optional +matcher+. The trailing +options+
  # hash is optional and is passed to the parent constructor so the
  # signature lines up with the other matchers in this file.
  def initialize(selector, attribute, matcher = nil, options = {})
    super(selector, matcher, options)
    self.attribute = attribute.to_s
  end

  # Look up the value of +attribute+ on the element of +doc+ found by
  # +selector+. A String +doc+ is parsed with Hpricot first. The
  # lookup is delegated to +path_attr+ (not defined in this file;
  # presumably added by imw/utils/extensions/hpricot -- confirm).
  #
  # Returns the +matcher+'s result for that value when a +matcher+
  # was given, else the value itself.
  #
  #   m = MatchAttribute.new('span#bio/a.homepage', 'href')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #     # => 'http://foo.bar'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    val = doc.path_attr(selector, attribute)
    matcher ? matcher.match(val) : val
  end
end
|
152
|
+
|
153
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
154
|
+
# for using a regular expression to match against text in an HTML
|
155
|
+
# document.
|
156
|
+
class MatchRegexp < Matcher

  # Regular expression applied to the selected text.
  attr_accessor :re

  # Use the regular expression +re+ to return captures from the
  # content collected by +selector+ (treated as text); if +selector+
  # is +nil+ the whole document is used. If the option
  # <tt>:capture</tt> is given, only that group is returned (regexp
  # numbering: 1 is the first capture); otherwise an array of all
  # captures is returned. If a +matcher+ is given it is applied to
  # the capture(s) before returning.
  #
  # +options+ is forwarded to the parent constructor (the +options+
  # accessor is inherited).
  #
  # FIXME Shouldn't the matcher come BEFORE the regexp capture,
  # not after?
  def initialize(selector, re, matcher = nil, options = {})
    super(selector, matcher, options)
    self.re = re
  end

  # Match +re+ against the string form of the content selected from
  # +doc+. A String +doc+ is parsed with Hpricot first; selection is
  # delegated to +contents_of+ (not defined in this file; presumably
  # added by imw/utils/extensions/hpricot -- confirm).
  #
  # The intermediate value is +nil+ when the regexp does not match,
  # the single requested capture when <tt>options[:capture]</tt> is
  # present, and the array of all captures otherwise. That value is
  # passed through +matcher+ when one was given (even if it is
  # +nil+), else returned directly.
  #
  #   m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
  #     # => "John Chimpo"
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    el = selector ? doc.contents_of(selector) : doc
    m = re.match(el.to_s)
    val =
      if m.nil?
        nil
      elsif options.key?(:capture)
        # captures is 0-based while :capture uses regexp group numbering
        m.captures[options[:capture] - 1]
      else
        m.captures
      end
    # pass to matcher, if any
    matcher ? matcher.match(val) : val
  end
end
|
202
|
+
|
203
|
+
|
204
|
+
# Matcher that scans the selected text for every occurrence of a
# regular expression rather than only the first.
class MatchRegexpRepeatedly < Matcher
  # Regular expression scanned for in the selected text.
  attr_accessor :re

  # +selector+ picks the content to scan (+nil+ means the whole
  # document), +re+ is the regular expression, +matcher+ is an
  # optional nested matcher. The optional +options+ hash is passed to
  # the parent constructor so the signature lines up with the other
  # matchers in this file.
  def initialize(selector, re, matcher = nil, options = {})
    super(selector, matcher, options)
    self.re = re
  end

  # Scan the string form of the content selected from +doc+ with
  # +re+ and return all matches. A String +doc+ is parsed with
  # Hpricot first; selection is delegated to +contents_of+ (not
  # defined in this file; presumably added by
  # imw/utils/extensions/hpricot -- confirm).
  #
  # Returns +nil+ when nothing is selected. When the first scan
  # result has length 1 (a single capture group) the result array is
  # flattened. The result is passed through +matcher+ when one was
  # given, else returned directly.
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    # apply selector, if any
    el = selector ? doc.contents_of(selector) : doc
    return unless el
    # get all matches
    val = el.to_s.scan(re)
    # if there's only one capture group, flatten the array
    val = val.flatten if val.first && val.first.length == 1
    # pass to matcher, if any
    matcher ? matcher.match(val) : val
  end
end
|
223
|
+
|
224
|
+
# Class for building a hash of values by using appropriate
|
225
|
+
# matchers against an HTML document.
|
226
|
+
class MatchHash

  # Hash mapping result keys to the matcher that produces each value.
  attr_accessor :match_hash

  # The +match_hash+ must be a +Hash+ whose values respond to
  # +match+ (typically subclasses of
  # <tt>IMW::HTMLParserMatcher::Matcher</tt>). Raises a RuntimeError
  # for any other kind of argument.
  def initialize(match_hash)
    # Kludge? maybe.
    raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
    self.match_hash = match_hash
  end

  # Run every matcher in +match_hash+ against +doc+ and collect the
  # results in a new hash. A String +doc+ is parsed with Hpricot
  # first.
  #
  # For an ordinary key the matcher's result is stored under that
  # key, including +nil+ results. For an Array key the result is
  # paired up with the key's members via +Hash.zip+ (not core Ruby;
  # presumably an IMW Hash extension -- confirm), pairs with +nil+
  # values are dropped, and the rest are merged in; a falsy result
  # for an Array key adds nothing.
  #
  #   m = MatchHash.new({
  #     :name         => MatchFirstElement.new('li/span.customer'),
  #     :order_status => MatchAttribute.new('li/ul[@status]','status'),
  #     :products     => MatchArray.new('li/ul/li')
  #   })
  #   m.match('<li><span class="customer">John Chimpo</span>
  #              <ul status="shipped">
  #                <li>bananas</li>
  #                <li>mangos</li>
  #                <li>banangos</li>
  #              </ul></li>')
  #   # => {
  #     :name         => "John Chimpo",
  #     :order_status => "shipped",
  #     :products     => ["bananas", "mangos", "banangos"]
  #   }
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    hsh = {}
    match_hash.each do |attr, m|
      val = m.match(doc)
      if attr.is_a?(Array)
        hsh.merge!(Hash.zip(attr, val).reject { |_k, v| v.nil? }) if val
      else
        hsh[attr] = val
      end
    end
    self.class.scrub!(hsh)
  end

  # Post-processing hook applied to the result of +match+. It
  # currently returns +hsh+ unchanged: the filtering of nil-valued
  # keys that used to live here is disabled.
  def self.scrub!(hsh)
    hsh # .reject{|k,v| v.nil? }
  end
end
|
275
|
+
|
276
|
+
#
|
277
|
+
# construct the downstream part of a hash matcher
|
278
|
+
#
|
279
|
+
def self.build_match_hash(spec_hash)
  # Build a new hash with the same keys as +spec_hash+ whose values
  # are the result of build_parse_tree for each spec. The input hash
  # is not modified.
  spec_hash.inject({}) do |built, (attr, spec)|
    built[attr] = build_parse_tree(spec)
    built
  end
end
|
286
|
+
|
287
|
+
#
|
288
|
+
# recursively build a tree of matchers
|
289
|
+
#
|
290
|
+
def self.build_parse_tree(spec)
  # Turn a match specification into a matcher object, recursing into
  # nested specifications:
  #
  #   nil               => nil
  #   Matcher/MatchHash => returned as is (already built)
  #   Hash              => MatchHash over the recursively built values
  #   Array             => nil if empty, else a MatchArray on the first
  #                        member (as a string) whose nested matcher is
  #                        built from the optional second member
  #   String            => MatchFirstElement on that selector
  #   Proc              => MatchProc with no selector
  #   Regexp            => MatchRegexp with no selector, returning the
  #                        first capture
  #
  # Raises a RuntimeError for an Array with more than two members and
  # for any other kind of spec.
  case spec
  when nil then nil
  # MatchHash is not a Matcher subclass, so it is listed explicitly
  when Matcher, MatchHash then spec
  when Hash then MatchHash.new(build_match_hash(spec))
  when Array
    return nil if spec.empty?
    raise "Array spec must be a single selector or a selector and another match specification" unless spec.length <= 2
    MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
  when String then MatchFirstElement.new(spec)
  when Proc   then MatchProc.new(nil, spec)
  when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
  else raise "Don't know how to parse #{spec.inspect}"
  end
end
|
305
|
+
end
|
306
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module IMW
|
2
|
+
module Parsers
|
3
|
+
|
4
|
+
# This is an abstract class for a line-oriented parser intended to
|
5
|
+
# read and emit lines sequentially from a file.
|
6
|
+
#
|
7
|
+
# To leverage the functionality of this class, subclass it and
|
8
|
+
# define a +parse_line+ method.
|
9
|
+
class LineParser

  # The number of lines to skip on each file parsed.
  attr_accessor :skip_first

  # The class to parse each line into.  The +new+ method of this
  # class must accept the value returned by +parse_line+.
  attr_accessor :klass

  # Recognised options:
  #
  # <tt>:skip_first</tt>:: number of lines to skip at the beginning
  #                        of each file (defaults to 0)
  # <tt>:of</tt> / <tt>:klass</tt>:: class to wrap each parsed line in
  def initialize(options = {})
    @skip_first = options[:skip_first] || 0
    @klass      = options[:of] || options[:klass]
  end

  # Parse the given +file+ after skipping +skip_first+ lines.
  #
  # If the option <tt>:lines</tt> is passed then exactly that many
  # lines are read with +readline+ (so an EOFError is raised if the
  # file runs out first); otherwise every remaining line is read.
  #
  # Each line goes through +parse_line+ and, if this parser has a
  # +klass+, the result is wrapped with <tt>klass.new</tt>.
  #
  # Given a block, each record is yielded to it. Without a block an
  # array of the records is returned -- this now also holds for the
  # <tt>:lines</tt> + +klass+ combination, which used to return the
  # line count instead of the records.
  def parse!(file, options = {}, &block)
    skip_lines!(file)
    if options[:lines]
      if block_given?
        options[:lines].times { yield build_record(file.readline) }
      else
        options[:lines].times.map { build_record(file.readline) }
      end
    elsif block_given?
      file.each { |line| yield build_record(line) }
    else
      file.map { |line| build_record(line) }
    end
  end

  # Abstract: subclasses redefine this to turn one +line+ into a
  # parsed value. The base implementation always raises
  # IMW::NotImplementedError.
  def parse_line(line)
    raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
  end

  protected

  # Consume +skip_first+ lines from +file+.
  def skip_lines!(file)
    skip_first.times { file.readline }
  end

  # Parse one +line+ and wrap the result in +klass+ when one is set.
  def build_record(line)
    parsed = parse_line(line)
    klass ? klass.new(parsed) : parsed
  end
end
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'imw/parsers/line_parser'
|
2
|
+
|
3
|
+
module IMW
|
4
|
+
module Parsers
|
5
|
+
|
6
|
+
# A RegexpParser is a line-oriented parser which uses a regular
|
7
|
+
# expression to extract data from a line into either a hash or an
|
8
|
+
# object obeying hash semantics.
|
9
|
+
#
|
10
|
+
# As an example, a flat file with one record per line in the
|
11
|
+
# following format (this is a simplified version of common
|
12
|
+
# webserver log formats)
|
13
|
+
#
|
14
|
+
# 151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
|
15
|
+
# 81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
|
16
|
+
# 81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
|
17
|
+
# ...
|
18
|
+
#
|
19
|
+
# could be parsed as follows
|
20
|
+
#
|
21
|
+
# file = File.new '/path/to/file.log'
|
22
|
+
# parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
|
23
|
+
# :into_fields => [:ip, :timestamp, :verb, :url, :version]
|
24
|
+
#   parser.parse! file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
|
25
|
+
#
|
26
|
+
# Consecutive captures from the regular expression will be pushed
|
27
|
+
# into a hash with keys given by the +into_fields+ property of
|
28
|
+
# this parser.
|
29
|
+
#
|
30
|
+
# If the parser is instantiated with the <tt>:of</tt> keyword then
|
31
|
+
# the parsed hash from each line is used to instantiate a new
|
32
|
+
# object of the corresponding class:
|
33
|
+
#
|
34
|
+
# require 'ostruct'
|
35
|
+
#
|
36
|
+
# PageView = Class.new(OpenStruct)
|
37
|
+
#
|
38
|
+
# parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
|
39
|
+
# :into_fields => [:ip, :timestamp, :verb, :url, :version],
|
40
|
+
# :of => PageView
|
41
|
+
#
|
42
|
+
# parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
|
43
|
+
#
|
44
|
+
# The option <tt>:strictly</tt> can also be set to force the
|
45
|
+
# parser to raise an error if it finds a line which doesn't match
|
46
|
+
# its regexp.
|
47
|
+
class RegexpParser < LineParser
  # +regexp+ is matched against each line, +fields+ names the
  # captures in order, and +strict+ makes unparseable lines an error.
  attr_accessor :regexp, :fields, :strict

  # Recognised options (each has two spellings), in addition to
  # those of LineParser:
  #
  # <tt>:regexp</tt> / <tt>:by_regexp</tt>::   the regular expression
  # <tt>:fields</tt> / <tt>:into_fields</tt>:: keys for the captures
  # <tt>:strict</tt> / <tt>:strictly</tt>::    raise on non-matching lines
  def initialize(options = {})
    @regexp = options[:regexp] || options[:by_regexp]
    @fields = options[:fields] || options[:into_fields]
    @strict = options[:strict] || options[:strictly]
    super(options)
  end

  # Match +regexp+ against +line+ (with its trailing newline removed)
  # and return a hash mapping <tt>fields[i]</tt> to the i-th capture.
  #
  # When the line does not match, an empty hash is returned unless
  # +strict+ is set, in which case IMW::ParseError is raised.
  def parse_line(line)
    match_data = regexp.match(line.chomp)
    hsh = {}
    if match_data
      match_data.captures.each_with_index do |capture, index|
        hsh[fields[index]] = capture
      end
    elsif strict
      raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}")
    end
    hsh
  end
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|