imw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
#
|
|
5
|
+
# h2. lib/imw/parsers/html_parser/matchers.rb -- utility classes for html parser
|
|
6
|
+
#
|
|
7
|
+
# == About
|
|
8
|
+
#
|
|
9
|
+
# This file defines the <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
|
10
|
+
# abstract class and some concrete subclasses which perform specific
|
|
11
|
+
# kinds of matches against HTML documents using the
|
|
12
|
+
# Hpricot[https://code.whytheluckystiff.net/hpricot/] library.
|
|
13
|
+
#
|
|
14
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
15
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
16
|
+
# License:: GPL 3.0
|
|
17
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
18
|
+
#
|
|
19
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
20
|
+
|
|
21
|
+
require 'imw/utils/extensions/hpricot'
|
|
22
|
+
|
|
23
|
+
module IMW
|
|
24
|
+
module HTMLParserMatcher
|
|
25
|
+
|
|
26
|
+
# An abstract class from which to subclass specific HTML matchers.
|
|
27
|
+
#
|
|
28
|
+
# A subclass is initialized with a +selector+ and an optional
|
|
29
|
+
# +matcher+. The +selector+ is an HTML path specification used to
|
|
30
|
+
# collect elements from the document. If initialized with a
|
|
31
|
+
# +matcher+, the +matcher+ is used to return match information
|
|
32
|
+
# from the elements; else the inner HTML is returned. Subclasses
|
|
33
|
+
# decide how the +selector+ will collect elements.
|
|
34
|
+
class Matcher
|
|
35
|
+
|
|
36
|
+
attr_accessor :selector
|
|
37
|
+
attr_accessor :matcher
|
|
38
|
+
attr_accessor :options
|
|
39
|
+
|
|
40
|
+
def initialize selector, matcher=nil, options={}
|
|
41
|
+
self.selector = selector
|
|
42
|
+
self.matcher = matcher
|
|
43
|
+
self.options = options
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def match doc
|
|
47
|
+
raise "Abstract class #{self.class}"
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
|
53
|
+
# for matching against the first element of a document matching a
|
|
54
|
+
# selector.
|
|
55
|
+
class MatchFirstElement < Matcher
|
|
56
|
+
# Grab the first element from +doc+ matching the +selector+ this
|
|
57
|
+
# class was initialized with. If initialized with a +matcher+,
|
|
58
|
+
# then return the +matcher+'s match against the first element,
|
|
59
|
+
# else just return the inner HTML of the first element.
|
|
60
|
+
#
|
|
61
|
+
# m = MatchFirstElement.new('span#bio/a.homepage')
|
|
62
|
+
# m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
|
|
63
|
+
# # => 'My Homepage'
|
|
64
|
+
def match doc
|
|
65
|
+
doc = Hpricot(doc) if doc.is_a?(String)
|
|
66
|
+
el = doc.at(selector) or return nil
|
|
67
|
+
if matcher
|
|
68
|
+
matcher.match(el)
|
|
69
|
+
else
|
|
70
|
+
options[:html] ? el.inner_html : el.inner_text.strip
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# FIXME is there really a need for this separate class? why can't
|
|
76
|
+
# MatchFirstElement.match accept a block?
|
|
77
|
+
class MatchProc < MatchFirstElement
|
|
78
|
+
attr_accessor :proc
|
|
79
|
+
attr_accessor :options
|
|
80
|
+
def initialize selector, proc, matcher=nil, options={}
|
|
81
|
+
super selector, matcher
|
|
82
|
+
self.options = options
|
|
83
|
+
self.proc = proc
|
|
84
|
+
end
|
|
85
|
+
def match doc
|
|
86
|
+
val = super doc
|
|
87
|
+
val ? self.proc.call(val) : self.proc.call(doc)
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
|
92
|
+
# for matching each element of a document matching a selector.
|
|
93
|
+
class MatchArray < Matcher
|
|
94
|
+
# Grab each element from +doc+ matching the +selector+ this
|
|
95
|
+
# class was initialized with. If initialized with a +matcher+,
|
|
96
|
+
# then return an array consisting of the +matcher+'s match
|
|
97
|
+
# against each element, else just return an array consisting of
|
|
98
|
+
# the inner HTML of each element.
|
|
99
|
+
#
|
|
100
|
+
# m = MatchArray.new('span#bio/a.homepage')
|
|
101
|
+
# m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
|
|
102
|
+
# <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
|
|
103
|
+
# <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
|
|
104
|
+
# # => ["My Homepage", "Your Homepage", "Their Homepage"]
|
|
105
|
+
def match doc
|
|
106
|
+
doc = Hpricot(doc) if doc.is_a?(String)
|
|
107
|
+
subdoc = (doc/selector) or return nil
|
|
108
|
+
if matcher
|
|
109
|
+
subdoc.map{|el| matcher.match(el)}
|
|
110
|
+
else
|
|
111
|
+
if options[:html]
|
|
112
|
+
subdoc.map{|el| el.inner_html }
|
|
113
|
+
else
|
|
114
|
+
subdoc.map{|el| el.inner_text.strip }
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
|
121
|
+
# for matching an attribute of the first element of a document
|
|
122
|
+
# matching a selector.
|
|
123
|
+
class MatchAttribute < Matcher
|
|
124
|
+
|
|
125
|
+
attr_accessor :attribute
|
|
126
|
+
|
|
127
|
+
# Unlike <tt>IMW::HTMLParserMatcher::Matcher</tt>,
|
|
128
|
+
# <tt>IMW::HTMLParserMatcher::MatchAttribute</tt> is initialized
|
|
129
|
+
# with three arguments: the +selector+ which collects elements
|
|
130
|
+
# from an HTML document, an +attribute+ to extract, and
|
|
131
|
+
# (optionally) a +matcher+ to perform the matching.
|
|
132
|
+
def initialize selector, attribute, matcher=nil
|
|
133
|
+
super selector, matcher
|
|
134
|
+
self.attribute = attribute.to_s
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Grab the first element from +doc+ matching the +selector+ this
|
|
138
|
+
# class was initialized with. If initialized with a +matcher+,
|
|
139
|
+
# then return the +matcher+'s match against the value of the
|
|
140
|
+
# +attribute+ this class was initialized with, else just return
|
|
141
|
+
# the value of the +attribute+.
|
|
142
|
+
#
|
|
143
|
+
# m = MatchAttribute.new('span#bio/a.homepage', 'href')
|
|
144
|
+
# m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
|
|
145
|
+
# # => 'http://foo.bar'
|
|
146
|
+
def match doc
|
|
147
|
+
doc = Hpricot(doc) if doc.is_a?(String)
|
|
148
|
+
val = doc.path_attr(selector, attribute)
|
|
149
|
+
matcher ? matcher.match(val) : val
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
# Concrete subclass of <tt>IMW::HTMLParserMatcher::Matcher</tt>
|
|
154
|
+
# for using a regular expression to match against text in an HTML
|
|
155
|
+
# document.
|
|
156
|
+
class MatchRegexp < Matcher
|
|
157
|
+
|
|
158
|
+
attr_accessor :re
|
|
159
|
+
attr_accessor :options
|
|
160
|
+
|
|
161
|
+
# Use the regular expression +re+ to return captures from the
|
|
162
|
+
# elements collected by +selector+ (treated as text) used on an
|
|
163
|
+
# HTML document (if +selector+ is +nil+ then match against the
|
|
164
|
+
# full text of the document). If the keyword argument
|
|
165
|
+
# <tt>:capture</tt> is specified then return the corresponding
|
|
166
|
+
# group (indexing is that of regular expressions; "1" is the
|
|
167
|
+
# first capture), else return an array of all captures. If
|
|
168
|
+
# +matcher+, then use it on the capture(s) before returning.
|
|
169
|
+
#
|
|
170
|
+
# FIXME Shouldn't the matcher come BEFORE the regexp capture,
|
|
171
|
+
# not after?
|
|
172
|
+
def initialize selector, re, matcher=nil, options={}
|
|
173
|
+
super selector, matcher
|
|
174
|
+
self.options = options
|
|
175
|
+
self.re = re
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
# Grab the first element from +doc+ matching the +selector+ this
|
|
179
|
+
# object was initialized with. Use the +re+ and the (optional)
|
|
180
|
+
# capture group this object was initialized with to capture a
|
|
181
|
+
# string (or array of strings if no capture group was specified)
|
|
182
|
+
# from the collected element (treated as text). If initialized
|
|
183
|
+
# with a +matcher+, then return the +matcher+'s match against
|
|
184
|
+
# the value of the capture(s), else just return the capture(s).
|
|
185
|
+
#
|
|
186
|
+
# m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
|
|
187
|
+
# m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
|
|
188
|
+
# # => "John Chimpo"
|
|
189
|
+
def match doc
|
|
190
|
+
doc = Hpricot(doc) if doc.is_a?(String)
|
|
191
|
+
el = selector ? doc.contents_of(selector) : doc
|
|
192
|
+
m = re.match(el.to_s)
|
|
193
|
+
val = case
|
|
194
|
+
when m.nil? then nil
|
|
195
|
+
when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
|
|
196
|
+
else m.captures
|
|
197
|
+
end
|
|
198
|
+
# pass to matcher, if any
|
|
199
|
+
matcher ? matcher.match(val) : val
|
|
200
|
+
end
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class MatchRegexpRepeatedly < Matcher
|
|
205
|
+
attr_accessor :re
|
|
206
|
+
def initialize selector, re, matcher=nil
|
|
207
|
+
super selector, matcher
|
|
208
|
+
self.re = re
|
|
209
|
+
end
|
|
210
|
+
def match doc
|
|
211
|
+
doc = Hpricot(doc) if doc.is_a?(String)
|
|
212
|
+
# apply selector, if any
|
|
213
|
+
el = selector ? doc.contents_of(selector) : doc
|
|
214
|
+
return unless el
|
|
215
|
+
# get all matches
|
|
216
|
+
val = el.to_s.scan(re)
|
|
217
|
+
# if there's only one capture group, flatten the array
|
|
218
|
+
val = val.flatten if val.first && val.first.length == 1
|
|
219
|
+
# pass to matcher, if any
|
|
220
|
+
matcher ? matcher.match(val) : val
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
# Class for building a hash of values by using appropriate
|
|
225
|
+
# matchers against an HTML document.
|
|
226
|
+
class MatchHash
|
|
227
|
+
|
|
228
|
+
attr_accessor :match_hash
|
|
229
|
+
|
|
230
|
+
# The +match_hash+ must be a +Hash+ of symbols matched to HTML
|
|
231
|
+
# matchers (subclasses of
|
|
232
|
+
# <tt>IMW::HTMLParserMatcher::Matcher</tt>).
|
|
233
|
+
def initialize match_hash
|
|
234
|
+
# Kludge? maybe.
|
|
235
|
+
raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
|
|
236
|
+
self.match_hash = match_hash
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
# Use the +match_hash+ this +MatchHash+ was initialized with to
|
|
240
|
+
# select elements from +doc+ and extract information from them:
|
|
241
|
+
#
|
|
242
|
+
# m = MatchHash.new({
|
|
243
|
+
# :name => MatchFirstElement.new('li/span.customer'),
|
|
244
|
+
# :order_status => MatchAttribute.new('li/ul[@status]','status'),
|
|
245
|
+
# :products => MatchArray.new('li/ul/li')
|
|
246
|
+
# })
|
|
247
|
+
# m.match('<li><span class="customer">John Chimpo</span>
|
|
248
|
+
# <ul status="shipped">
|
|
249
|
+
# <li>bananas</li>
|
|
250
|
+
# <li>mangos</li>
|
|
251
|
+
# <li>banangos</li>
|
|
252
|
+
# </ul></li>')
|
|
253
|
+
# # => {
|
|
254
|
+
# :name => "John Chimpo",
|
|
255
|
+
# :order_status => "shipped",
|
|
256
|
+
# :products => ["bananas", "mangos", "banangos"]
|
|
257
|
+
# }
|
|
258
|
+
def match doc
|
|
259
|
+
doc = Hpricot(doc) if doc.is_a?(String)
|
|
260
|
+
hsh = { }
|
|
261
|
+
match_hash.each do |attr, m|
|
|
262
|
+
val = m.match(doc)
|
|
263
|
+
case attr
|
|
264
|
+
when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
|
|
265
|
+
else hsh[attr] = val end
|
|
266
|
+
end
|
|
267
|
+
self.class.scrub!(hsh)
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
# kill off keys with nil values
|
|
271
|
+
def self.scrub! hsh
|
|
272
|
+
hsh # .reject{|k,v| v.nil? }
|
|
273
|
+
end
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
#
|
|
277
|
+
# construct the downstream part of a hash matcher
|
|
278
|
+
#
|
|
279
|
+
def self.build_match_hash spec_hash
|
|
280
|
+
hsh = { }
|
|
281
|
+
spec_hash.each do |attr, spec|
|
|
282
|
+
hsh[attr] = build_parse_tree(spec)
|
|
283
|
+
end
|
|
284
|
+
hsh
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
#
|
|
288
|
+
# recursively build a tree of matchers
|
|
289
|
+
#
|
|
290
|
+
def self.build_parse_tree spec
|
|
291
|
+
case spec
|
|
292
|
+
when nil then nil
|
|
293
|
+
when Matcher then spec
|
|
294
|
+
when Hash then MatchHash.new(build_match_hash(spec))
|
|
295
|
+
when Array then
|
|
296
|
+
return nil if spec.empty?
|
|
297
|
+
raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
|
|
298
|
+
MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
|
|
299
|
+
when String then MatchFirstElement.new(spec)
|
|
300
|
+
when Proc then MatchProc.new(nil, spec)
|
|
301
|
+
when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
|
|
302
|
+
else raise "Don't know how to parse #{spec.inspect}"
|
|
303
|
+
end
|
|
304
|
+
end
|
|
305
|
+
end
|
|
306
|
+
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Parsers
|
|
3
|
+
|
|
4
|
+
# This is an abstract class for a line-oriented parser intended to
|
|
5
|
+
# read and emit lines sequentially from a file.
|
|
6
|
+
#
|
|
7
|
+
# To leverage the functionality of this class, subclass it and
|
|
8
|
+
# define a +parse_line+ method.
|
|
9
|
+
class LineParser
|
|
10
|
+
|
|
11
|
+
# The number of lines to skip on each file parsed.
|
|
12
|
+
attr_accessor :skip_first
|
|
13
|
+
|
|
14
|
+
# The class to parse each line into. The +new+ method of this
|
|
15
|
+
# class must accept a hash.
|
|
16
|
+
attr_accessor :klass
|
|
17
|
+
|
|
18
|
+
# If called with the option <tt>:skip_first</tt> then skip the
|
|
19
|
+
# corresponding number of lines at the beginning of the file when
|
|
20
|
+
# parsing.
|
|
21
|
+
def initialize options={}
|
|
22
|
+
@skip_first = options[:skip_first] || 0
|
|
23
|
+
@klass = options[:of] || options[:klass]
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Parse the given file. If the option <tt>:lines</tt> is passed
|
|
27
|
+
# in then only parse that many lines. If given a block then
|
|
28
|
+
# yield the result of each line to the block; else just return
|
|
29
|
+
# an array of results.
|
|
30
|
+
#
|
|
31
|
+
# If this parser has a +klass+ attribute then each parsed line
|
|
32
|
+
# will first be turned into an instance of that class (the class
|
|
33
|
+
# must accept a hash of values in its initializer).
|
|
34
|
+
def parse! file, options={}, &block
|
|
35
|
+
skip_lines!(file)
|
|
36
|
+
if options[:lines]
|
|
37
|
+
case
|
|
38
|
+
when klass && block_given?
|
|
39
|
+
options[:lines].times do
|
|
40
|
+
yield klass.new(parse_line(file.readline))
|
|
41
|
+
end
|
|
42
|
+
when block_given?
|
|
43
|
+
options[:lines].times do
|
|
44
|
+
yield parse_line(file.readline)
|
|
45
|
+
end
|
|
46
|
+
when klass
|
|
47
|
+
options[:lines].times do
|
|
48
|
+
klass.new(parse_line(file.readline))
|
|
49
|
+
end
|
|
50
|
+
else
|
|
51
|
+
options[:lines].times.map do
|
|
52
|
+
parse_line(file.readline)
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
else
|
|
56
|
+
case
|
|
57
|
+
when klass && block_given?
|
|
58
|
+
file.each do |line|
|
|
59
|
+
yield klass.new(parse_line(line))
|
|
60
|
+
end
|
|
61
|
+
when block_given?
|
|
62
|
+
file.each do |line|
|
|
63
|
+
yield parse_line(line)
|
|
64
|
+
end
|
|
65
|
+
when klass
|
|
66
|
+
file.map do |line|
|
|
67
|
+
klass.new(parse_line(line))
|
|
68
|
+
end
|
|
69
|
+
else
|
|
70
|
+
file.map do |line|
|
|
71
|
+
parse_line(line)
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def parse_line line
|
|
78
|
+
raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
protected
|
|
82
|
+
def skip_lines! file
|
|
83
|
+
skip_first.times { file.readline }
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
require 'imw/parsers/line_parser'
|
|
2
|
+
|
|
3
|
+
module IMW
|
|
4
|
+
module Parsers
|
|
5
|
+
|
|
6
|
+
# A RegexpParser is a line-oriented parser which uses a regular
|
|
7
|
+
# expression to extract data from a line into either a hash or an
|
|
8
|
+
# object obeying hash semantics.
|
|
9
|
+
#
|
|
10
|
+
# As an example, a flat file with one record per line in the
|
|
11
|
+
# following format (this is a simplified version of common
|
|
12
|
+
# webserver log formats)
|
|
13
|
+
#
|
|
14
|
+
# 151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
|
|
15
|
+
# 81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
|
|
16
|
+
# 81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
|
|
17
|
+
# ...
|
|
18
|
+
#
|
|
19
|
+
# could be parsed as follows
|
|
20
|
+
#
|
|
21
|
+
# file = File.new '/path/to/file.log'
|
|
22
|
+
# parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
|
|
23
|
+
# :into_fields => [:ip, :timestamp, :verb, :url, :version]
|
|
24
|
+
# parser.parse! file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
|
|
25
|
+
#
|
|
26
|
+
# Consecutive captures from the regular expression will be pushed
|
|
27
|
+
# into a hash with keys given by the +into_fields+ property of
|
|
28
|
+
# this parser.
|
|
29
|
+
#
|
|
30
|
+
# If the parser is instantiated with the <tt>:of</tt> keyword then
|
|
31
|
+
# the parsed hash from each line is used to instantiate a new
|
|
32
|
+
# object of the corresponding class:
|
|
33
|
+
#
|
|
34
|
+
# require 'ostruct'
|
|
35
|
+
#
|
|
36
|
+
# PageView = Class.new(OpenStruct)
|
|
37
|
+
#
|
|
38
|
+
# parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
|
|
39
|
+
# :into_fields => [:ip, :timestamp, :verb, :url, :version],
|
|
40
|
+
# :of => PageView
|
|
41
|
+
#
|
|
42
|
+
# parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
|
|
43
|
+
#
|
|
44
|
+
# The option <tt>:strictly</tt> can also be set to force the
|
|
45
|
+
# parser to raise an error if it finds a line which doesn't match
|
|
46
|
+
# its regexp.
|
|
47
|
+
class RegexpParser < LineParser
|
|
48
|
+
attr_accessor :regexp, :fields, :strict
|
|
49
|
+
|
|
50
|
+
def initialize options={}
|
|
51
|
+
@regexp = options[:regexp] || options[:by_regexp]
|
|
52
|
+
@fields = options[:fields] || options[:into_fields]
|
|
53
|
+
@strict = options[:strict] || options[:strictly]
|
|
54
|
+
super options
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def parse_line line
|
|
58
|
+
match_data = regexp.match(line.chomp)
|
|
59
|
+
returning({}) do |hsh|
|
|
60
|
+
if match_data
|
|
61
|
+
match_data.captures.each_with_index do |capture, index|
|
|
62
|
+
hsh[fields[index]] = capture
|
|
63
|
+
end
|
|
64
|
+
else
|
|
65
|
+
raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}") if strict
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|