imw 0.2.18 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,289 +0,0 @@
|
|
1
|
-
require 'imw/utils/extensions/hpricot'
|
2
|
-
|
3
|
-
module IMW
|
4
|
-
module Parsers
|
5
|
-
module HtmlMatchers
|
6
|
-
|
7
|
-
# Abstract base class for the HTML matchers in this module.
#
# An instance holds three things:
#
# * +selector+ -- an HTML path specification used to collect elements
#   from a document,
# * +matcher+  -- an optional downstream matcher applied to whatever the
#   selector collected,
# * +options+  -- a hash of options interpreted by the concrete subclass.
#
# Subclasses decide how the +selector+ collects elements by overriding
# +match+.
class Matcher
  attr_accessor :selector, :matcher, :options

  # Remember the +selector+, the optional +matcher+ and the +options+
  # hash.  Assignment goes through the accessors.
  def initialize(selector, matcher = nil, options = {})
    self.selector, self.matcher, self.options = selector, matcher, options
  end

  # Always raises a RuntimeError naming the receiving class: the base
  # class has no matching behaviour of its own.
  def match(doc)
    raise "Abstract class #{self.class}"
  end
end
|
32
|
-
|
33
|
-
# Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> that
# works on the first element of a document matching the selector.
class MatchFirstElement < Matcher
  # Find the first element of +doc+ matching +selector+ (a String +doc+
  # is parsed with Hpricot first).
  #
  # Returns +nil+ when nothing matches.  Otherwise returns, in order of
  # precedence: the result of +matcher+ applied to the element when a
  # +matcher+ was given; the element itself when <tt>options[:html]</tt>
  # is set; else the element's inner text with surrounding whitespace
  # stripped.
  #
  #   m = MatchFirstElement.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #   # => 'My Homepage'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    element = doc.at(selector)
    return nil unless element
    return matcher.match(element) if matcher
    return element if options[:html]
    element.inner_text.strip
  end
end
|
55
|
-
|
56
|
-
# Matcher that post-processes the result of MatchFirstElement with an
# arbitrary callable.
#
# FIXME is there really a need for this separate class?  why can't
# MatchFirstElement.match accept a block?
class MatchProc < MatchFirstElement
  attr_accessor :proc

  # +proc+ is any object responding to +call+; the remaining arguments
  # are the ones accepted by Matcher (+options+ is inherited from it).
  def initialize(selector, proc, matcher = nil, options = {})
    super(selector, matcher, options)
    self.proc = proc
  end

  # Run the MatchFirstElement match and hand its result to +proc+.  When
  # that result is +nil+ or +false+ the (possibly Hpricot-parsed)
  # document itself is not available here, so the +doc+ argument as
  # received is passed to +proc+ instead.
  def match(doc)
    found = super(doc)
    self.proc.call(found ? found : doc)
  end
end
|
71
|
-
|
72
|
-
# Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> that
# works on every element of a document matching the selector.
class MatchArray < Matcher
  # Collect each element of +doc+ matching +selector+ (a String +doc+ is
  # parsed with Hpricot first) and return an array with one entry per
  # element:
  #
  # * the result of +matcher+ applied to the element, when a +matcher+
  #   was given;
  # * the element itself, when <tt>options[:html]</tt> is set;
  # * otherwise the element's stripped inner text.
  #
  #   m = MatchArray.new('span#bio/a.homepage')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
  #            <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
  #   # => ["My Homepage", "Your Homepage", "Their Homepage"]
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    elements = doc / selector
    return nil unless elements
    return elements.map { |element| matcher.match(element) } if matcher
    return elements.map { |element| element } if options[:html]
    elements.map { |element| element.inner_text.strip }
  end
end
|
100
|
-
|
101
|
-
# Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> that
# extracts an attribute value from a document via the selector.
class MatchAttribute < Matcher
  attr_accessor :attribute

  # Unlike <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>, this class takes
  # three arguments: the +selector+ which collects elements from an HTML
  # document, the +attribute+ to extract (stored as a String), and
  # optionally a +matcher+ applied to the extracted value.
  def initialize(selector, attribute, matcher = nil)
    super(selector, matcher)
    self.attribute = attribute.to_s
  end

  # Look up +attribute+ at +selector+ in +doc+ using +path_attr+ (a
  # String +doc+ is parsed with Hpricot first).  Returns the +matcher+'s
  # match against that value when a +matcher+ was given, else the value.
  #
  #   m = MatchAttribute.new('span#bio/a.homepage', 'href')
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
  #   # => 'http://foo.bar'
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    value = doc.path_attr(selector, attribute)
    return value unless matcher
    matcher.match(value)
  end
end
|
133
|
-
|
134
|
-
# Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt> that
# applies a regular expression to text taken from an HTML document.
class MatchRegexp < Matcher
  attr_accessor :re

  # +re+ is the regular expression to apply to the text selected by
  # +selector+ (when +selector+ is +nil+ the whole document is used).
  # With the <tt>:capture</tt> option, +match+ returns that capture
  # group (regexp numbering: 1 is the first group); without it, all
  # captures.  An optional +matcher+ is applied to the capture(s).
  #
  # FIXME Shouldn't the matcher come BEFORE the regexp capture,
  # not after?
  def initialize(selector, re, matcher = nil, options = {})
    super(selector, matcher, options)
    self.re = re
  end

  # Match +re+ against the text of +doc+ (a String +doc+ is parsed with
  # Hpricot first), narrowed by +selector+ when one was given.  The
  # captured value is +nil+ when the regexp does not match, a single
  # string when <tt>:capture</tt> was given, else the array of captures.
  # It is passed through +matcher+ when one was given.
  #
  #   m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
  #   m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
  #   # => "John Chimpo"
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    target = selector ? doc.contents_of(selector) : doc
    match_data = re.match(target.to_s)
    captured =
      if match_data.nil?
        nil
      elsif self.options.key?(:capture)
        # :capture counts from 1, the captures array from 0
        match_data.captures[self.options[:capture] - 1]
      else
        match_data.captures
      end
    return captured unless matcher
    matcher.match(captured)
  end
end
|
183
|
-
|
184
|
-
|
185
|
-
# Matcher that collects every occurrence of a regular expression in the
# text of a document (or of the part of it picked out by the selector).
class MatchRegexpRepeatedly < Matcher
  attr_accessor :re

  # +re+ is the regular expression to scan with; +selector+ and the
  # optional +matcher+ are handled as in Matcher.
  def initialize(selector, re, matcher = nil)
    super(selector, matcher)
    self.re = re
  end

  # Scan the text of +doc+ (a String +doc+ is parsed with Hpricot first),
  # narrowed by +selector+ when one was given, for all matches of +re+.
  # Returns +nil+ when the selector yields nothing.  The scan result is
  # flattened when its first entry has length 1 (i.e. one capture group
  # per match), then passed through +matcher+ when one was given.
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    target = selector ? doc.contents_of(selector) : doc
    return unless target
    hits = target.to_s.scan(re)
    head = hits.first
    hits = hits.flatten if head && head.length == 1
    return hits unless matcher
    matcher.match(hits)
  end
end
|
204
|
-
|
205
|
-
# Builds a hash of values by running a set of named matchers against an
# HTML document.
class MatchHash
  attr_accessor :match_hash

  # +match_hash+ must be a +Hash+ mapping keys to HTML matchers (objects
  # responding to +match+, typically subclasses of
  # <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>).  Anything else raises
  # a RuntimeError.
  def initialize(match_hash)
    # Kludge? maybe.
    unless match_hash.is_a?(Hash)
      raise "MatchHash requires a hash of :attributes => matchers."
    end
    self.match_hash = match_hash
  end

  # Run every matcher in +match_hash+ against +doc+ (a String +doc+ is
  # parsed with Hpricot first) and collect the results under the
  # corresponding keys.
  #
  # When a key is itself an Array, the matcher's result is zipped with
  # that array via <tt>Hash.zip</tt>, pairs with +nil+ values are
  # dropped, and the rest is merged into the result (nothing is merged
  # when the matcher returned +nil+ or +false+).
  #
  #   m = MatchHash.new({
  #     :name                   => MatchFirstElement.new('li/span.customer'),
  #     :order_status           => MatchAttribute.new('li/ul[@status]','status'),
  #     :products               => MatchArray.new('li/ul/li')
  #   })
  #   m.match('<li><span class="customer">John Chimpo</span>
  #            <ul status="shipped">
  #              <li>bananas</li>
  #              <li>mangos</li>
  #              <li>banangos</li>
  #            </ul></li>')
  #   # => {
  #     :name         => "John Chimpo",
  #     :order_status => "shipped",
  #     :products     => ["bananas", "mangos", "banangos"]
  #   }
  def match(doc)
    doc = Hpricot(doc) if doc.is_a?(String)
    collected = {}
    match_hash.each do |attr, attr_matcher|
      value = attr_matcher.match(doc)
      if attr.is_a?(Array)
        collected.merge!(Hash.zip(attr, value).reject { |_key, v| v.nil? }) if value
      else
        collected[attr] = value
      end
    end
    self.class.scrub!(collected)
  end

  # Hook intended to kill off keys with nil values.  The rejection is
  # currently disabled, so the hash is returned unchanged.
  def self.scrub!(hsh)
    hsh # .reject{|k,v| v.nil? }
  end
end
|
256
|
-
|
257
|
-
#
# Construct the downstream part of a hash matcher: returns a new hash
# with the keys of +spec_hash+, each mapped to the parse tree built from
# its spec by +build_parse_tree+.
#
def self.build_match_hash(spec_hash)
  spec_hash.inject({}) do |matchers, (attr, spec)|
    matchers[attr] = build_parse_tree(spec)
    matchers
  end
end
|
267
|
-
|
268
|
-
#
# Recursively build a tree of matchers from +spec+:
#
# * +nil+     -> +nil+
# * a Matcher -> returned as is
# * a Hash    -> MatchHash over the matchers built from its values
# * an Array  -> +nil+ when empty; otherwise a MatchArray on the first
#   entry (as a String selector) whose downstream matcher is built from
#   the optional second entry; more than two entries is an error
# * a String  -> MatchFirstElement with that selector
# * a Proc    -> MatchProc without a selector
# * a Regexp  -> MatchRegexp without a selector, returning capture 1
# * a Symbol  -> MatchAttribute without a selector
#
# Anything else raises a RuntimeError.
#
def self.build_parse_tree(spec)
  case spec
  when nil
    nil
  when Matcher
    spec
  when Hash
    MatchHash.new(build_match_hash(spec))
  when Array
    if spec.empty?
      nil
    elsif spec.length > 2
      raise "Array spec must be a single selector or a selector and another match specification"
    else
      selector, downstream = spec
      MatchArray.new(selector.to_s, build_parse_tree(downstream))
    end
  when String
    MatchFirstElement.new(spec)
  when Proc
    MatchProc.new(nil, spec)
  when Regexp
    MatchRegexp.new(nil, spec, nil, :capture => 1)
  when Symbol
    MatchAttribute.new(nil, spec, nil)
  else
    raise "Don't know how to parse #{spec.inspect}"
  end
end
|
287
|
-
end
|
288
|
-
end
|
289
|
-
end
|
@@ -1,87 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
module Parsers
|
3
|
-
|
4
|
-
# This is an abstract class for a line-oriented parser intended to
# read and emit lines sequentially from a file.
#
# To leverage the functionality of this class, subclass it and
# define a +parse_line+ method.
class LineParser

  # The number of lines to skip on each file parsed.
  attr_accessor :skip_first

  # The class to parse each line into.  The +new+ method of this
  # class must accept the value returned by +parse_line+ (a hash, by
  # convention).
  attr_accessor :klass

  # Options:
  #
  # <tt>:skip_first</tt>:: number of lines to skip at the beginning of
  #                        each file when parsing (default 0).
  # <tt>:of</tt> / <tt>:klass</tt>:: class to instantiate from each
  #                                  parsed line (<tt>:of</tt> wins when
  #                                  both are given).
  def initialize(options = {})
    @skip_first = options[:skip_first] || 0
    @klass      = options[:of] || options[:klass]
  end

  # Parse the given +file+ after skipping +skip_first+ lines.
  #
  # If the option <tt>:lines</tt> is passed in then exactly that many
  # lines are read with +readline+ (so running out of input raises
  # whatever +readline+ raises at end of file); otherwise every
  # remaining line is read with +each+.
  #
  # Each line goes through +parse_line+ and, if this parser has a
  # +klass+, the parsed value is wrapped as <tt>klass.new(parsed)</tt>.
  #
  # If given a block then each result is yielded to the block; else an
  # array of all results is returned.  (Previously the combination of
  # <tt>:lines</tt> and +klass+ without a block discarded the results
  # and returned the line count instead of the array.)
  def parse!(file, options = {}, &block)
    skip_lines!(file)
    limit = options[:lines]
    if block_given?
      each_raw_line(file, limit) { |line| yield build_record(line) }
    else
      records = []
      each_raw_line(file, limit) { |line| records << build_record(line) }
      records
    end
  end

  # Turn one raw line into a parsed value.  Must be redefined by
  # subclasses; the base implementation always raises.
  def parse_line(line)
    raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
  end

  protected

  # Consume and discard the first +skip_first+ lines of +file+.
  def skip_lines!(file)
    skip_first.times { file.readline }
  end

  # Yield raw lines from +file+: exactly +limit+ of them when +limit+ is
  # given, otherwise all that remain.
  def each_raw_line(file, limit)
    if limit
      limit.times { yield file.readline }
    else
      file.each { |line| yield line }
    end
  end

  # Parse +line+ and wrap the result in +klass+ when one is configured.
  def build_record(line)
    parsed = parse_line(line)
    klass ? klass.new(parsed) : parsed
  end
end
|
86
|
-
end
|
87
|
-
end
|
@@ -1,72 +0,0 @@
|
|
1
|
-
require 'imw/parsers/line_parser'
|
2
|
-
|
3
|
-
module IMW
|
4
|
-
module Parsers
|
5
|
-
|
6
|
-
# A RegexpParser is a line-oriented parser which uses a regular
# expression to extract data from a line into either a hash or an
# object obeying hash semantics.
#
# As an example, a flat file with one record per line in the
# following format (this is a simplified version of common
# webserver log formats)
#
#   151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
#   81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
#   81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
#   ...
#
# could be parsed as follows
#
#   file   = File.new '/path/to/file.log'
#   parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
#                                           :into_fields => [:ip, :timestamp, :verb, :url, :version]
#   parser.parse! file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
#
# Consecutive captures from the regular expression will be pushed
# into a hash with keys given by the +fields+ property of this
# parser (set with <tt>:fields</tt> or <tt>:into_fields</tt>).
#
# If the parser is instantiated with the <tt>:of</tt> keyword then
# the parsed hash from each line is used to instantiate a new
# object of the corresponding class:
#
#   require 'ostruct'
#
#   PageView = Class.new(OpenStruct)
#
#   parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
#                                           :into_fields => [:ip, :timestamp, :verb, :url, :version],
#                                           :of => PageView
#
#   parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
#
# The option <tt>:strictly</tt> (or <tt>:strict</tt>) can also be set
# to force the parser to raise an error if it finds a line which
# doesn't match its regexp; without it such a line parses to an empty
# hash.
class RegexpParser < LineParser
  attr_accessor :regexp, :fields, :strict

  # Accepts <tt>:regexp</tt>/<tt>:by_regexp</tt>,
  # <tt>:fields</tt>/<tt>:into_fields</tt> and
  # <tt>:strict</tt>/<tt>:strictly</tt> (the first name of each pair
  # wins), then hands the same options to LineParser.
  def initialize(options = {})
    @regexp = options[:regexp] || options[:by_regexp]
    @fields = options[:fields] || options[:into_fields]
    @strict = options[:strict] || options[:strictly]
    super(options)
  end

  # Match the chomped +line+ against +regexp+ and return a hash mapping
  # <tt>fields[i]</tt> to the i-th capture.  A non-matching line yields
  # an empty hash, or raises IMW::ParseError when +strict+ is set.
  def parse_line(line)
    match_data = regexp.match(line.chomp)
    if match_data.nil?
      raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}") if strict
      return {}
    end
    record = {}
    match_data.captures.each_with_index do |capture, index|
      record[fields[index]] = capture
    end
    record
  end
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|