imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,289 +0,0 @@
1
- require 'imw/utils/extensions/hpricot'
2
-
3
- module IMW
4
- module Parsers
5
- module HtmlMatchers
6
-
7
- # An abstract class from which to subclass specific HTML matchers.
8
- #
9
- # A subclass is initialized with a +selector+ and an optional
10
- # +matcher+. The +selector+ is an HTML path specification used to
11
- # collect elements from the document. If initialized with a
12
- # +matcher+, the +matcher+ is used to return match information
13
- # from the elements; else the inner HTML is returned. Subclasses
14
- # decide how the +selector+ will collect elements.
15
- class Matcher
16
-
17
- attr_accessor :selector
18
- attr_accessor :matcher
19
- attr_accessor :options
20
-
21
- def initialize selector, matcher=nil, options={}
22
- self.selector = selector
23
- self.matcher = matcher
24
- self.options = options
25
- end
26
-
27
- def match doc
28
- raise "Abstract class #{self.class}"
29
- end
30
-
31
- end
32
-
33
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
34
- # for matching against the first element of a document matching a
35
- # selector.
36
- class MatchFirstElement < Matcher
37
- # Grab the first element from +doc+ matching the +selector+ this
38
- # class was initialized with. If initialized with a +matcher+,
39
- # then return the +matcher+'s match against the first element,
40
- # else just return the inner HTML of the first element.
41
- #
42
- # m = MatchFirstElement.new('span#bio/a.homepage')
43
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
44
- # # => 'My Homepage'
45
- def match doc
46
- doc = Hpricot(doc) if doc.is_a?(String)
47
- el = doc.at(selector) or return nil
48
- if matcher
49
- matcher.match(el)
50
- else
51
- options[:html] ? el : el.inner_text.strip
52
- end
53
- end
54
- end
55
-
56
- # FIXME is there really a need for this separate class? why can't
57
- # MatchFirstElement.match accept a block?
58
- class MatchProc < MatchFirstElement
59
- attr_accessor :proc
60
- attr_accessor :options
61
- def initialize selector, proc, matcher=nil, options={}
62
- super selector, matcher
63
- self.options = options
64
- self.proc = proc
65
- end
66
- def match doc
67
- val = super doc
68
- val ? self.proc.call(val) : self.proc.call(doc)
69
- end
70
- end
71
-
72
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
73
- # for matching each element of a document matching a selector.
74
- class MatchArray < Matcher
75
- # Grab each element from +doc+ matching the +selector+ this
76
- # class was initialized with. If initialized with a +matcher+,
77
- # then return an array consisting of the +matcher+'s match
78
- # against each element, else just return an array consisting of
79
- # the inner HTML of each element.
80
- #
81
- # m = MatchArray.new('span#bio/a.homepage')
82
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
83
- # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
84
- # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
85
- # # => ["My Homepage", "Your Homepage", "Their Homepage"]
86
- def match doc
87
- doc = Hpricot(doc) if doc.is_a?(String)
88
- subdoc = (doc/selector) or return nil
89
- if matcher
90
- subdoc.map{|el| matcher.match(el)}
91
- else
92
- if options[:html]
93
- subdoc.map{|el| el }
94
- else
95
- subdoc.map{|el| el.inner_text.strip }
96
- end
97
- end
98
- end
99
- end
100
-
101
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
102
- # for matching an attribute of the first element of a document
103
- # matching a selector.
104
- class MatchAttribute < Matcher
105
-
106
- attr_accessor :attribute
107
-
108
- # Unlike <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>,
109
- # <tt>IMW::Parsers::HtmlMatchers::MatchAttribute</tt> is initialized
110
- # with three arguments: the +selector+ which collects elements
111
- # from an HTML document, an +attribute+ to extract, and
112
- # (optionally) a +matcher+ to perform the matching.
113
- def initialize selector, attribute, matcher=nil
114
- super selector, matcher
115
- self.attribute = attribute.to_s
116
- end
117
-
118
- # Grab the first element from +doc+ matching the +selector+ this
119
- # class was initialized with. If initialized with a +matcher+,
120
- # then return the +matcher+'s match against the value of the
121
- # +attribute+ this class was initialized with, else just return
122
- # the value of the +attribute+.
123
- #
124
- # m = MatchAttribute.new('span#bio/a.homepage', 'href')
125
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
126
- # # => 'http://foo.bar'
127
- def match doc
128
- doc = Hpricot(doc) if doc.is_a?(String)
129
- val = doc.path_attr(selector, attribute)
130
- matcher ? matcher.match(val) : val
131
- end
132
- end
133
-
134
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
135
- # for using a regular expression to match against text in an HTML
136
- # document.
137
- class MatchRegexp < Matcher
138
-
139
- attr_accessor :re
140
- attr_accessor :options
141
-
142
- # Use the regular expression +re+ to return captures from the
143
- # elements collected by +selector+ (treated as text) used on an
144
- # HTML document (if +selector+ is +nil+ then match against the
145
- # full text of the document). If the keyword argument
146
- # <tt>:capture</tt> is specified then return the corresponding
147
- # group (indexing is that of regular expressions; "1" is the
148
- # first capture), else return an array of all captures. If
149
- # +matcher+, then use it on the capture(s) before returning.
150
- #
151
- # FIXME Shouldn't the matcher come BEFORE the regexp capture,
152
- # not after?
153
- def initialize selector, re, matcher=nil, options={}
154
- super selector, matcher
155
- self.options = options
156
- self.re = re
157
- end
158
-
159
- # Grab the first element from +doc+ matching the +selector+ this
160
- # object was initialized with. Use the +re+ and the (optional)
161
- # capture group this object was initialized with to capture a
162
- # string (or array of strings if no capture group was specified)
163
- # from the collected element (treated as text). If initialized
164
- # with a +matcher+, then return the +matcher+'s match against
165
- # the value of the capture(s), else just return the capture(s).
166
- #
167
- # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
168
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
169
- # # => "John Chimpo"
170
- def match doc
171
- doc = Hpricot(doc) if doc.is_a?(String)
172
- el = selector ? doc.contents_of(selector) : doc
173
- m = re.match(el.to_s)
174
- val = case
175
- when m.nil? then nil
176
- when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
177
- else m.captures
178
- end
179
- # pass to matcher, if any
180
- matcher ? matcher.match(val) : val
181
- end
182
- end
183
-
184
-
185
- class MatchRegexpRepeatedly < Matcher
186
- attr_accessor :re
187
- def initialize selector, re, matcher=nil
188
- super selector, matcher
189
- self.re = re
190
- end
191
- def match doc
192
- doc = Hpricot(doc) if doc.is_a?(String)
193
- # apply selector, if any
194
- el = selector ? doc.contents_of(selector) : doc
195
- return unless el
196
- # get all matches
197
- val = el.to_s.scan(re)
198
- # if there's only one capture group, flatten the array
199
- val = val.flatten if val.first && val.first.length == 1
200
- # pass to matcher, if any
201
- matcher ? matcher.match(val) : val
202
- end
203
- end
204
-
205
- # Class for building a hash of values by using appropriate
206
- # matchers against an HTML document.
207
- class MatchHash
208
-
209
- attr_accessor :match_hash
210
-
211
- # The +match_hash+ must be a +Hash+ of symbols matched to HTML
212
- # matchers (subclasses of
213
- # <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>).
214
- def initialize match_hash
215
- # Kludge? maybe.
216
- raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
217
- self.match_hash = match_hash
218
- end
219
-
220
- # Use the +match_hash+ this +MatchHash+ was initialized with to
221
- # select elements from +doc+ and extract information from them:
222
- #
223
- # m = MatchHash.new({
224
- # :name => MatchFirstElement.new('li/span.customer'),
225
- # :order_status => MatchAttribute.new('li/ul[@status]','status'),
226
- # :products => MatchArray.new('li/ul/li')
227
- # })
228
- # m.match('<li><span class="customer">John Chimpo</span>
229
- # <ul status="shipped">
230
- # <li>bananas</li>
231
- # <li>mangos</li>
232
- # <li>banangos</li>
233
- # </ul></li>')
234
- # # => {
235
- # :name => "John Chimpo",
236
- # :order_status => "shipped",
237
- # :products => ["bananas", "mangos", "banangos"]
238
- # }
239
- def match doc
240
- doc = Hpricot(doc) if doc.is_a?(String)
241
- hsh = { }
242
- match_hash.each do |attr, m|
243
- val = m.match(doc)
244
- case attr
245
- when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
246
- else hsh[attr] = val end
247
- end
248
- self.class.scrub!(hsh)
249
- end
250
-
251
- # kill off keys with nil values
252
- def self.scrub! hsh
253
- hsh # .reject{|k,v| v.nil? }
254
- end
255
- end
256
-
257
- #
258
- # construct the downstream part of a hash matcher
259
- #
260
- def self.build_match_hash spec_hash
261
- hsh = { }
262
- spec_hash.each do |attr, spec|
263
- hsh[attr] = build_parse_tree(spec)
264
- end
265
- hsh
266
- end
267
-
268
- #
269
- # recursively build a tree of matchers
270
- #
271
- def self.build_parse_tree spec
272
- case spec
273
- when nil then nil
274
- when Matcher then spec
275
- when Hash then MatchHash.new(build_match_hash(spec))
276
- when Array then
277
- return nil if spec.empty?
278
- raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
279
- MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
280
- when String then MatchFirstElement.new(spec)
281
- when Proc then MatchProc.new(nil, spec)
282
- when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
283
- when Symbol then MatchAttribute.new(nil, spec, nil)
284
- else raise "Don't know how to parse #{spec.inspect}"
285
- end
286
- end
287
- end
288
- end
289
- end
@@ -1,87 +0,0 @@
1
- module IMW
2
- module Parsers
3
-
4
- # This is an abstract class for a line-oriented parser intended to
5
- # read and emit lines sequentially from a file.
6
- #
7
- # To leverage the functionality of this class, subclass it and
8
- # define a +parse_line+ method.
9
- class LineParser
10
-
11
- # The number of lines to skip on each file parsed.
12
- attr_accessor :skip_first
13
-
14
- # The class to parse each line into. The +new+ method of this
15
- # class must accept a hash.
16
- attr_accessor :klass
17
-
18
- # If called with the option <tt>:skip_first</tt> then skip the
19
- # corresponding number of lines at the beginning of the file when
20
- # parsing.
21
- def initialize options={}
22
- @skip_first = options[:skip_first] || 0
23
- @klass = options[:of] || options[:klass]
24
- end
25
-
26
- # Parse the given file. If the option <tt>:lines</tt> is passed
27
- # in then only parse that many lines. If given a block then
28
- # yield the result of each line to the block; else just return
29
- # an array of results.
30
- #
31
- # If this parser has a +klass+ attribute then each parsed line
32
- # will first be turned into an instance of that class (the class
33
- # must accept a hash of values in its initializer).
34
- def parse! file, options={}, &block
35
- skip_lines!(file)
36
- if options[:lines]
37
- case
38
- when klass && block_given?
39
- options[:lines].times do
40
- yield klass.new(parse_line(file.readline))
41
- end
42
- when block_given?
43
- options[:lines].times do
44
- yield parse_line(file.readline)
45
- end
46
- when klass
47
- options[:lines].times do
48
- klass.new(parse_line(file.readline))
49
- end
50
- else
51
- options[:lines].times.map do
52
- parse_line(file.readline)
53
- end
54
- end
55
- else
56
- case
57
- when klass && block_given?
58
- file.each do |line|
59
- yield klass.new(parse_line(line))
60
- end
61
- when block_given?
62
- file.each do |line|
63
- yield parse_line(line)
64
- end
65
- when klass
66
- file.map do |line|
67
- klass.new(parse_line(line))
68
- end
69
- else
70
- file.map do |line|
71
- parse_line(line)
72
- end
73
- end
74
- end
75
- end
76
-
77
- def parse_line line
78
- raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
79
- end
80
-
81
- protected
82
- def skip_lines! file
83
- skip_first.times { file.readline }
84
- end
85
- end
86
- end
87
- end
@@ -1,72 +0,0 @@
1
- require 'imw/parsers/line_parser'
2
-
3
- module IMW
4
- module Parsers
5
-
6
- # A RegexpParser is a line-oriented parser which uses a regular
7
- # expression to extract data from a line into either a hash or an
8
- # object obeying hash semantics.
9
- #
10
- # As an example, a flat file with one record per line in the
11
- # following format (this is a simplified version of common
12
- # webserver log formats)
13
- #
14
- # 151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
15
- # 81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
16
- # 81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
17
- # ...
18
- #
19
- # could be parsed as follows
20
- #
21
- # file = File.new '/path/to/file.log'
22
- # parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
23
- # :into_fields => [:ip, :timestamp, :verb, :url, :version]
24
- # parser.parse file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
25
- #
26
- # Consecutive captures from the regular expression will be pushed
27
- # into a hash with keys given by the +into_fields+ property of
28
- # this parser.
29
- #
30
- # If the parser is instantiated with the <tt>:of</tt> keyword then
31
- # the parsed hash from each line is used to instantiate a new
32
- # object of the corresponding class:
33
- #
34
- # require 'ostruct'
35
- #
36
- # PageView = Class.new(OpenStruct)
37
- #
38
- # parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
39
- # :into_fields => [:ip, :timestamp, :verb, :url, :version],
40
- # :of => PageView
41
- #
42
- # parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
43
- #
44
- # The option <tt>:strictly</tt> can also be set to force the
45
- # parser to raise an error if it finds a line which doesn't match
46
- # its regexp.
47
- class RegexpParser < LineParser
48
- attr_accessor :regexp, :fields, :strict
49
-
50
- def initialize options={}
51
- @regexp = options[:regexp] || options[:by_regexp]
52
- @fields = options[:fields] || options[:into_fields]
53
- @strict = options[:strict] || options[:strictly]
54
- super options
55
- end
56
-
57
- def parse_line line
58
- match_data = regexp.match(line.chomp)
59
- {}.tap do |hsh|
60
- if match_data
61
- match_data.captures.each_with_index do |capture, index|
62
- hsh[fields[index]] = capture
63
- end
64
- else
65
- raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}") if strict
66
- end
67
- end
68
- end
69
- end
70
- end
71
- end
72
-
@@ -1,12 +0,0 @@
1
- module IMW
2
-
3
- # A Repository is a collection of datasets. It is used by the
4
- # command-line +imw+ tool.
5
- class Repository < Hash
6
- alias_method :datasets, :values
7
- alias_method :handles, :keys
8
- end
9
-
10
- end
11
-
12
-