imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,289 +0,0 @@
1
- require 'imw/utils/extensions/hpricot'
2
-
3
- module IMW
4
- module Parsers
5
- module HtmlMatchers
6
-
7
- # An abstract class from which to subclass specific HTML matchers.
8
- #
9
- # A subclass is initialized with a +selector+ and an optional
10
- # +matcher+. The +selector+ is an HTML path specification used to
11
- # collect elements from the document. If initialized with a
12
- # +matcher+, the +matcher+ is used to return match information
13
- # from the elements; else the inner HTML is returned. Subclasses
14
- # decide how the +selector+ will collect elements.
15
- class Matcher
16
-
17
- attr_accessor :selector
18
- attr_accessor :matcher
19
- attr_accessor :options
20
-
21
- def initialize selector, matcher=nil, options={}
22
- self.selector = selector
23
- self.matcher = matcher
24
- self.options = options
25
- end
26
-
27
- def match doc
28
- raise "Abstract class #{self.class}"
29
- end
30
-
31
- end
32
-
33
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
34
- # for matching against the first element of a document matching a
35
- # selector.
36
- class MatchFirstElement < Matcher
37
- # Grab the first element from +doc+ matching the +selector+ this
38
- # class was initialized with. If initialized with a +matcher+,
39
- # then return the +matcher+'s match against the first element,
40
- # else just return the stripped inner text of the first element (or the element itself if the <tt>:html</tt> option is set).
41
- #
42
- # m = MatchFirstElement.new('span#bio/a.homepage')
43
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
44
- # # => 'My Homepage'
45
- def match doc
46
- doc = Hpricot(doc) if doc.is_a?(String)
47
- el = doc.at(selector) or return nil
48
- if matcher
49
- matcher.match(el)
50
- else
51
- options[:html] ? el : el.inner_text.strip
52
- end
53
- end
54
- end
55
-
56
- # FIXME is there really a need for this separate class? why can't
57
- # MatchFirstElement.match accept a block?
58
- class MatchProc < MatchFirstElement
59
- attr_accessor :proc
60
- attr_accessor :options
61
- def initialize selector, proc, matcher=nil, options={}
62
- super selector, matcher
63
- self.options = options
64
- self.proc = proc
65
- end
66
- def match doc
67
- val = super doc
68
- val ? self.proc.call(val) : self.proc.call(doc)
69
- end
70
- end
71
-
72
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
73
- # for matching each element of a document matching a selector.
74
- class MatchArray < Matcher
75
- # Grab each element from +doc+ matching the +selector+ this
76
- # class was initialized with. If initialized with a +matcher+,
77
- # then return an array consisting of the +matcher+'s match
78
- # against each element, else just return an array consisting of
79
- # the stripped inner text of each element (or the elements themselves if the <tt>:html</tt> option is set).
80
- #
81
- # m = MatchArray.new('span#bio/a.homepage')
82
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>
83
- # <span id="bio"><a class="homepage" href="http://foo.baz">Your Homepage</a></span>
84
- # <span id="bio"><a class="homepage" href="http://foo.qux">Their Homepage</a></span>')
85
- # # => ["My Homepage", "Your Homepage", "Their Homepage"]
86
- def match doc
87
- doc = Hpricot(doc) if doc.is_a?(String)
88
- subdoc = (doc/selector) or return nil
89
- if matcher
90
- subdoc.map{|el| matcher.match(el)}
91
- else
92
- if options[:html]
93
- subdoc.map{|el| el }
94
- else
95
- subdoc.map{|el| el.inner_text.strip }
96
- end
97
- end
98
- end
99
- end
100
-
101
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
102
- # for matching an attribute of the first element of a document
103
- # matching a selector.
104
- class MatchAttribute < Matcher
105
-
106
- attr_accessor :attribute
107
-
108
- # Unlike <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>,
109
- # <tt>IMW::Parsers::HtmlMatchers::MatchAttribute</tt> is initialized
110
- # with three arguments: the +selector+ which collects elements
111
- # from an HTML document, an +attribute+ to extract, and
112
- # (optionally) a +matcher+ to perform the matching.
113
- def initialize selector, attribute, matcher=nil
114
- super selector, matcher
115
- self.attribute = attribute.to_s
116
- end
117
-
118
- # Grab the first element from +doc+ matching the +selector+ this
119
- # class was initialized with. If initialized with a +matcher+,
120
- # then return the +matcher+'s match against the value of the
121
- # +attribute+ this class was initialized with, else just return
122
- # the value of the +attribute+.
123
- #
124
- # m = MatchAttribute.new('span#bio/a.homepage', 'href')
125
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">My Homepage</a></span>')
126
- # # => 'http://foo.bar'
127
- def match doc
128
- doc = Hpricot(doc) if doc.is_a?(String)
129
- val = doc.path_attr(selector, attribute)
130
- matcher ? matcher.match(val) : val
131
- end
132
- end
133
-
134
- # Concrete subclass of <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>
135
- # for using a regular expression to match against text in an HTML
136
- # document.
137
- class MatchRegexp < Matcher
138
-
139
- attr_accessor :re
140
- attr_accessor :options
141
-
142
- # Use the regular expression +re+ to return captures from the
143
- # elements collected by +selector+ (treated as text) used on an
144
- # HTML document (if +selector+ is +nil+ then match against the
145
- # full text of the document). If the keyword argument
146
- # <tt>:capture</tt> is specified then return the corresponding
147
- # group (indexing is that of regular expressions; "1" is the
148
- # first capture), else return an array of all captures. If
149
- # +matcher+, then use it on the capture(s) before returning.
150
- #
151
- # FIXME Shouldn't the matcher come BEFORE the regexp capture,
152
- # not after?
153
- def initialize selector, re, matcher=nil, options={}
154
- super selector, matcher
155
- self.options = options
156
- self.re = re
157
- end
158
-
159
- # Grab the first element from +doc+ matching the +selector+ this
160
- # object was initialized with. Use the +re+ and the (optional)
161
- # capture group this object was initialized with to capture a
162
- # string (or array of strings if no capture group was specified)
163
- # from the collected element (treated as text). If initialized
164
- # with a +matcher+, then return the +matcher+'s match against
165
- # the value of the capture(s), else just return the capture(s).
166
- #
167
- # m = MatchRegexp.new('span#bio/a.homepage', /Homepage of (.*)$/, nil, :capture => 1 )
168
- # m.match('<span id="bio"><a class="homepage" href="http://foo.bar">Homepage of John Chimpo</a></span>')
169
- # # => "John Chimpo"
170
- def match doc
171
- doc = Hpricot(doc) if doc.is_a?(String)
172
- el = selector ? doc.contents_of(selector) : doc
173
- m = re.match(el.to_s)
174
- val = case
175
- when m.nil? then nil
176
- when self.options.key?(:capture) then m.captures[self.options[:capture] - 1] # -1 to match regexp indexing
177
- else m.captures
178
- end
179
- # pass to matcher, if any
180
- matcher ? matcher.match(val) : val
181
- end
182
- end
183
-
184
-
185
- class MatchRegexpRepeatedly < Matcher
186
- attr_accessor :re
187
- def initialize selector, re, matcher=nil
188
- super selector, matcher
189
- self.re = re
190
- end
191
- def match doc
192
- doc = Hpricot(doc) if doc.is_a?(String)
193
- # apply selector, if any
194
- el = selector ? doc.contents_of(selector) : doc
195
- return unless el
196
- # get all matches
197
- val = el.to_s.scan(re)
198
- # if there's only one capture group, flatten the array
199
- val = val.flatten if val.first && val.first.length == 1
200
- # pass to matcher, if any
201
- matcher ? matcher.match(val) : val
202
- end
203
- end
204
-
205
- # Class for building a hash of values by using appropriate
206
- # matchers against an HTML document.
207
- class MatchHash
208
-
209
- attr_accessor :match_hash
210
-
211
- # The +match_hash+ must be a +Hash+ of symbols matched to HTML
212
- # matchers (subclasses of
213
- # <tt>IMW::Parsers::HtmlMatchers::Matcher</tt>).
214
- def initialize match_hash
215
- # Kludge? maybe.
216
- raise "MatchHash requires a hash of :attributes => matchers." unless match_hash.is_a?(Hash)
217
- self.match_hash = match_hash
218
- end
219
-
220
- # Use the +match_hash+ this +MatchHash+ was initialized with to
221
- # select elements from +doc+ and extract information from them:
222
- #
223
- # m = MatchHash.new({
224
- # :name => MatchFirstElement.new('li/span.customer'),
225
- # :order_status => MatchAttribute.new('li/ul[@status]','status'),
226
- # :products => MatchArray.new('li/ul/li')
227
- # })
228
- # m.match('<li><span class="customer">John Chimpo</span>
229
- # <ul status="shipped">
230
- # <li>bananas</li>
231
- # <li>mangos</li>
232
- # <li>banangos</li>
233
- # </ul></li>')
234
- # # => {
235
- # :name => "John Chimpo",
236
- # :order_status => "shipped",
237
- # :products => ["bananas", "mangos", "banangos"]
238
- # }
239
- def match doc
240
- doc = Hpricot(doc) if doc.is_a?(String)
241
- hsh = { }
242
- match_hash.each do |attr, m|
243
- val = m.match(doc)
244
- case attr
245
- when Array then hsh.merge!(Hash.zip(attr, val).reject{|k,v| v.nil? }) if val
246
- else hsh[attr] = val end
247
- end
248
- self.class.scrub!(hsh)
249
- end
250
-
251
- # meant to kill off keys with nil values; currently a no-op that returns +hsh+ unchanged (the reject is commented out)
252
- def self.scrub! hsh
253
- hsh # .reject{|k,v| v.nil? }
254
- end
255
- end
256
-
257
- #
258
- # construct the downstream part of a hash matcher
259
- #
260
- def self.build_match_hash spec_hash
261
- hsh = { }
262
- spec_hash.each do |attr, spec|
263
- hsh[attr] = build_parse_tree(spec)
264
- end
265
- hsh
266
- end
267
-
268
- #
269
- # recursively build a tree of matchers
270
- #
271
- def self.build_parse_tree spec
272
- case spec
273
- when nil then nil
274
- when Matcher then spec
275
- when Hash then MatchHash.new(build_match_hash(spec))
276
- when Array then
277
- return nil if spec.empty?
278
- raise "Array spec must be a single selector or a selector and another match specification" unless (spec.length <= 2)
279
- MatchArray.new(spec[0].to_s, build_parse_tree(spec[1]))
280
- when String then MatchFirstElement.new(spec)
281
- when Proc then MatchProc.new(nil, spec)
282
- when Regexp then MatchRegexp.new(nil, spec, nil, :capture => 1)
283
- when Symbol then MatchAttribute.new(nil, spec, nil)
284
- else raise "Don't know how to parse #{spec.inspect}"
285
- end
286
- end
287
- end
288
- end
289
- end
@@ -1,87 +0,0 @@
1
- module IMW
2
- module Parsers
3
-
4
- # This is an abstract class for a line-oriented parser intended to
5
- # read and emit lines sequentially from a file.
6
- #
7
- # To leverage the functionality of this class, subclass it and
8
- # define a +parse_line+ method.
9
- class LineParser
10
-
11
- # The number of lines to skip on each file parsed.
12
- attr_accessor :skip_first
13
-
14
- # The class to parse each line into. The +new+ method of this
15
- # class must accept a hash.
16
- attr_accessor :klass
17
-
18
- # If called with the option <tt>:skip_first</tt> then skip the
19
- # corresponding number of lines at the beginning of the file when
20
- # parsing.
21
- def initialize options={}
22
- @skip_first = options[:skip_first] || 0
23
- @klass = options[:of] || options[:klass]
24
- end
25
-
26
- # Parse the given file. If the option <tt>:lines</tt> is passed
27
- # in then only parse that many lines. If given a block then
28
- # yield the result of each line to the block; else just return
29
- # an array of results.
30
- #
31
- # If this parser has a +klass+ attribute then each parsed line
32
- # will first be turned into an instance of that class (the class
33
- # must accept a hash of values in its initializer).
34
- def parse! file, options={}, &block
35
- skip_lines!(file)
36
- if options[:lines]
37
- case
38
- when klass && block_given?
39
- options[:lines].times do
40
- yield klass.new(parse_line(file.readline))
41
- end
42
- when block_given?
43
- options[:lines].times do
44
- yield parse_line(file.readline)
45
- end
46
- when klass
47
- options[:lines].times do
48
- klass.new(parse_line(file.readline))
49
- end
50
- else
51
- options[:lines].times.map do
52
- parse_line(file.readline)
53
- end
54
- end
55
- else
56
- case
57
- when klass && block_given?
58
- file.each do |line|
59
- yield klass.new(parse_line(line))
60
- end
61
- when block_given?
62
- file.each do |line|
63
- yield parse_line(line)
64
- end
65
- when klass
66
- file.map do |line|
67
- klass.new(parse_line(line))
68
- end
69
- else
70
- file.map do |line|
71
- parse_line(line)
72
- end
73
- end
74
- end
75
- end
76
-
77
- def parse_line line
78
- raise IMW::NotImplementedError.new("Subclass the LineParser and redefine this method to create a true parser.")
79
- end
80
-
81
- protected
82
- def skip_lines! file
83
- skip_first.times { file.readline }
84
- end
85
- end
86
- end
87
- end
@@ -1,72 +0,0 @@
1
- require 'imw/parsers/line_parser'
2
-
3
- module IMW
4
- module Parsers
5
-
6
- # A RegexpParser is a line-oriented parser which uses a regular
7
- # expression to extract data from a line into either a hash or an
8
- # object obeying hash semantics.
9
- #
10
- # As an example, a flat file with one record per line in the
11
- # following format (this is a simplified version of common
12
- # webserver log formats)
13
- #
14
- # 151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
15
- # 81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
16
- # 81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
17
- # ...
18
- #
19
- # could be parsed as follows
20
- #
21
- # file = File.new '/path/to/file.log'
22
- # parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
23
- # :into_fields => [:ip, :timestamp, :verb, :url, :version]
24
- # parser.parse! file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
25
- #
26
- # Consecutive captures from the regular expression will be pushed
27
- # into a hash with keys given by the +into_fields+ property of
28
- # this parser.
29
- #
30
- # If the parser is instantiated with the <tt>:of</tt> keyword then
31
- # the parsed hash from each line is used to instantiate a new
32
- # object of the corresponding class:
33
- #
34
- # require 'ostruct'
35
- #
36
- # PageView = Class.new(OpenStruct)
37
- #
38
- # parser = IMW::Parsers::RegexpParser.new :by_regexp => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
39
- # :into_fields => [:ip, :timestamp, :verb, :url, :version],
40
- # :of => PageView
41
- #
42
- # parser.parse! file #=> [#<PageView ip="151.199.53.145", timestamp="14-Oct-2007:13:34:34-0500", verb="GET", url="/phpmyadmin/main.php", version="1.0">, ... ]
43
- #
44
- # The option <tt>:strictly</tt> can also be set to force the
45
- # parser to raise an error if it finds a line which doesn't match
46
- # its regexp.
47
- class RegexpParser < LineParser
48
- attr_accessor :regexp, :fields, :strict
49
-
50
- def initialize options={}
51
- @regexp = options[:regexp] || options[:by_regexp]
52
- @fields = options[:fields] || options[:into_fields]
53
- @strict = options[:strict] || options[:strictly]
54
- super options
55
- end
56
-
57
- def parse_line line
58
- match_data = regexp.match(line.chomp)
59
- {}.tap do |hsh|
60
- if match_data
61
- match_data.captures.each_with_index do |capture, index|
62
- hsh[fields[index]] = capture
63
- end
64
- else
65
- raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}") if strict
66
- end
67
- end
68
- end
69
- end
70
- end
71
- end
72
-
@@ -1,12 +0,0 @@
1
- module IMW
2
-
3
- # A Repository is a collection of datasets. It is used by the
4
- # command-line +imw+ tool.
5
- class Repository < Hash
6
- alias_method :datasets, :values
7
- alias_method :handles, :keys
8
- end
9
-
10
- end
11
-
12
-