imw 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +23 -0
- data/Gemfile.lock +47 -0
- data/LICENSE +20 -674
- data/README.rdoc +3 -4
- data/VERSION +1 -1
- data/lib/imw.rb +64 -35
- data/lib/imw/dataset.rb +12 -2
- data/lib/imw/formats.rb +4 -2
- data/lib/imw/formats/delimited.rb +96 -36
- data/lib/imw/formats/excel.rb +69 -101
- data/lib/imw/formats/json.rb +3 -5
- data/lib/imw/formats/pdf.rb +71 -0
- data/lib/imw/formats/yaml.rb +3 -5
- data/lib/imw/metadata.rb +66 -0
- data/lib/imw/metadata/contains_metadata.rb +44 -0
- data/lib/imw/metadata/dsl.rb +111 -0
- data/lib/imw/metadata/field.rb +65 -0
- data/lib/imw/metadata/schema.rb +227 -0
- data/lib/imw/metadata/schematized.rb +27 -0
- data/lib/imw/parsers.rb +1 -0
- data/lib/imw/parsers/flat.rb +44 -0
- data/lib/imw/resource.rb +36 -224
- data/lib/imw/schemes.rb +3 -1
- data/lib/imw/schemes/hdfs.rb +12 -1
- data/lib/imw/schemes/http.rb +1 -2
- data/lib/imw/schemes/local.rb +139 -16
- data/lib/imw/schemes/remote.rb +14 -9
- data/lib/imw/schemes/s3.rb +12 -0
- data/lib/imw/schemes/sql.rb +117 -0
- data/lib/imw/tools.rb +5 -3
- data/lib/imw/tools/downloader.rb +63 -0
- data/lib/imw/tools/summarizer.rb +21 -10
- data/lib/imw/utils.rb +10 -0
- data/lib/imw/utils/dynamically_extendable.rb +137 -0
- data/lib/imw/utils/error.rb +3 -0
- data/lib/imw/utils/extensions.rb +0 -4
- data/lib/imw/utils/extensions/array.rb +6 -7
- data/lib/imw/utils/extensions/hash.rb +3 -5
- data/lib/imw/utils/extensions/string.rb +3 -3
- data/lib/imw/utils/has_uri.rb +114 -0
- data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
- data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +1 -0
- data/spec/data/formats/none/sample +650 -0
- data/spec/data/formats/sgml/sample.xml +617 -0
- data/spec/data/formats/text/sample.txt +650 -0
- data/spec/data/formats/yaml/sample.yaml +410 -0
- data/spec/data/schema-tabular.yaml +11 -0
- data/spec/imw/formats/delimited_spec.rb +34 -2
- data/spec/imw/formats/excel_spec.rb +55 -0
- data/spec/imw/formats/json_spec.rb +3 -3
- data/spec/imw/formats/sgml_spec.rb +4 -4
- data/spec/imw/formats/yaml_spec.rb +3 -3
- data/spec/imw/metadata/field_spec.rb +26 -0
- data/spec/imw/metadata/schema_spec.rb +27 -0
- data/spec/imw/metadata_spec.rb +39 -0
- data/spec/imw/parsers/line_parser_spec.rb +1 -1
- data/spec/imw/resource_spec.rb +0 -100
- data/spec/imw/schemes/hdfs_spec.rb +19 -13
- data/spec/imw/schemes/local_spec.rb +59 -3
- data/spec/imw/schemes/s3_spec.rb +4 -0
- data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
- data/spec/imw/utils/has_uri_spec.rb +55 -0
- data/spec/spec_helper.rb +1 -2
- data/spec/support/random.rb +4 -4
- metadata +58 -17
- data/CHANGELOG +0 -0
- data/TODO +0 -18
- data/spec/data/sample.json +0 -782
- data/spec/data/sample.txt +0 -131
- data/spec/data/sample.xml +0 -653
- data/spec/data/sample.yaml +0 -651
- data/spec/spec.opts +0 -4
- data/spec/support/extensions.rb +0 -18
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
class Metadata
|
|
3
|
+
module Schematized
|
|
4
|
+
|
|
5
|
+
# The schema for this object.
|
|
6
|
+
#
|
|
7
|
+
# @return [IMW::Metadata::Schema, nil]
|
|
8
|
+
def schema
|
|
9
|
+
@schema
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
# Set a new schema for this object.
|
|
13
|
+
#
|
|
14
|
+
# Will call the object's +validate_schema!+ hook which should
|
|
15
|
+
# check the record and take the appropriate action if it's
|
|
16
|
+
# invalid.
|
|
17
|
+
#
|
|
18
|
+
# @param [Array, IMW::Metadata::Schema] new_schema
|
|
19
|
+
# @return [IMW::Metadata::Schema]
|
|
20
|
+
def schema= new_schema
|
|
21
|
+
@schema = IMW::Metadata::Schema.new(new_schema)
|
|
22
|
+
validate_schema! if respond_to?(:validate_schema!)
|
|
23
|
+
@schema
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
data/lib/imw/parsers.rb
CHANGED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Parsers
|
|
3
|
+
|
|
4
|
+
class Flat
|
|
5
|
+
|
|
6
|
+
attr_accessor :io
|
|
7
|
+
attr_accessor :state
|
|
8
|
+
attr_accessor :accumulated
|
|
9
|
+
attr_accessor :current
|
|
10
|
+
|
|
11
|
+
def initialize io
|
|
12
|
+
self.io = io
|
|
13
|
+
self.state = nil
|
|
14
|
+
self.accumulated = []
|
|
15
|
+
self.current = nil
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def read_next!
|
|
19
|
+
self.current = io.readline.chomp
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def parse!
|
|
23
|
+
while (! complete?)
|
|
24
|
+
read_next!
|
|
25
|
+
react_to_input!
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def accumulate!
|
|
30
|
+
self.accumulated << current
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def complete?
|
|
34
|
+
io.eof?
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def react_to_input!
|
|
38
|
+
raise IMW::NotImplementedError.new("Override the `react_to_input!' method of the #{self.class} class")
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
data/lib/imw/resource.rb
CHANGED
|
@@ -1,36 +1,7 @@
|
|
|
1
|
-
require '
|
|
1
|
+
require 'imw/utils/has_uri'
|
|
2
2
|
|
|
3
3
|
module IMW
|
|
4
4
|
|
|
5
|
-
# Define this constant in your configuration file to add your own
|
|
6
|
-
# URI handlers to IMW.
|
|
7
|
-
USER_DEFINED_HANDLERS = [] unless defined?(USER_DEFINED_HANDLERS)
|
|
8
|
-
|
|
9
|
-
# Register a new resource handler which dynamically extends a new
|
|
10
|
-
# IMW::Resource with the given module +mod+.
|
|
11
|
-
#
|
|
12
|
-
# +handler+ must be one of
|
|
13
|
-
#
|
|
14
|
-
# 1. Regexp
|
|
15
|
-
# 2. Proc
|
|
16
|
-
# 3. +true+
|
|
17
|
-
#
|
|
18
|
-
# In case (1), if the regular expression matches the resource's URI
|
|
19
|
-
# then the module (+mod+) will be used to extend the resource.
|
|
20
|
-
#
|
|
21
|
-
# In case (2), if the Proc returns a value other than +false+ or
|
|
22
|
-
# +nil+ then the module will be used.
|
|
23
|
-
#
|
|
24
|
-
# In case (3), the module will be used.
|
|
25
|
-
#
|
|
26
|
-
# @param [String, Module] mod
|
|
27
|
-
# @param [Regexp, Proc, true] handler
|
|
28
|
-
def self.register_handler mod, handler
|
|
29
|
-
raise IMW::ArgumentError.new("Module must be either a Module or String") unless mod.is_a?(Module) || mod.is_a?(String)
|
|
30
|
-
raise IMW::ArgumentError.new("Handler must be either a Regexp, Proc, or true") unless handler.is_a?(Regexp) || handler.is_a?(Proc) || handler == true
|
|
31
|
-
self::USER_DEFINED_HANDLERS << [mod, handler]
|
|
32
|
-
end
|
|
33
|
-
|
|
34
5
|
# A resource can be anything addressable via a URI. Examples
|
|
35
6
|
# include local files, remote files, webpages, &c.
|
|
36
7
|
#
|
|
@@ -54,7 +25,7 @@ module IMW
|
|
|
54
25
|
# The modules extending a particular IMW::Resource instance can be
|
|
55
26
|
# listed as follows
|
|
56
27
|
#
|
|
57
|
-
# my_archive.
|
|
28
|
+
# my_archive.modules #=> [IMW::Local::Base, IMW::Local::File, IMW::Local::Compressible, IMW::Archives::Tarbz2]
|
|
58
29
|
#
|
|
59
30
|
# By default, resources are opened for reading. Passing in the
|
|
60
31
|
# appropriate <tt>:mode</tt> option changes this:
|
|
@@ -74,9 +45,6 @@ module IMW
|
|
|
74
45
|
# accepts all the same arguments as IMW::Resource.new.
|
|
75
46
|
class Resource
|
|
76
47
|
|
|
77
|
-
# The URI object associated with this resource.
|
|
78
|
-
attr_reader :uri
|
|
79
|
-
|
|
80
48
|
# The mode in which to access this resource.
|
|
81
49
|
attr_accessor :mode
|
|
82
50
|
|
|
@@ -85,142 +53,69 @@ module IMW
|
|
|
85
53
|
|
|
86
54
|
# Create a new resource representing +uri+.
|
|
87
55
|
#
|
|
88
|
-
# IMW will automatically extend the resulting IMW::
|
|
89
|
-
# instance with modules appropriate
|
|
56
|
+
# IMW will automatically extend the resulting IMW::Resource
|
|
57
|
+
# instance with modules appropriate for the given URI:
|
|
90
58
|
#
|
|
91
59
|
# r = IMW::Resource.new("http://www.infochimps.com")
|
|
92
|
-
# r.
|
|
60
|
+
# r.modules
|
|
93
61
|
# => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
|
|
94
62
|
#
|
|
95
63
|
# You can prevent this altogether by passing in
|
|
96
64
|
# <tt>:no_modules</tt>:
|
|
97
65
|
#
|
|
98
|
-
# r = IMW::Resource.new("http://www.infochimps.com")
|
|
99
|
-
# r.
|
|
100
|
-
# => [
|
|
66
|
+
# r = IMW::Resource.new("http://www.infochimps.com", :no_modules => true)
|
|
67
|
+
# r.modules
|
|
68
|
+
# => []
|
|
101
69
|
#
|
|
102
70
|
# And you can exert more fine-grained control with the
|
|
103
71
|
# <tt>:use_modules</tt> and <tt>:skip_modules</tt> options, see
|
|
104
|
-
# IMW::Resource.
|
|
72
|
+
# IMW::Resource.extend_instance! for details.
|
|
105
73
|
#
|
|
106
74
|
# @param [String, Addressable::URI] uri
|
|
107
75
|
# @param [Hash] options
|
|
108
76
|
# @option options [true, false] no_modules
|
|
109
77
|
# @option options [String] mode the mode to open the resource in (will be ignored when inapplicable)
|
|
78
|
+
# @option options [IMW::Metadata::Record, Array] schema the schema of this resource
|
|
110
79
|
# @return [IMW::Resource]
|
|
111
80
|
def initialize uri, options={}
|
|
112
81
|
self.uri = uri
|
|
113
82
|
self.resource_options = options
|
|
114
83
|
self.mode = options[:mode] || 'r'
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
# Return the modules this resource has been extended by.
|
|
119
|
-
#
|
|
120
|
-
# @return [Array] the modules this resource has been extended by.
|
|
121
|
-
def resource_modules
|
|
122
|
-
@resource_modules ||= []
|
|
123
|
-
end
|
|
124
|
-
|
|
125
|
-
# Works just like Object#extend except it keeps track of the
|
|
126
|
-
# modules it has extended, see Resource#resource_modules.
|
|
127
|
-
def extend mod
|
|
128
|
-
resource_modules << mod
|
|
129
|
-
super mod
|
|
130
|
-
end
|
|
131
|
-
|
|
132
|
-
# Extend this resource with modules by passing it through a
|
|
133
|
-
# collection of handlers defined by IMW::Resource.handlers.
|
|
134
|
-
#
|
|
135
|
-
# Accepts the same options as Resource.extend_resource!.
|
|
136
|
-
def extend_appropriately! options={}
|
|
137
|
-
self.class.extend_resource!(self, options)
|
|
138
|
-
end
|
|
139
|
-
|
|
140
|
-
# Set the URI of this resource by parsing the given +uri+ (if
|
|
141
|
-
# necessary).
|
|
142
|
-
#
|
|
143
|
-
# @param [String, Addressable::URI] uri the uri to parse
|
|
144
|
-
def uri= uri
|
|
145
|
-
if uri.is_a?(Addressable::URI)
|
|
146
|
-
@uri = uri
|
|
147
|
-
else
|
|
148
|
-
begin
|
|
149
|
-
@uri = Addressable::URI.parse(uri.to_s)
|
|
150
|
-
rescue URI::InvalidURIError
|
|
151
|
-
@uri = Addressable::URI.parse(URI.encode(uri.to_s))
|
|
152
|
-
@encoded_uri = true
|
|
153
|
-
end
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
|
-
|
|
157
|
-
# The scheme of this resource. Will be +nil+ for local resources.
|
|
158
|
-
#
|
|
159
|
-
# @return [String]
|
|
160
|
-
def scheme
|
|
161
|
-
@scheme ||= uri.scheme
|
|
162
|
-
end
|
|
163
|
-
|
|
164
|
-
# The directory name of this resource's path.
|
|
165
|
-
#
|
|
166
|
-
# @return [String]
|
|
167
|
-
def dirname
|
|
168
|
-
@dirname ||= File.dirname(path)
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
# The basename of this resource's path.
|
|
172
|
-
#
|
|
173
|
-
# @return [String]
|
|
174
|
-
def basename
|
|
175
|
-
@basename ||= File.basename(path)
|
|
176
|
-
end
|
|
177
|
-
|
|
178
|
-
# Returns the extension (INCLUDING the '.') of this resource's
|
|
179
|
-
# path. Redefine this in an including class for which this is
|
|
180
|
-
# weird ('.tar.gz' I'm talking to you...)
|
|
181
|
-
#
|
|
182
|
-
# @return [String]
|
|
183
|
-
def extname
|
|
184
|
-
@extname ||= File.extname(path)
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
# Returns the extension (WITHOUT the '.') of this resource's path.
|
|
188
|
-
#
|
|
189
|
-
# @return [String]
|
|
190
|
-
def extension
|
|
191
|
-
@extension ||= extname[1..-1] || ''
|
|
192
|
-
end
|
|
193
|
-
|
|
194
|
-
# Returns the basename of the file with its extension removed
|
|
195
|
-
#
|
|
196
|
-
# IMW.open('/path/to/some_file.tar.gz').name # => some_file
|
|
197
|
-
#
|
|
198
|
-
# @return [String]
|
|
199
|
-
def name
|
|
200
|
-
@name ||= extname ? basename[0,basename.length - extname.length] : basename
|
|
84
|
+
self.schema = options[:schema] if options[:schema]
|
|
85
|
+
extend_appropriately!(options)
|
|
201
86
|
end
|
|
202
87
|
|
|
203
|
-
#
|
|
204
|
-
|
|
205
|
-
# @return [String]
|
|
206
|
-
def user
|
|
207
|
-
@user ||= uri.user
|
|
208
|
-
end
|
|
88
|
+
# Provides resources with a wrapped Addressable::URI object.
|
|
89
|
+
include IMW::Utils::HasURI
|
|
209
90
|
|
|
210
|
-
|
|
211
|
-
|
|
91
|
+
# Provides resources with a schema.
|
|
92
|
+
include IMW::Metadata::Schematized
|
|
93
|
+
|
|
94
|
+
# Gives IMW::Resource instances with the ability to dynamically
|
|
95
|
+
# extend themselves with modules chosen from a set of handlers
|
|
96
|
+
# stored by the IMW::Resource class.
|
|
97
|
+
include IMW::Utils::DynamicallyExtendable
|
|
98
|
+
[IMW::Schemes::HANDLERS, IMW::CompressedFiles::HANDLERS, IMW::Archives::HANDLERS, IMW::Formats::HANDLERS].each do |handlers|
|
|
99
|
+
register_handlers *handlers
|
|
212
100
|
end
|
|
213
|
-
|
|
101
|
+
|
|
214
102
|
# Raise an error unless this resource exists.
|
|
215
103
|
#
|
|
216
104
|
# @param [String] message an optional message to include
|
|
217
105
|
def should_exist!(message=nil)
|
|
218
|
-
raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{
|
|
219
|
-
raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{
|
|
220
|
-
raise IMW::PathError.new([message, "#{path} does not exist"].compact.join(', '))
|
|
106
|
+
raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
|
|
107
|
+
raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
|
|
108
|
+
raise IMW::PathError.new([message, "#{path} does not exist"].compact.join(', ')) unless exist?
|
|
221
109
|
self
|
|
222
110
|
end
|
|
223
111
|
|
|
112
|
+
# Close this resource.
|
|
113
|
+
#
|
|
114
|
+
# Modules should hook into super() as they need to redefine this
|
|
115
|
+
# method.
|
|
116
|
+
def close
|
|
117
|
+
end
|
|
118
|
+
|
|
224
119
|
# Open a copy of this resource.
|
|
225
120
|
#
|
|
226
121
|
# This is useful when wanting to reset file handles. Though -- be
|
|
@@ -228,7 +123,7 @@ module IMW
|
|
|
228
123
|
#
|
|
229
124
|
# @return [IMW::Resource] the new (old) resource
|
|
230
125
|
def reopen
|
|
231
|
-
IMW.open(
|
|
126
|
+
IMW.open(uri.to_s)
|
|
232
127
|
end
|
|
233
128
|
|
|
234
129
|
# If +method+ begins with the strings +is+, +on+, or +via+ and
|
|
@@ -257,92 +152,9 @@ module IMW
|
|
|
257
152
|
# querying for a boolean response so answer false
|
|
258
153
|
return false
|
|
259
154
|
else
|
|
260
|
-
raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{
|
|
155
|
+
raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{modules.join(', ')}"
|
|
261
156
|
end
|
|
262
157
|
end
|
|
263
158
|
|
|
264
|
-
# Iterate through IMW::Resource.handlers and extend the given
|
|
265
|
-
# +resource+ with modules whose handler conditions match the
|
|
266
|
-
# resource.
|
|
267
|
-
#
|
|
268
|
-
# Passing in <tt>:use_modules</tt> or <tt>:skip_modules</tt>
|
|
269
|
-
# allows overriding the default behavior of handlers.
|
|
270
|
-
#
|
|
271
|
-
# @param [IMW::Resource] resource the resource to extend
|
|
272
|
-
# @param [Hash] options
|
|
273
|
-
# @option options [Array<String,Module>] use_modules a list of modules used regardless of handlers
|
|
274
|
-
# @option options [Array<String,Module>] skip_modules a list of modules not to be used regardless of handlers
|
|
275
|
-
# @return [IMW::Resource] the extended resource
|
|
276
|
-
def self.extend_resource! resource, options={}
|
|
277
|
-
options.reverse_merge!(:use_modules => [], :skip_modules => [])
|
|
278
|
-
handlers.each do |mod_name, handler|
|
|
279
|
-
case handler
|
|
280
|
-
when Regexp then extend_resource_with_mod_or_string!(resource, mod_name, options[:skip_modules]) if handler =~ resource.uri.to_s
|
|
281
|
-
when Proc then extend_resource_with_mod_or_string!(resource, mod_name, options[:skip_modules]) if handler.call(resource)
|
|
282
|
-
when TrueClass then extend_resource_with_mod_or_string!(resource, mod_name, options[:skip_modules])
|
|
283
|
-
else
|
|
284
|
-
raise IMW::TypeError("A handler must be Regexp, Proc, or true")
|
|
285
|
-
end
|
|
286
|
-
end
|
|
287
|
-
options[:use_modules].each { |mod_name| extend_resource_with_mod_or_string!(resource, mod_name, options[:skip_modules]) }
|
|
288
|
-
resource
|
|
289
|
-
end
|
|
290
|
-
|
|
291
|
-
# A list of handlers to match against each new resource.
|
|
292
|
-
#
|
|
293
|
-
# When an IMW::Resource is instantiated it eventually calls
|
|
294
|
-
# IMW::Resource.extend_resource! which will iterate through the
|
|
295
|
-
# handlers in IMW::Resource.handlers, extending the resource with
|
|
296
|
-
# modules whose handler conditions are satisfied.
|
|
297
|
-
#
|
|
298
|
-
# A handler is just an Array with two elements. The first should be
|
|
299
|
-
# a module or a string identifying a module.
|
|
300
|
-
#
|
|
301
|
-
# If the second element is a Regexp, the corresponding module will
|
|
302
|
-
# be used if the regexp matches the resource's URI (as a string)
|
|
303
|
-
#
|
|
304
|
-
# If the second element is a Proc, it will be called with the
|
|
305
|
-
# resource as its only argument and if it returns true then the
|
|
306
|
-
# module will be used.
|
|
307
|
-
#
|
|
308
|
-
# You can define your own handlers by appending them to
|
|
309
|
-
# IMW::Resource::USER_DEFINED_HANDLERS in your <tt>.imwrc</tt>
|
|
310
|
-
# file.
|
|
311
|
-
#
|
|
312
|
-
# The order in which handlers appear is significant --
|
|
313
|
-
# IMW::CompressedFiles::HANDLERS must be _before_
|
|
314
|
-
# IMW::Archives::HANDLERS, for example, because of (say)
|
|
315
|
-
# <tt>.tar.bz2</tt> files.
|
|
316
|
-
#
|
|
317
|
-
# @return [Array]
|
|
318
|
-
def self.handlers
|
|
319
|
-
# order is important!
|
|
320
|
-
#
|
|
321
|
-
#
|
|
322
|
-
#
|
|
323
|
-
#CompressedFiles must come before
|
|
324
|
-
# Archives because of tar.bz2 type files
|
|
325
|
-
IMW::Schemes::HANDLERS + IMW::CompressedFiles::HANDLERS + IMW::Archives::HANDLERS + IMW::Formats::HANDLERS + USER_DEFINED_HANDLERS
|
|
326
|
-
end
|
|
327
|
-
|
|
328
|
-
protected
|
|
329
|
-
# Extend +resource+ with +mod_or_string+. Will work hard to try
|
|
330
|
-
# and interpret +mod_or_string+ as a module if it's a string.
|
|
331
|
-
#
|
|
332
|
-
# @param [IMW::Resource] resource the resource to extend
|
|
333
|
-
#
|
|
334
|
-
# @param [Module, String] mod_or_string the module or string
|
|
335
|
-
# representing a module to extend the resource with
|
|
336
|
-
#
|
|
337
|
-
# @param [Array<Module,String>] skip_modules modules to exclude
|
|
338
|
-
def self.extend_resource_with_mod_or_string! resource, mod_or_string, skip_modules
|
|
339
|
-
return if skip_modules.include?(mod_or_string)
|
|
340
|
-
if mod_or_string.is_a?(Module)
|
|
341
|
-
resource.extend(mod_or_string)
|
|
342
|
-
else
|
|
343
|
-
m = IMW.class_eval(mod_or_string)
|
|
344
|
-
resource.extend(m) unless skip_modules.include?(m)
|
|
345
|
-
end
|
|
346
|
-
end
|
|
347
159
|
end
|
|
348
160
|
end
|
data/lib/imw/schemes.rb
CHANGED
|
@@ -6,6 +6,7 @@ module IMW
|
|
|
6
6
|
autoload :HTTP, 'imw/schemes/http'
|
|
7
7
|
autoload :HTTPS, 'imw/schemes/http'
|
|
8
8
|
autoload :HDFS, 'imw/schemes/hdfs'
|
|
9
|
+
autoload :SQL, 'imw/schemes/sql'
|
|
9
10
|
|
|
10
11
|
HANDLERS = [
|
|
11
12
|
["Schemes::Local::Base", Proc.new { |resource| resource.scheme == 'file' || resource.scheme.blank? } ],
|
|
@@ -13,7 +14,8 @@ module IMW
|
|
|
13
14
|
["Schemes::S3", %r{^s3://}i ],
|
|
14
15
|
["Schemes::HTTP", %r{^http://}i ],
|
|
15
16
|
["Schemes::HTTPS", %r{^https://}i ],
|
|
16
|
-
["Schemes::HDFS", %r{^hdfs://}i ]
|
|
17
|
+
["Schemes::HDFS", %r{^hdfs://}i ],
|
|
18
|
+
["Schemes::SQL::Base", %r{^\w+sql://}i ]
|
|
17
19
|
]
|
|
18
20
|
end
|
|
19
21
|
end
|
data/lib/imw/schemes/hdfs.rb
CHANGED
|
@@ -234,7 +234,18 @@ module IMW
|
|
|
234
234
|
def resources
|
|
235
235
|
contents.map { |path| IMW.open(path) }
|
|
236
236
|
end
|
|
237
|
-
|
|
237
|
+
|
|
238
|
+
# Return the resource at the base path of this resource joined
|
|
239
|
+
# to +path+.
|
|
240
|
+
#
|
|
241
|
+
# IMW.open('hdfs:///path/to/dir').join('subdir')
|
|
242
|
+
# #=> IMW::Resource at 'hdfs:///path/to/dir/subdir'
|
|
243
|
+
#
|
|
244
|
+
# @param [Array<String>] paths
|
|
245
|
+
# @return [IMW::Resource]
|
|
246
|
+
def join *paths
|
|
247
|
+
IMW.open(File.join(stripped_uri.to_s, *paths))
|
|
248
|
+
end
|
|
238
249
|
end
|
|
239
250
|
end
|
|
240
251
|
end
|