imw 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Resources
|
|
3
|
+
module Schemes
|
|
4
|
+
|
|
5
|
+
# Defines methods for reading and writing data to/from an
|
|
6
|
+
# HDFS[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
|
|
7
|
+
#
|
|
8
|
+
# Learn more about Hadoop[http://hadoop.apache.org] and the
|
|
9
|
+
# {Hadoop Distributed
|
|
10
|
+
# Filesystem}[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
|
|
11
|
+
module HDFS
|
|
12
|
+
|
|
13
|
+
# Checks to see if this is a file or directory
|
|
14
|
+
def self.extended obj
|
|
15
|
+
obj.extend(obj.is_directory? ? HDFSDirectory : HDFSFile)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Is this resource an HDFS resource?
|
|
19
|
+
#
|
|
20
|
+
# @return [true, false]
|
|
21
|
+
def on_hdfs?
|
|
22
|
+
true
|
|
23
|
+
end
|
|
24
|
+
alias_method :is_hdfs?, :on_hdfs?
|
|
25
|
+
|
|
26
|
+
# Copy this resource to the +new_uri+.
|
|
27
|
+
#
|
|
28
|
+
# @param [String, IMW::Resource] new_uri
|
|
29
|
+
# @return [IMW::Resource] the new resource
|
|
30
|
+
def cp new_uri
|
|
31
|
+
IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Move this resource to the +new_uri+.
|
|
35
|
+
#
|
|
36
|
+
# @param [String, IMW::Resource] new_uri
|
|
37
|
+
# @return [IMW::Resource] the new resource
|
|
38
|
+
def mv new_uri
|
|
39
|
+
IMW::Transforms::Transferer.new(:mv, self, new_uri).transfer!
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Delete this resource from the HDFS.
#
# The cached properties filled in by +refresh!+ (existence, size,
# directory/file counts, HDFS path) are cleared after the delete so
# that the next query re-reads them from the HDFS instead of
# reporting the state from before the delete.
#
# @param [Hash] options
# @option options [true,false] :skip_trash pass <tt>-skipTrash</tt> to <tt>hadoop fs -rm</tt> (also accepted as +:skip+ or +:skipTrash+)
# @return [IMW::Resource] this resource
def rm options={}
  should_exist!("Cannot delete.")
  args = [:rm]
  args << '-skipTrash' if options[:skip] || options[:skip_trash] || options[:skipTrash]
  args << path
  HDFS.fs(*args)
  # Invalidate the cache: exist?, size, num_dirs and num_files all
  # call refresh! again when their cached value is nil.
  @exist = @size = @num_dirs = @num_files = @hdfs_path = nil
  self
end
|
|
53
|
+
alias_method :rm!, :rm
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Does this path exist on the HDFS?
|
|
57
|
+
#
|
|
58
|
+
# @return [true, false]
|
|
59
|
+
def exist?
|
|
60
|
+
return @exist unless @exist.nil?
|
|
61
|
+
refresh!
|
|
62
|
+
@exist
|
|
63
|
+
end
|
|
64
|
+
alias_method :exists?, :exist?
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Return the size (in bytes) of this resource on the HDFS.
|
|
68
|
+
#
|
|
69
|
+
# This value is cached.  Call +refresh!+ to refresh the cache
|
|
70
|
+
# manually.
|
|
71
|
+
#
|
|
72
|
+
# @return [Fixnum]
|
|
73
|
+
def size
|
|
74
|
+
return @size unless @size.nil?
|
|
75
|
+
refresh!
|
|
76
|
+
should_exist!("Cannot report size")
|
|
77
|
+
@size
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Return the number of directories contained at or below this
|
|
81
|
+
# path on the HDFS.
|
|
82
|
+
#
|
|
83
|
+
# This value is cached.  Call +refresh!+ to refresh the cache
|
|
84
|
+
# manually.
|
|
85
|
+
#
|
|
86
|
+
# @return [Fixnum]
|
|
87
|
+
def num_dirs
|
|
88
|
+
return @num_dirs unless @num_dirs.nil?
|
|
89
|
+
refresh!
|
|
90
|
+
should_exist!("Cannot report number of directories.")
|
|
91
|
+
@num_dirs
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# Return the number of files contained at or below this path
|
|
95
|
+
# on the HDFS.
|
|
96
|
+
#
|
|
97
|
+
# This value is cached.  Call +refresh!+ to refresh the cache
|
|
98
|
+
# manually.
|
|
99
|
+
#
|
|
100
|
+
# @return [Fixnum]
|
|
101
|
+
def num_files
|
|
102
|
+
return @num_files unless @num_files.nil?
|
|
103
|
+
refresh!
|
|
104
|
+
should_exist!("Cannot report number of files.")
|
|
105
|
+
@num_files
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Is this resource an HDFS directory?
|
|
109
|
+
#
|
|
110
|
+
# @return [true, false]
|
|
111
|
+
def is_directory?
|
|
112
|
+
exist? && num_dirs > 0
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# Refresh the cached file properties by running
# <tt>hadoop fs -count</tt> on this resource's path.
#
# When the path is not found, <tt>@exist</tt> is set to +false+ and
# the remaining properties are reset to +nil+.  They are +nil+ rather
# than +false+ because +size+, +num_dirs+ and +num_files+ return any
# cached non-nil value as is; a cached +false+ would be handed back
# to callers instead of the existence check being re-run.
#
# @return [IMW::Resource] this resource
def refresh!
  response = HDFS.fs(:count, path)
  if response.blank? || response =~ /^Can not find listing for/
    @exist = false
    @num_dirs = @num_files = @size = @hdfs_path = nil
  else
    @exist = true
    # Expected layout: <dir count> <file count> <size> <path>
    parts = response.split
    @num_dirs, @num_files, @size = parts[0..2].map(&:to_i)
    @hdfs_path = parts.last
  end
  self
end
|
|
131
|
+
|
|
132
|
+
# Execute +command+ with +args+ on the Hadoop Distributed
# Filesystem (HDFS).
#
# If passed a block, yield each line of the output from the
# command, else just return the output.
#
# Try running `hadoop fs -help' for more information.
#
# @param [String, Symbol] command the command to run.
# @param [String, Symbol] args the arguments to pass the command; +nil+ arguments are dropped
# @yield [String] each line of the command's output
# @return [String] the command's output (when no block is given)
def self.fs command, *args
  # Symbols are allowed by the signature but do not respond to
  # to_str, so fall back to to_s for anything that is not string-like.
  arg_strings = args.compact.map { |arg| arg.respond_to?(:to_str) ? arg.to_str : arg.to_s }
  command_string = "#{executable} fs -#{command} #{arg_strings.join(' ')}"
  # Compare as a string so 'count' and :count are treated alike.
  command_string += " 2>&1" if command.to_s == 'count' # FIXME or else it just spams the screen when we do HDFS#refresh!
  # NOTE(review): arguments are interpolated into a shell command
  # line unescaped; paths containing spaces or shell metacharacters
  # will not be passed through intact.
  output = `#{command_string}`.chomp
  if block_given?
    output.split("\n").each do |line|
      yield line
    end
  else
    output
  end
end
|
|
156
|
+
|
|
157
|
+
protected
|
|
158
|
+
# Returns the path to the Hadoop executable.
|
|
159
|
+
#
|
|
160
|
+
# @return [String]
|
|
161
|
+
def self.executable
|
|
162
|
+
@executable ||= begin
|
|
163
|
+
string = `which hadoop`.chomp
|
|
164
|
+
raise IMW::Error.new("Could not find hadoop command. Is Hadoop installed?") if string.blank?
|
|
165
|
+
string
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
# Defines methods for reading data from HDFS files.
|
|
171
|
+
module HDFSFile
|
|
172
|
+
|
|
173
|
+
# Return the contents of this HDFS file as a string.
|
|
174
|
+
#
|
|
175
|
+
# Be VERY careful how you use this!
|
|
176
|
+
#
|
|
177
|
+
# @return [String]
|
|
178
|
+
def read
|
|
179
|
+
HDFS.fs(:cat, path)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
# Iterate through each line of this HDFS resource.
|
|
183
|
+
#
|
|
184
|
+
# @yield [String] each line of the file
|
|
185
|
+
def each &block
|
|
186
|
+
HDFS.fs(:cat, path, &block)
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
# Return a handle on a StringIO object representing the
|
|
190
|
+
# content in this HDFS file.
|
|
191
|
+
#
|
|
192
|
+
# Be VERY careful how you use this! It is a StringIO object
|
|
193
|
+
# so the whole HDFS file is read into a string before
|
|
194
|
+
# returning the handle.
|
|
195
|
+
#
|
|
196
|
+
# @return [StringIO]
|
|
197
|
+
def io
|
|
198
|
+
@io ||= StringIO.new(read)
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
# Map over the lines of this HDFS resource.
|
|
202
|
+
#
|
|
203
|
+
# @yield [String] each line of the file
|
|
204
|
+
# @return [Array] the result of the block on each line
|
|
205
|
+
def map &block
|
|
206
|
+
returning([]) do |output|
|
|
207
|
+
HDFS.fs(:cat, path) do |line|
|
|
208
|
+
output << block.call(line)
|
|
209
|
+
end
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
# Defines methods for listing contents of HDFS directories.
|
|
216
|
+
module HDFSDirectory
|
|
217
|
+
|
|
218
|
+
# Return the paths of all files and directories directly below
|
|
219
|
+
# this directory on the HDFS.
|
|
220
|
+
#
|
|
221
|
+
# @return [Array<String>]
|
|
222
|
+
def contents
|
|
223
|
+
returning([]) do |paths|
|
|
224
|
+
HDFS.fs(:ls, path) do |line|
|
|
225
|
+
next if line =~ /^Found.*items$/
|
|
226
|
+
paths << line.split.last
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
# Return the resources directly below this directory on the
|
|
232
|
+
# HDFS.
|
|
233
|
+
#
|
|
234
|
+
# @return [Array<IMW::Resource>]
|
|
235
|
+
def resources
|
|
236
|
+
contents.map { |path| IMW.open(path) }
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
end
|
|
240
|
+
end
|
|
241
|
+
end
|
|
242
|
+
end
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Resources
|
|
3
|
+
module Schemes
|
|
4
|
+
|
|
5
|
+
# Defines methods for accessing a resource over HTTP. Uses
|
|
6
|
+
# RestClient to implement the basic HTTP verbs (GET, POST, PUT,
|
|
7
|
+
# DELETE, HEAD).
|
|
8
|
+
module HTTP
|
|
9
|
+
|
|
10
|
+
# Is this resource being accessed via HTTP?
|
|
11
|
+
#
|
|
12
|
+
# @return [true, false]
|
|
13
|
+
def via_http?
|
|
14
|
+
true
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Copy this resource to the +new_uri+.
|
|
18
|
+
#
|
|
19
|
+
# @param [String, IMW::Resource] new_uri
|
|
20
|
+
# @return [IMW::Resource] the new resource
|
|
21
|
+
def cp new_uri
|
|
22
|
+
IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Return the basename of the URI or <tt>_index</tt> if it's
|
|
27
|
+
# blank, as in the case of <tt>http://www.google.com</tt>.
|
|
28
|
+
#
|
|
29
|
+
# @return [String]
|
|
30
|
+
def effective_basename
|
|
31
|
+
(basename.blank? || basename =~ %r{^/*$}) ? "_index" : basename
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Send a GET request to this resource's URI.
|
|
35
|
+
#
|
|
36
|
+
# If the response doesn't have HTTP code 2xx, a RestClient
|
|
37
|
+
# error will be raised.
|
|
38
|
+
#
|
|
39
|
+
# If a block is given then the response will be passed to the
|
|
40
|
+
# block, even in case of a non-2xx code.
|
|
41
|
+
#
|
|
42
|
+
# See the documentation for
|
|
43
|
+
# RestClient[http://rdoc.info/projects/archiloque/rest-client]
|
|
44
|
+
# for more information.
|
|
45
|
+
#
|
|
46
|
+
# @param [Hash] headers the headers to include in the request
|
|
47
|
+
# @yield [RestClient::Response] the response from the server
|
|
48
|
+
# @return [RestClient::Response] the response from the server
|
|
49
|
+
# @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
|
|
50
|
+
def get headers={}, &block
|
|
51
|
+
make_restclient_request do
|
|
52
|
+
RestClient.get(uri.to_s, headers, &block)
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Send a POST request to this resource's URI with data
|
|
57
|
+
# +payload+.
|
|
58
|
+
#
|
|
59
|
+
# If the response doesn't have HTTP code 2xx, a RestClient
|
|
60
|
+
# error will be raised.
|
|
61
|
+
#
|
|
62
|
+
# If a block is given then the response will be passed to the
|
|
63
|
+
# block, even in case of a non-2xx code.
|
|
64
|
+
#
|
|
65
|
+
# See the documentation for
|
|
66
|
+
# RestClient[http://rdoc.info/projects/archiloque/rest-client]
|
|
67
|
+
# for more information.
|
|
68
|
+
#
|
|
69
|
+
# @param [Hash, String] payload the data to send
|
|
70
|
+
# @param [Hash] headers the headers to include in the request
|
|
71
|
+
# @yield [RestClient::Response] the response from the server
|
|
72
|
+
# @return [RestClient::Response] the response from the server
|
|
73
|
+
# @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
|
|
74
|
+
def post payload, headers={}, &block
|
|
75
|
+
make_restclient_request do
|
|
76
|
+
RestClient.post(uri.to_s, payload, headers, &block)
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Send a PUT request to this resource's URI with data
|
|
81
|
+
# +payload+.
|
|
82
|
+
#
|
|
83
|
+
# If the response doesn't have HTTP code 2xx, a RestClient
|
|
84
|
+
# error will be raised.
|
|
85
|
+
#
|
|
86
|
+
# If a block is given then the response will be passed to the
|
|
87
|
+
# block, even in case of a non-2xx code.
|
|
88
|
+
#
|
|
89
|
+
# See the documentation for
|
|
90
|
+
# RestClient[http://rdoc.info/projects/archiloque/rest-client]
|
|
91
|
+
# for more information.
|
|
92
|
+
#
|
|
93
|
+
# @param [Hash, String] payload the data to send
|
|
94
|
+
# @param [Hash] headers the headers to include in the request
|
|
95
|
+
# @yield [RestClient::Response] the response from the server
|
|
96
|
+
# @return [RestClient::Response] the response from the server
|
|
97
|
+
# @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
|
|
98
|
+
def put payload, headers={}, &block
|
|
99
|
+
make_restclient_request do
|
|
100
|
+
RestClient.put(uri.to_s, payload, headers, &block)
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# Send a DELETE request to this resource's URI.
|
|
105
|
+
#
|
|
106
|
+
# If the response doesn't have HTTP code 2xx, a RestClient
|
|
107
|
+
# error will be raised.
|
|
108
|
+
#
|
|
109
|
+
# If a block is given then the response will be passed to the
|
|
110
|
+
# block, even in case of a non-2xx code.
|
|
111
|
+
#
|
|
112
|
+
# See the documentation for
|
|
113
|
+
# RestClient[http://rdoc.info/projects/archiloque/rest-client]
|
|
114
|
+
# for more information.
|
|
115
|
+
#
|
|
116
|
+
# @param [Hash] headers the headers to include in the request
|
|
117
|
+
# @yield [RestClient::Response] the response from the server
|
|
118
|
+
# @return [RestClient::Response] the response from the server
|
|
119
|
+
# @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
|
|
120
|
+
def delete headers={}, &block
|
|
121
|
+
make_restclient_request do
|
|
122
|
+
RestClient.delete(uri.to_s, headers, &block)
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# Send a HEAD request to this resource's URI.
|
|
127
|
+
#
|
|
128
|
+
# If the response doesn't have HTTP code 2xx, a RestClient
|
|
129
|
+
# error will be raised.
|
|
130
|
+
#
|
|
131
|
+
# If a block is given then the response will be passed to the
|
|
132
|
+
# block, even in case of a non-2xx code.
|
|
133
|
+
#
|
|
134
|
+
# See the documentation for
|
|
135
|
+
# RestClient[http://rdoc.info/projects/archiloque/rest-client]
|
|
136
|
+
# for more information.
|
|
137
|
+
#
|
|
138
|
+
# @param [Hash] headers the headers to include in the request
|
|
139
|
+
# @yield [RestClient::Response] the response from the server
|
|
140
|
+
# @return [RestClient::Response] the response from the server
|
|
141
|
+
# @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
|
|
142
|
+
def head headers={}, &block
|
|
143
|
+
make_restclient_request do
|
|
144
|
+
RestClient.head(uri.to_s, headers, &block)
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
protected
|
|
149
|
+
def make_restclient_request &block # :nodoc
|
|
150
|
+
require 'restclient'
|
|
151
|
+
begin
|
|
152
|
+
yield
|
|
153
|
+
rescue RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed => e
|
|
154
|
+
raise IMW::NetworkError.new("#{e.class} -- #{e.message}")
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Resources
|
|
3
|
+
module Schemes
|
|
4
|
+
|
|
5
|
+
# Defines methods for reading and writing data to {Amazon
|
|
6
|
+
# S3}[http://aws.amazon.com/s3] buckets.
|
|
7
|
+
#
|
|
8
|
+
# IMW.open('s3://my_bucket/path/to/some/file.csv')
|
|
9
|
+
#
|
|
10
|
+
# Learn more about {Amazon Web Services}[http://aws.amazon.com].
|
|
11
|
+
module S3
|
|
12
|
+
|
|
13
|
+
# For an S3 resource, the bucket is just the hostname.
|
|
14
|
+
#
|
|
15
|
+
# @return [String]
|
|
16
|
+
def bucket
|
|
17
|
+
host
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Is this resource an S3 resource?
|
|
21
|
+
#
|
|
22
|
+
# @return [true, false]
|
|
23
|
+
def on_s3?
|
|
24
|
+
true
|
|
25
|
+
end
|
|
26
|
+
alias_method :is_s3?, :on_s3?
|
|
27
|
+
|
|
28
|
+
# Copy this resource to the +new_uri+.
|
|
29
|
+
#
|
|
30
|
+
# @param [String, IMW::Resource] new_uri
|
|
31
|
+
# @return [IMW::Resource] the new resource
|
|
32
|
+
def cp new_uri
|
|
33
|
+
IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# The AWS::S3::S3Object corresponding to this resource.
|
|
37
|
+
def s3_object
|
|
38
|
+
self.class.make_connection!
|
|
39
|
+
@s3_object ||= AWS::S3::S3Object.new(path, bucket)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Does this resource exist on S3?
|
|
43
|
+
#
|
|
44
|
+
# @return [true, false]
|
|
45
|
+
def exist?
|
|
46
|
+
s3_object.exists?
|
|
47
|
+
end
|
|
48
|
+
alias_method :exists?, :exist?
|
|
49
|
+
|
|
50
|
+
# Remove this resource from S3.
|
|
51
|
+
#
|
|
52
|
+
# @return [IMW::Resource] the deleted object
|
|
53
|
+
def rm
|
|
54
|
+
s3_object.delete
|
|
55
|
+
end
|
|
56
|
+
alias_method :rm!, :rm
|
|
57
|
+
|
|
58
|
+
# Return the S3N URL for this S3 object
|
|
59
|
+
#
|
|
60
|
+
# resource = IMW.open('s3://my_bucket/path/to/some/obj')
|
|
61
|
+
# resource.s3n_url
|
|
62
|
+
# => 's3n://my_bucket/path/to/some/obj'
|
|
63
|
+
#
|
|
64
|
+
# @return [String]
|
|
65
|
+
def s3n_url
|
|
66
|
+
uri.to_s.gsub(/^s3:/, 's3n:')
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# Return the contents of this S3 object.
|
|
70
|
+
#
|
|
71
|
+
# @return [String]
|
|
72
|
+
def read
|
|
73
|
+
s3_object.value
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Store +source+ into +destination+.
#
# Both arguments are passed through IMW.open, so Strings and
# already-opened resources are accepted alike.
#
# @param [String, IMW::Resource, #io] source
# @param [String, IMW::Resource, #path, #bucket] destination
# @return [IMW::Resource] the new S3 object
# @raise [IMW::ArgumentError] if +destination+ is not an S3 resource
def self.put source, destination
  source      = IMW.open(source)
  destination = IMW.open(destination)
  # A resource that was never extended with the S3 scheme may not
  # define on_s3? at all, so check for the method before calling it.
  unless destination.respond_to?(:on_s3?) && destination.on_s3?
    raise IMW::ArgumentError.new("destination must be on S3 -- #{destination.uri} given")
  end
  make_connection!
  AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
  destination
end
|
|
89
|
+
|
|
90
|
+
# Download +source+ from S3 into +destination+.
#
# The object is streamed chunk by chunk into +destination+, which is
# then closed and reopened.
#
# @param [String, IMW::Resource, #path, #bucket] source
# @param [String, IMW::Resource, #write] destination
# @return [IMW::Resource] the result of <tt>destination.reopen</tt> (presumably the new resource -- confirm against the resource class)
def self.get source, destination
  source      = IMW.open(source)
  destination = IMW.open(destination)
  make_connection!
  # S3Object is the class used for objects everywhere else in this
  # module (see s3_object and S3.put).
  AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
    destination.write(chunk)
  end
  destination.close
  destination.reopen
end
|
|
105
|
+
|
|
106
|
+
# Copy S3 resource +source+ to +destination+.
#
# Both resources must live in the same, non-blank bucket.
#
# @param [String, IMW::Resource, #path, #bucket] source
# @param [String, IMW::Resource, #path, #bucket] destination
# @return [IMW::Resource] the new resource
# @raise [IMW::PathError] if either bucket is blank or the buckets differ
def self.copy source, destination
  source      = IMW.open(source)
  destination = IMW.open(destination)
  same_bucket = source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
  raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless same_bucket
  make_connection!
  # S3Object is the class used for objects everywhere else in this
  # module (see s3_object and S3.put).
  AWS::S3::S3Object.copy(source.path, destination.path, destination.bucket)
  destination
end
|
|
119
|
+
|
|
120
|
+
protected
|
|
121
|
+
# Make an S3 connection.
|
|
122
|
+
#
|
|
123
|
+
# Uses settings defined in IMW::AWS_CREDENTIALS.
|
|
124
|
+
#
|
|
125
|
+
# @return [Object] the connection returned by AWS::S3::Base.establish_connection!
|
|
126
|
+
def self.make_connection!
|
|
127
|
+
return @connection if @connection
|
|
128
|
+
raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
|
|
129
|
+
require 'aws/s3'
|
|
130
|
+
@connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module IMW
  module Resources
    # Namespace for the modules which add scheme-specific behavior
    # (S3, HTTP, HDFS) to a resource, together with the handlers that
    # decide which module applies to a given URI.
    module Schemes
      autoload :S3,    'imw/resources/schemes/s3'
      autoload :HTTP,  'imw/resources/schemes/http'
      # NOTE: 'imw/resources/schemes/http' only defines the HTTP module,
      # so this autoload cannot resolve an HTTPS constant.  It is kept
      # for compatibility; https URIs are routed to HTTP below.
      autoload :HTTPS, 'imw/resources/schemes/http'
      autoload :HDFS,  'imw/resources/schemes/hdfs'

      # Handlers which extend a resource with scheme specific methods.
      #
      # Each handler is a pair of a module name and a Regexp which is
      # matched against the resource's URI.  Both http:// and https://
      # URIs are handled by Schemes::HTTP.
      SCHEME_HANDLERS = [
        ["Schemes::S3",   %r{^s3://}    ],
        ["Schemes::HTTP", %r{^http://}  ],
        ["Schemes::HTTP", %r{^https://} ],
        ["Schemes::HDFS", %r{^hdfs://}  ]
      ]
    end
  end
end
|
|
19
|
+
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
require 'imw/resources/formats'
|
|
2
|
+
require 'imw/resources/schemes'
|
|
3
|
+
require 'imw/resources/archives_and_compressed'
|
|
4
|
+
|
|
5
|
+
module IMW
|
|
6
|
+
|
|
7
|
+
# IMW::Resources is a namespace in which all the modules which
|
|
8
|
+
# define different kinds of behavior for IMW::Resource objects are
|
|
9
|
+
# defined.
|
|
10
|
+
#
|
|
11
|
+
# When an IMW::Resource is instantiated it eventually calls
|
|
12
|
+
# IMW::Resources#extend_resource! which will iterate through the
|
|
13
|
+
# handlers in IMW::Resources#handlers, extending the resource with
|
|
14
|
+
# modules whose handler conditions are satisfied.
|
|
15
|
+
#
|
|
16
|
+
# A handler is just an Array with two elements. The first should be
|
|
17
|
+
# a module or a string identifying a module.
|
|
18
|
+
#
|
|
19
|
+
# If the second element is a Regexp, the corresponding module will
|
|
20
|
+
# be used if the regexp matches the resource's URI (as a string)
|
|
21
|
+
#
|
|
22
|
+
# If the second element is a Proc, it will be called with the
|
|
23
|
+
# resource as its only argument and if it returns true then the
|
|
24
|
+
# module will be used.
|
|
25
|
+
#
|
|
26
|
+
# You can define your own handlers by appending them to
|
|
27
|
+
# IMW::Resources::USER_DEFINED_HANDLERS in your <tt>.imwrc</tt>
|
|
28
|
+
# file.
|
|
29
|
+
module Resources
|
|
30
|
+
|
|
31
|
+
autoload :LocalObj, 'imw/resources/local'
|
|
32
|
+
autoload :RemoteObj, 'imw/resources/remote'
|
|
33
|
+
autoload :StringObj, 'imw/resources/string'
|
|
34
|
+
autoload :Transferable, 'imw/resources/transferable'
|
|
35
|
+
|
|
36
|
+
# Iterate through IMW::Resources#handlers and extend the given
# +resource+ with modules whose handler conditions match the
# resource.
#
# A Regexp handler is matched against the resource's URI (as a
# string), a Proc handler is called with the resource, and +true+
# always matches.
#
# @param [IMW::Resource] resource the resource to extend
# @return [IMW::Resource] the extended resource
# @raise [IMW::TypeError] if a handler is not a Regexp, a Proc, or +true+
def self.extend_resource! resource
  handlers.each do |mod_name, handler|
    case handler
    when Regexp    then extend_resource_with_mod_or_string!(resource, mod_name) if handler =~ resource.uri.to_s
    when Proc      then extend_resource_with_mod_or_string!(resource, mod_name) if handler.call(resource)
    when TrueClass then extend_resource_with_mod_or_string!(resource, mod_name)
    else
      # Build the exception with .new; IMW::TypeError(...) would be a
      # method call on IMW.  Assumes IMW::TypeError is defined with
      # IMW's other error classes -- confirm in imw/utils/error.
      raise IMW::TypeError.new("A handler must be Regexp, Proc, or true")
    end
  end
  resource
end
|
|
54
|
+
|
|
55
|
+
# Basic handlers to determine whether the resource is local,
|
|
56
|
+
# remote, or a string.
|
|
57
|
+
BASIC_HANDLERS = [
|
|
58
|
+
["LocalObj", Proc.new { |resource| resource.scheme == 'file' || resource.scheme.blank? } ],
|
|
59
|
+
["RemoteObj", Proc.new { |resource| resource.scheme != 'file' && resource.scheme.present? } ],
|
|
60
|
+
["StringObj", Proc.new { |resource| resource.is_stringio? } ]
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
# Define this constant in your configuration file to add your own
|
|
64
|
+
# handlers.
|
|
65
|
+
USER_DEFINED_HANDLERS = [] unless defined?(USER_DEFINED_HANDLERS)
|
|
66
|
+
|
|
67
|
+
# include handlers from other modules
|
|
68
|
+
include IMW::Resources::Formats
|
|
69
|
+
include IMW::Resources::Schemes
|
|
70
|
+
|
|
71
|
+
# A list of handlers to try. Define your own handlers in
|
|
72
|
+
# IMW::Resources::USER_DEFINED_HANDLERS.
|
|
73
|
+
#
|
|
74
|
+
# @return [Array]
|
|
75
|
+
def self.handlers
|
|
76
|
+
# order here is important
|
|
77
|
+
BASIC_HANDLERS + SCHEME_HANDLERS + ARCHIVE_AND_COMPRESSED_HANDLERS + FORMAT_HANDLERS + USER_DEFINED_HANDLERS
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
protected
|
|
81
|
+
|
|
82
|
+
# Extend +resource+ with +mod_or_string+. Will work hard to try
|
|
83
|
+
# and interpret +mod_or_string+ as a module if it's a string.
|
|
84
|
+
#
|
|
85
|
+
# @param [IMW::Resource] resource the resource to extend
|
|
86
|
+
#
|
|
87
|
+
# @param [Module, String] mod_or_string the module or string
|
|
88
|
+
# representing a module to extend the resource with
|
|
89
|
+
def self.extend_resource_with_mod_or_string! resource, mod_or_string
|
|
90
|
+
if mod_or_string.is_a?(Module)
|
|
91
|
+
resource.extend(mod_or_string)
|
|
92
|
+
else
|
|
93
|
+
# Given a string "Mod::SubMod::SubSubMod" first split it into
|
|
94
|
+
# its parts ["Mod", "SubMod", "SubSubMod"] and then begin
|
|
95
|
+
# class_eval'ing them in order so that each is class_eval'd in
|
|
96
|
+
# the scope of the one before it.
|
|
97
|
+
#
|
|
98
|
+
# There is almost certainly a better way to do this.
|
|
99
|
+
mod_names = mod_or_string.to_s.split('::')
|
|
100
|
+
mods = []
|
|
101
|
+
mod_names.each_with_index do |name, index|
|
|
102
|
+
if index == 0
|
|
103
|
+
mods << class_eval(name)
|
|
104
|
+
else
|
|
105
|
+
begin
|
|
106
|
+
mods << class_eval(name)
|
|
107
|
+
rescue NameError
|
|
108
|
+
mods << mods[index - 1].class_eval(name)
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
resource.extend(mods.last)
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
|