imw 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -1,19 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
- autoload :S3, 'imw/resources/schemes/s3'
5
- autoload :HTTP, 'imw/resources/schemes/http'
6
- autoload :HTTPS, 'imw/resources/schemes/http'
7
- autoload :HDFS, 'imw/resources/schemes/hdfs'
8
-
9
- # Handlers which extend a resource with scheme specific methods.
10
- SCHEME_HANDLERS = [
11
- ["Schemes::S3", %r{^s3://} ],
12
- ["Schemes::HTTP", %r{^http://} ],
13
- ["Schemes::HTTPS", %r{^https://} ],
14
- ["Schemes::HDFS", %r{^hdfs://} ]
15
- ]
16
- end
17
- end
18
- end
19
-
@@ -1,242 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
-
5
- # Defines methods for reading and writing data to/from an
6
- # HDFS[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
7
- #
8
- # Learn more about Hadoop[http://hadoop.apache.org] and the
9
- # {Hadoop Distributed
10
- # Filesystem}[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
11
- module HDFS
12
-
13
- # Checks to see if this is a file or directory
14
- def self.extended obj
15
- obj.extend(obj.is_directory? ? HDFSDirectory : HDFSFile)
16
- end
17
-
18
- # Is this resource an HDFS resource?
19
- #
20
- # @return [true, false]
21
- def on_hdfs?
22
- true
23
- end
24
- alias_method :is_hdfs?, :on_hdfs?
25
-
26
- # Copy this resource to the +new_uri+.
27
- #
28
- # @param [String, IMW::Resource] new_uri
29
- # @return [IMW::Resource] the new resource
30
- def cp new_uri
31
- IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
32
- end
33
-
34
- # Move this resource to the +new_uri+.
35
- #
36
- # @param [String, IMW::Resource] new_uri
37
- # @return [IMW::Resource] the new resource
38
- def mv new_uri
39
- IMW::Transforms::Transferer.new(:mv, self, new_uri).transfer!
40
- end
41
-
42
- # Delete this resource from the HDFS.
43
- #
44
- # @option options [true,false] :skip_trash
45
- def rm options={}
46
- should_exist!("Cannot delete.")
47
- args = [:rm]
48
- args << '-skipTrash' if options[:skip] || options[:skip_trash] || options[:skipTrash]
49
- args << path
50
- HDFS.fs(*args)
51
- self
52
- end
53
- alias_method :rm!, :rm
54
-
55
-
56
- # Does this path exist on the HDFS?
57
- #
58
- # @return [true, false]
59
- def exist?
60
- return @exist unless @exist.nil?
61
- refresh!
62
- @exist
63
- end
64
- alias_method :exists?, :exist?
65
-
66
-
67
- # Return the size (in bytes) of this resource on the HDFS.
68
- #
69
- # This value is cached. Call +refresh!+ to refresh the cache
70
- # manually.
71
- #
72
- # @return [Fixnum]
73
- def size
74
- return @size unless @size.nil?
75
- refresh!
76
- should_exist!("Cannot report size")
77
- @size
78
- end
79
-
80
- # Return the number of directories contained at or below this
81
- # path on the HDFS.
82
- #
83
- # This value is cached. Call +refresh!+ to refresh the cache
84
- # manually.
85
- #
86
- # @return [Fixnum]
87
- def num_dirs
88
- return @num_dirs unless @num_dirs.nil?
89
- refresh!
90
- should_exist!("Cannot report number of directories.")
91
- @num_dirs
92
- end
93
-
94
- # Return the number of files contained at or below this path
95
- # on the HDFS.
96
- #
97
- # This value is cached. Call +refresh!+ to refresh the cache
98
- # manually.
99
- #
100
- # @return [Fixnum]
101
- def num_files
102
- return @num_files unless @num_files.nil?
103
- refresh!
104
- should_exist!("Cannot report number of files.")
105
- @num_files
106
- end
107
-
108
- # Is this resource an HDFS directory?
109
- #
110
- # @return [true, false]
111
- def is_directory?
112
- exist? && num_dirs > 0
113
- end
114
-
115
- # Refresh the cached file properties.
116
- #
117
- # @return [IMW::Resource] this resource
118
- def refresh!
119
- response = HDFS.fs(:count, path)
120
- if response.blank? || response =~ /^Can not find listing for/
121
- @exist = false
122
- @num_dirs, @num_files, @size, @hdfs_path = false, false, false, false
123
- else
124
- @exist = true
125
- parts = response.split
126
- @num_dirs, @num_files, @size = parts[0..2].map(&:to_i)
127
- @hdfs_path = parts.last
128
- end
129
- self
130
- end
131
-
132
- # Execute +command+ with +args+ on the Hadoop Distributed
133
- # Filesystem (HDFS).
134
- #
135
- # If passed a block, yield each line of the output from the
136
- # command, else just return the output.
137
- #
138
- # Try running `hadoop fs -help' for more information.
139
- #
140
- # @param [String, Symbol] command the command to run.
141
- # @param [String, Symbol] args the arguments to pass the command
142
- # @yield [String] each line of the command's output
143
- # @return [String] the command's output
144
- def self.fs command, *args
145
- command_string = "#{executable} fs -#{command} #{args.compact.map(&:to_str).join(' ')}"
146
- command_string += " 2>&1" if command == :count # FIXME or else it just spams the screen when we do HDFS#refresh!
147
- output = `#{command_string}`.chomp
148
- if block_given?
149
- output.split("\n").each do |line|
150
- yield line
151
- end
152
- else
153
- output
154
- end
155
- end
156
-
157
- protected
158
- # Returns the path to the Hadoop executable.
159
- #
160
- # @return [String]
161
- def self.executable
162
- @executable ||= begin
163
- string = `which hadoop`.chomp
164
- raise IMW::Error.new("Could not find hadoop command. Is Hadoop installed?") if string.blank?
165
- string
166
- end
167
- end
168
- end
169
-
170
- # Defines methods for reading data from HDFS files.
171
- module HDFSFile
172
-
173
- # Return the contents of this HDFS file as a string.
174
- #
175
- # Be VERY careful how you use this!
176
- #
177
- # @return [String]
178
- def read
179
- HDFS.fs(:cat, path)
180
- end
181
-
182
- # Iterate through each line of this HDFS resource.
183
- #
184
- # @yield [String] each line of the file
185
- def each &block
186
- HDFS.fs(:cat, path, &block)
187
- end
188
-
189
- # Return a handle on a StringIO object representing the
190
- # content in this HDFS file.
191
- #
192
- # Be VERY careful how you use this! It is a StringIO object
193
- # so the whole HDFS file is read into a string before
194
- # returning the handle.
195
- #
196
- # @return [StringIO]
197
- def io
198
- @io ||= StringIO.new(read)
199
- end
200
-
201
- # Map over the lines of this HDFS resource.
202
- #
203
- # @yield [String] each line of the file
204
- # @return [Array] the result of the block on each line
205
- def map &block
206
- returning([]) do |output|
207
- HDFS.fs(:cat, path) do |line|
208
- output << block.call(line)
209
- end
210
- end
211
- end
212
-
213
- end
214
-
215
- # Defines methods for listing contents of HDFS directories.
216
- module HDFSDirectory
217
-
218
- # Return the paths of all files and directories directly below
219
- # this directory on the HDFS.
220
- #
221
- # @return [Array<String>]
222
- def contents
223
- returning([]) do |paths|
224
- HDFS.fs(:ls, path) do |line|
225
- next if line =~ /^Found.*items$/
226
- paths << line.split.last
227
- end
228
- end
229
- end
230
-
231
- # Return the resources directly below this directory on the
232
- # HDFS.
233
- #
234
- # @return [Array<IMW::Resource>]
235
- def resources
236
- contents.map { |path| IMW.open(path) }
237
- end
238
-
239
- end
240
- end
241
- end
242
- end
@@ -1,161 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
-
5
- # Defines methods for accessing a resource over HTTP. Uses
6
- # RestClient to implement the basic HTTP verbs (GET, POST, PUT,
7
- # DELETE, HEAD).
8
- module HTTP
9
-
10
- # Is this resource being accessed via HTTP?
11
- #
12
- # @return [true, false]
13
- def via_http?
14
- true
15
- end
16
-
17
- # Copy this resource to the +new_uri+.
18
- #
19
- # @param [String, IMW::Resource] new_uri
20
- # @return [IMW::Resource] the new resource
21
- def cp new_uri
22
- IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
23
- end
24
-
25
-
26
- # Return the basename of the URI or <tt>_index</tt> if it's
27
- # blank, as in the case of <tt>http://www.google.com</tt>.
28
- #
29
- # @return [String]
30
- def effective_basename
31
- (basename.blank? || basename =~ %r{^/*$}) ? "_index" : basename
32
- end
33
-
34
- # Send a GET request to this resource's URI.
35
- #
36
- # If the response doesn't have HTTP code 2xx, a RestClient
37
- # error will be raised.
38
- #
39
- # If a block is given then the response will be passed to the
40
- # block, even in case of a non-2xx code.
41
- #
42
- # See the documentation for
43
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
44
- # for more information.
45
- #
46
- # @param [Hash] headers the headers to include in the request
47
- # @yield [RestClient::Response] the response from the server
48
- # @return [RestClient::Response] the response from the server
49
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
50
- def get headers={}, &block
51
- make_restclient_request do
52
- RestClient.get(uri.to_s, headers, &block)
53
- end
54
- end
55
-
56
- # Send a POST request to this resource's URI with data
57
- # +payload+.
58
- #
59
- # If the response doesn't have HTTP code 2xx, a RestClient
60
- # error will be raised.
61
- #
62
- # If a block is given then the response will be passed to the
63
- # block, even in case of a non-2xx code.
64
- #
65
- # See the documentation for
66
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
67
- # for more information.
68
- #
69
- # @param [Hash, String] payload the data to send
70
- # @param [Hash] headers the headers to include in the request
71
- # @yield [RestClient::Response] the response from the server
72
- # @return [RestClient::Response] the response from the server
73
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
74
- def post payload, headers={}, &block
75
- make_restclient_request do
76
- RestClient.post(uri.to_s, payload, headers, &block)
77
- end
78
- end
79
-
80
- # Send a PUT request to this resource's URI with data
81
- # +payload+.
82
- #
83
- # If the response doesn't have HTTP code 2xx, a RestClient
84
- # error will be raised.
85
- #
86
- # If a block is given then the response will be passed to the
87
- # block, even in case of a non-2xx code.
88
- #
89
- # See the documentation for
90
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
91
- # for more information.
92
- #
93
- # @param [Hash, String] payload the data to send
94
- # @param [Hash] headers the headers to include in the request
95
- # @yield [RestClient::Response] the response from the server
96
- # @return [RestClient::Response] the response from the server
97
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
98
- def put payload, headers={}, &block
99
- make_restclient_request do
100
- RestClient.put(uri.to_s, payload, headers, &block)
101
- end
102
- end
103
-
104
- # Send a DELETE request to this resource's URI.
105
- #
106
- # If the response doesn't have HTTP code 2xx, a RestClient
107
- # error will be raised.
108
- #
109
- # If a block is given then the response will be passed to the
110
- # block, even in case of a non-2xx code.
111
- #
112
- # See the documentation for
113
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
114
- # for more information.
115
- #
116
- # @param [Hash] headers the headers to include in the request
117
- # @yield [RestClient::Response] the response from the server
118
- # @return [RestClient::Response] the response from the server
119
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
120
- def delete headers={}, &block
121
- make_restclient_request do
122
- RestClient.delete(uri.to_s, headers, &block)
123
- end
124
- end
125
-
126
- # Send a HEAD request to this resource's URI.
127
- #
128
- # If the response doesn't have HTTP code 2xx, a RestClient
129
- # error will be raised.
130
- #
131
- # If a block is given then the response will be passed to the
132
- # block, even in case of a non-2xx code.
133
- #
134
- # See the documentation for
135
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
136
- # for more information.
137
- #
138
- # @param [Hash] headers the headers to include in the request
139
- # @yield [RestClient::Response] the response from the server
140
- # @return [RestClient::Response] the response from the server
141
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
142
- def head headers={}, &block
143
- make_restclient_request do
144
- RestClient.head(uri.to_s, headers, &block)
145
- end
146
- end
147
-
148
- protected
149
- def make_restclient_request &block # :nodoc
150
- require 'restclient'
151
- begin
152
- yield
153
- rescue RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed => e
154
- raise IMW::NetworkError.new("#{e.class} -- #{e.message}")
155
- end
156
- end
157
- end
158
- end
159
- end
160
- end
161
-
@@ -1,137 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
-
5
- # Defines methods for reading and writing data to {Amazon
6
- # S3}[http://aws.amazon.com/s3] buckets.
7
- #
8
- # IMW.open('s3://my_bucket/path/to/some/file.csv')
9
- #
10
- # Learn more about {Amazon Web Services}[http://aws.amazon.com].
11
- module S3
12
-
13
- # For an S3 resource, the bucket is just the hostname.
14
- #
15
- # @return [String]
16
- def bucket
17
- host
18
- end
19
-
20
- # Is this resource an S3 resource?
21
- #
22
- # @return [true, false]
23
- def on_s3?
24
- true
25
- end
26
- alias_method :is_s3?, :on_s3?
27
-
28
- # Copy this resource to the +new_uri+.
29
- #
30
- # @param [String, IMW::Resource] new_uri
31
- # @return [IMW::Resource] the new resource
32
- def cp new_uri
33
- IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
34
- end
35
-
36
- # The AWS::S3::S3Object corresponding to this resource.
37
- def s3_object
38
- self.class.make_connection!
39
- @s3_object ||= AWS::S3::S3Object.new(path, bucket)
40
- end
41
-
42
- # Does this resource exist on S3?
43
- #
44
- # @return [true, false]
45
- def exist?
46
- s3_object.exists?
47
- end
48
- alias_method :exists?, :exist?
49
-
50
- # Remove this resource from S3.
51
- #
52
- # @return [IMW::Resource] the deleted object
53
- def rm
54
- s3_object.delete
55
- end
56
- alias_method :rm!, :rm
57
-
58
- # Return the S3N URL for this S3 object
59
- #
60
- # resource = IMW.open('s3://my_bucket/path/to/some/obj')
61
- # resource.s3n_url
62
- # => 's3n://my_bucket/path/to/some/obj'
63
- #
64
- # @return [String]
65
- def s3n_url
66
- uri.to_s.gsub(/^s3:/, 's3n:')
67
- end
68
-
69
- # Return the contents of this S3 object.
70
- #
71
- # @return [String]
72
- def read
73
- s3_object.value
74
- end
75
-
76
- # Store +source+ into +destination+.
77
- #
78
- # @param [String, IMW::Resource, #io] source
79
- # @param [String, IMW::Resource, #path, #bucket] destination
80
- # @return [IMW::Resource] the new S3 object
81
- def self.put source, destination
82
- source = IMW.open(source)
83
- destintation = IMW.open(destination)
84
- raise IMW::ArgumentError.new("destination must be on S3 -- #{destination.uri} given") unless destination.on_s3?
85
- make_connection!
86
- AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
87
- destination
88
- end
89
-
90
- # Download +source+ from S3 into +destination+.
91
- #
92
- # @param [String, IMW::Resource, #path, #bucket] source
93
- # @param [String, IMW::Resource, #write] destination
94
- # @return [IMW::Resource] the new resource
95
- def self.get source, destination
96
- source = IMW.open(source)
97
- destination = IMW.open(destination)
98
- make_connection!
99
- AWS::S3::Object.stream(source.path, source.bucket) do |chunk|
100
- destination.write(chunk)
101
- end
102
- destination.close
103
- destination.reopen
104
- end
105
-
106
- # Copy S3 resource +source+ to +destination+.
107
- #
108
- # @param [String, IMW::Resource, #path, #bucket] source
109
- # @param [String, IMW::Resource, #path, #bucket] destination
110
- # @return [IMW::Resource] the new resource
111
- def self.copy source, destination
112
- source = IMW.open(source)
113
- destination = IMW.open(destination)
114
- raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
115
- make_connection!
116
- AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
117
- destination
118
- end
119
-
120
- protected
121
- # Make an S3 connection.
122
- #
123
- # Uses settings defined in IMW::AWS_CREDENTIALS.
124
- #
125
- # @return [Object] the cached connection returned by AWS::S3::Base.establish_connection!
126
- def self.make_connection!
127
- return @connection if @connection
128
- raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
129
- require 'aws/s3'
130
- @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
131
- end
132
-
133
- end
134
- end
135
- end
136
- end
137
-