imw 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -1,19 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
- autoload :S3, 'imw/resources/schemes/s3'
5
- autoload :HTTP, 'imw/resources/schemes/http'
6
- autoload :HTTPS, 'imw/resources/schemes/http'
7
- autoload :HDFS, 'imw/resources/schemes/hdfs'
8
-
9
- # Handlers which extend a resource with scheme specific methods.
10
- SCHEME_HANDLERS = [
11
- ["Schemes::S3", %r{^s3://} ],
12
- ["Schemes::HTTP", %r{^http://} ],
13
- ["Schemes::HTTPS", %r{^https://} ],
14
- ["Schemes::HDFS", %r{^hdfs://} ]
15
- ]
16
- end
17
- end
18
- end
19
-
@@ -1,242 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
-
5
- # Defines methods for reading and writing data to/from an
6
- # HDFS[http://hadoop.apache.org/common/docs/current/hdfs_design.html]]
7
- #
8
- # Learn more about Hadoop[http://hadoop.apache.org] and the
9
- # {Hadoop Distributed
10
- # Filesystem}[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
11
- module HDFS
12
-
13
- # Checks to see if this is a file or directory
14
- def self.extended obj
15
- obj.extend(obj.is_directory? ? HDFSDirectory : HDFSFile)
16
- end
17
-
18
- # Is this resource an HDFS resource?
19
- #
20
- # @return [true, false]
21
- def on_hdfs?
22
- true
23
- end
24
- alias_method :is_hdfs?, :on_hdfs?
25
-
26
- # Copy this resource to the +new_uri+.
27
- #
28
- # @param [String, IMW::Resource] new_uri
29
- # @return [IMW::Resource] the new resource
30
- def cp new_uri
31
- IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
32
- end
33
-
34
- # Move this resource to the +new_uri+.
35
- #
36
- # @param [String, IMW::Resource] new_uri
37
- # @return [IMW::Resource] the new resource
38
- def mv new_uri
39
- IMW::Transforms::Transferer.new(:mv, self, new_uri).transfer!
40
- end
41
-
42
- # Delete this resource from the HDFS.
43
- #
44
- # @option options [true,false] :skip_trash
45
- def rm options={}
46
- should_exist!("Cannot delete.")
47
- args = [:rm]
48
- args << '-skipTrash' if options[:skip] || options[:skip_trash] || options[:skipTrash]
49
- args << path
50
- HDFS.fs(*args)
51
- self
52
- end
53
- alias_method :rm!, :rm
54
-
55
-
56
- # Does this path exist on the HDFS?
57
- #
58
- # @return [true, false]
59
- def exist?
60
- return @exist unless @exist.nil?
61
- refresh!
62
- @exist
63
- end
64
- alias_method :exists?, :exist?
65
-
66
-
67
- # Return the size (in bytes) of this resource on the HDFS.
68
- #
69
- # This value is cached. Call +refresh+ to refresh the cache
70
- # manually.
71
- #
72
- # @return [Fixnum]
73
- def size
74
- return @size unless @size.nil?
75
- refresh!
76
- should_exist!("Cannot report size")
77
- @size
78
- end
79
-
80
- # Return the number of directories contained at or below this
81
- # path on the HDFS.
82
- #
83
- # This value is cached. Call +refresh+ to refresh the cache
84
- # manually.
85
- #
86
- # @return [Fixnum]
87
- def num_dirs
88
- return @num_dirs unless @num_dirs.nil?
89
- refresh!
90
- should_exist!("Cannot report number of directories.")
91
- @num_dirs
92
- end
93
-
94
- # Return the number of files contained at or below this path
95
- # on the HDFS.
96
- #
97
- # This value is cached. Call +refresh+ to refresh the cache
98
- # manually.
99
- #
100
- # @return [Fixnum]
101
- def num_files
102
- return @num_files unless @num_files.nil?
103
- refresh!
104
- should_exist!("Cannot report number of files.")
105
- @num_files
106
- end
107
-
108
- # Is this resource an HDFS directory?
109
- #
110
- # @return [true, false]
111
- def is_directory?
112
- exist? && num_dirs > 0
113
- end
114
-
115
- # Refresh the cached file properties.
116
- #
117
- # @return [IMW::Resource] this resource
118
- def refresh!
119
- response = HDFS.fs(:count, path)
120
- if response.blank? || response =~ /^Can not find listing for/
121
- @exist = false
122
- @num_dirs, @num_files, @size, @hdfs_path = false, false, false, false
123
- else
124
- @exist = true
125
- parts = response.split
126
- @num_dirs, @num_files, @size = parts[0..2].map(&:to_i)
127
- @hdfs_path = parts.last
128
- end
129
- self
130
- end
131
-
132
- # Execute +command+ with +args+ on the Hadoop Distributed
133
- # Filesystem (HDFS).
134
- #
135
- # If passed a block, yield each line of the output from the
136
- # command, else just return the output.
137
- #
138
- # Try running `hadoop fs -help' for more information.
139
- #
140
- # @param [String, Symbol] command the command to run.
141
- # @param [String, Symbol] args the arguments to pass the command
142
- # @yield [String] each line of the command's output
143
- # @return [String] the command's output
144
- def self.fs command, *args
145
- command_string = "#{executable} fs -#{command} #{args.compact.map(&:to_str).join(' ')}"
146
- command_string += " 2>&1" if command == :count # FIXME or else it just spams the screen when we do HDFS#refresh!
147
- output = `#{command_string}`.chomp
148
- if block_given?
149
- output.split("\n").each do |line|
150
- yield line
151
- end
152
- else
153
- output
154
- end
155
- end
156
-
157
- protected
158
- # Returns the path to the Hadoop executable.
159
- #
160
- # @return [String]
161
- def self.executable
162
- @executable ||= begin
163
- string = `which hadoop`.chomp
164
- raise IMW::Error.new("Could not find hadoop command. Is Hadoop installed?") if string.blank?
165
- string
166
- end
167
- end
168
- end
169
-
170
- # Defines methods for reading data from HDFS files.
171
- module HDFSFile
172
-
173
- # Return the contents of this HDFS file as a string.
174
- #
175
- # Be VERY careful how you use this!
176
- #
177
- # @return [String]
178
- def read
179
- HDFS.fs(:cat, path)
180
- end
181
-
182
- # Iterate through each line of this HDFS resource.
183
- #
184
- # @yield [String] each line of the file
185
- def each &block
186
- HDFS.fs(:cat, path, &block)
187
- end
188
-
189
- # Return a handle on a StringIO object representing the
190
- # content in this HDFS file.
191
- #
192
- # Be VERY careful how you use this! It is a StringIO object
193
- # so the whole HDFS file is read into a string before
194
- # returning the handle.
195
- #
196
- # @return [StringIO]
197
- def io
198
- @io ||= StringIO.new(read)
199
- end
200
-
201
- # Map over the lines of this HDFS resource.
202
- #
203
- # @yield [String] each line of the file
204
- # @return [Array] the result of the block on each line
205
- def map &block
206
- returning([]) do |output|
207
- HDFS.fs(:cat, path) do |line|
208
- output << block.call(line)
209
- end
210
- end
211
- end
212
-
213
- end
214
-
215
- # Defines methods for listing contents of HDFS directories.
216
- module HDFSDirectory
217
-
218
- # Return the paths of all files and directories directly below
219
- # this directory on the HDFS.
220
- #
221
- # @return [Array<String>]
222
- def contents
223
- returning([]) do |paths|
224
- HDFS.fs(:ls, path) do |line|
225
- next if line =~ /^Found.*items$/
226
- paths << line.split.last
227
- end
228
- end
229
- end
230
-
231
- # Return the resources directly below this directory on the
232
- # HDFS.
233
- #
234
- # @return [Array<IMW::Resource>]
235
- def resources
236
- contents.map { |path| IMW.open(path) }
237
- end
238
-
239
- end
240
- end
241
- end
242
- end
@@ -1,161 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
-
5
- # Defines methods for accessing a resource over HTTP. Uses
6
- # RestClient to implement the basic HTTP verbs (GET, POST, PUT,
7
- # DELETE, HEAD).
8
- module HTTP
9
-
10
- # Is this resource being accessed via HTTP?
11
- #
12
- # @return [true, false]
13
- def via_http?
14
- true
15
- end
16
-
17
- # Copy this resource to the +new_uri+.
18
- #
19
- # @param [String, IMW::Resource] new_uri
20
- # @return [IMW::Resource] the new resource
21
- def cp new_uri
22
- IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
23
- end
24
-
25
-
26
- # Return the basename of the URI or <tt>_index</tt> if it's
27
- # blank, as in the case of <tt>http://www.google.com</tt>.
28
- #
29
- # @return [String]
30
- def effective_basename
31
- (basename.blank? || basename =~ %r{^/*$}) ? "_index" : basename
32
- end
33
-
34
- # Send a GET request to this resource's URI.
35
- #
36
- # If the response doesn't have HTTP code 2xx, a RestClient
37
- # error will be raised.
38
- #
39
- # If a block is given then the response will be passed to the
40
- # block, even in case of a non-2xx code.
41
- #
42
- # See the documentation for
43
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
44
- # for more information.
45
- #
46
- # @param [Hash] headers the headers to include in the request
47
- # @yield [RestClient::Response] the response from the server
48
- # @return [RestClient::Response] the response from the server
49
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
50
- def get headers={}, &block
51
- make_restclient_request do
52
- RestClient.get(uri.to_s, headers, &block)
53
- end
54
- end
55
-
56
- # Send a POST request to this resource's URI with data
57
- # +payload+.
58
- #
59
- # If the response doesn't have HTTP code 2xx, a RestClient
60
- # error will be raised.
61
- #
62
- # If a block is given then the response will be passed to the
63
- # block, even in case of a non-2xx code.
64
- #
65
- # See the documentation for
66
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
67
- # for more information.
68
- #
69
- # @param [Hash, String] payload the data to send
70
- # @param [Hash] headers the headers to include in the request
71
- # @yield [RestClient::Response] the response from the server
72
- # @return [RestClient::Response] the response from the server
73
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
74
- def post payload, headers={}, &block
75
- make_restclient_request do
76
- RestClient.post(uri.to_s, payload, headers, &block)
77
- end
78
- end
79
-
80
- # Send a PUT request to this resource's URI with data
81
- # +payload+.
82
- #
83
- # If the response doesn't have HTTP code 2xx, a RestClient
84
- # error will be raised.
85
- #
86
- # If a block is given then the response will be passed to the
87
- # block, even in case of a non-2xx code.
88
- #
89
- # See the documentation for
90
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
91
- # for more information.
92
- #
93
- # @param [Hash, String] payload the data to send
94
- # @param [Hash] headers the headers to include in the request
95
- # @yield [RestClient::Response] the response from the server
96
- # @return [RestClient::Response] the response from the server
97
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
98
- def put payload, headers={}, &block
99
- make_restclient_request do
100
- RestClient.put(uri.to_s, payload, headers, &block)
101
- end
102
- end
103
-
104
- # Send a DELETE request to this resource's URI.
105
- #
106
- # If the response doesn't have HTTP code 2xx, a RestClient
107
- # error will be raised.
108
- #
109
- # If a block is given then the response will be passed to the
110
- # block, even in case of a non-2xx code.
111
- #
112
- # See the documentation for
113
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
114
- # for more information.
115
- #
116
- # @param [Hash] headers the headers to include in the request
117
- # @yield [RestClient::Response] the response from the server
118
- # @return [RestClient::Response] the response from the server
119
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
120
- def delete headers={}, &block
121
- make_restclient_request do
122
- RestClient.delete(uri.to_s, headers, &block)
123
- end
124
- end
125
-
126
- # Send a HEAD request to this resource's URI.
127
- #
128
- # If the response doesn't have HTTP code 2xx, a RestClient
129
- # error will be raised.
130
- #
131
- # If a block is given then the response will be passed to the
132
- # block, even in case of a non-2xx code.
133
- #
134
- # See the documentation for
135
- # RestClient[http://rdoc.info/projects/archiloque/rest-client]
136
- # for more information.
137
- #
138
- # @param [Hash] headers the headers to include in the request
139
- # @yield [RestClient::Response] the response from the server
140
- # @return [RestClient::Response] the response from the server
141
- # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
142
- def head headers={}, &block
143
- make_restclient_request do
144
- RestClient.head(uri.to_s, headers, &block)
145
- end
146
- end
147
-
148
- protected
149
- def make_restclient_request &block # :nodoc
150
- require 'restclient'
151
- begin
152
- yield
153
- rescue RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed => e
154
- raise IMW::NetworkError.new("#{e.class} -- #{e.message}")
155
- end
156
- end
157
- end
158
- end
159
- end
160
- end
161
-
@@ -1,137 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Schemes
4
-
5
- # Defines methods for reading and writing data to {Amazon
6
- # S3}[http://aws.amazon.com/s3] buckets.
7
- #
8
- # IMW.open('s3://my_bucket/path/to/some/file.csv')
9
- #
10
- # Learn more about {Amazon Web Services}[http://aws.amazon.com].
11
- module S3
12
-
13
- # For an S3 resource, the bucket is just the hostname.
14
- #
15
- # @return [String]
16
- def bucket
17
- host
18
- end
19
-
20
- # Is this resource an S3 resource?
21
- #
22
- # @return [true, false]
23
- def on_s3?
24
- true
25
- end
26
- alias_method :is_s3?, :on_s3?
27
-
28
- # Copy this resource to the +new_uri+.
29
- #
30
- # @param [String, IMW::Resource] new_uri
31
- # @return [IMW::Resource] the new resource
32
- def cp new_uri
33
- IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
34
- end
35
-
36
- # The AWS::S3::S3Object corresponding to this resource.
37
- def s3_object
38
- self.class.make_connection!
39
- @s3_object ||= AWS::S3::S3Object.new(path, bucket)
40
- end
41
-
42
- # Does this resource exist on S3?
43
- #
44
- # @return [true, false]
45
- def exist?
46
- s3_object.exists?
47
- end
48
- alias_method :exists?, :exist?
49
-
50
- # Remove this resource from S3.
51
- #
52
- # @return [IMW::Resource] the deleted object
53
- def rm
54
- s3_object.delete
55
- end
56
- alias_method :rm!, :rm
57
-
58
- # Return the S3N URL for this S3 object
59
- #
60
- # resource = IMW.open('s3://my_bucket/path/to/some/obj')
61
- # resource.s3n_url
62
- # => 's3n://my_bucket/path/to/some/obj'
63
- #
64
- # @return [String]
65
- def s3n_url
66
- uri.to_s.gsub(/^s3:/, 's3n:')
67
- end
68
-
69
- # Return the contents of this S3 object.
70
- #
71
- # @return [String]
72
- def read
73
- s3_object.value
74
- end
75
-
76
- # Store +source+ into +destination+.
77
- #
78
- # @param [String, IMW::Resource, #io] source
79
- # @param [String, IMW::Resource, #path, #bucket] destination
80
- # @return [IMW::Resource] the new S3 object
81
- def self.put source, destination
82
- source = IMW.open(source)
83
- destintation = IMW.open(destination)
84
- raise IMW::ArgumentError.new("destination must be on S3 -- #{destination.uri} given") unless destination.on_s3?
85
- make_connection!
86
- AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
87
- destination
88
- end
89
-
90
- # Download +source+ from S3 into +destination+.
91
- #
92
- # @param [String, IMW::Resource, #path, #bucket] source
93
- # @param [String, IMW::Resource, #write] destination
94
- # @return [IMW::Resource] the new resource
95
- def self.get source, destination
96
- source = IMW.open(source)
97
- destination = IMW.open(destination)
98
- make_connection!
99
- AWS::S3::Object.stream(source.path, source.bucket) do |chunk|
100
- destination.write(chunk)
101
- end
102
- destination.close
103
- destination.reopen
104
- end
105
-
106
- # Copy S3 resource +source+ to +destination+.
107
- #
108
- # @param [String, IMW::Resource, #path, #bucket] source
109
- # @param [String, IMW::Resource, #path, #bucket] destination
110
- # @return [IMW::Resource] the new resource
111
- def self.copy source, destination
112
- source = IMW.open(source)
113
- destination = IMW.open(destination)
114
- raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
115
- make_connection!
116
- AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
117
- destination
118
- end
119
-
120
- protected
121
- # Make an S3 connection.
122
- #
123
- # Uses settings defined in IMW::AWS_CREDENTIALS.
124
- #
125
- # @return [AWS
126
- def self.make_connection!
127
- return @connection if @connection
128
- raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
129
- require 'aws/s3'
130
- @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
131
- end
132
-
133
- end
134
- end
135
- end
136
- end
137
-