imw 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
@@ -0,0 +1,242 @@
1
+ module IMW
2
+ module Resources
3
+ module Schemes
4
+
5
# Defines methods for reading and writing data to/from an
# HDFS[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
#
# Learn more about Hadoop[http://hadoop.apache.org] and the
# {Hadoop Distributed
# Filesystem}[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
#
# Objects extended with this module must provide +path+ and
# +should_exist!+ (both are defined elsewhere).
module HDFS

  # Hook run when an object is extended with this module. Checks
  # whether the object is a file or a directory on the HDFS and
  # extends it with the corresponding module.
  #
  # @param [Object] obj the object being extended
  def self.extended obj
    obj.extend(obj.is_directory? ? HDFSDirectory : HDFSFile)
  end

  # Is this resource an HDFS resource?
  #
  # @return [true, false]
  def on_hdfs?
    true
  end
  alias_method :is_hdfs?, :on_hdfs?

  # Copy this resource to the +new_uri+.
  #
  # @param [String, IMW::Resource] new_uri
  # @return [IMW::Resource] the new resource
  def cp new_uri
    IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
  end

  # Move this resource to the +new_uri+.
  #
  # @param [String, IMW::Resource] new_uri
  # @return [IMW::Resource] the new resource
  def mv new_uri
    IMW::Transforms::Transferer.new(:mv, self, new_uri).transfer!
  end

  # Delete this resource from the HDFS.
  #
  # The cached properties are discarded afterwards so that the
  # next call to +exist?+, +size+, &c. asks the HDFS again instead
  # of reporting the state from before the deletion.
  #
  # @option options [true,false] :skip_trash (also accepted as
  #   +:skip+ or +:skipTrash+) pass <tt>-skipTrash</tt> to Hadoop
  # @return [IMW::Resource] this resource
  def rm options={}
    should_exist!("Cannot delete.")
    args = [:rm]
    args << '-skipTrash' if options[:skip] || options[:skip_trash] || options[:skipTrash]
    args << path
    HDFS.fs(*args)
    @exist = @size = @num_dirs = @num_files = @hdfs_path = nil
    self
  end
  alias_method :rm!, :rm

  # Does this path exist on the HDFS?
  #
  # This value is cached.  Call +refresh!+ to refresh the cache
  # manually.
  #
  # @return [true, false]
  def exist?
    return @exist unless @exist.nil?
    refresh!
    @exist
  end
  alias_method :exists?, :exist?

  # Return the size (in bytes) of this resource on the HDFS.
  #
  # This value is cached.  Call +refresh!+ to refresh the cache
  # manually.
  #
  # +should_exist!+ is consulted on every call (not only the first)
  # so that a missing path is reported the same way each time
  # rather than leaking the cached +false+ placeholder.
  #
  # @return [Fixnum]
  def size
    refresh! if @size.nil?
    should_exist!("Cannot report size")
    @size
  end

  # Return the number of directories contained at or below this
  # path on the HDFS.
  #
  # This value is cached.  Call +refresh!+ to refresh the cache
  # manually.
  #
  # @return [Fixnum]
  def num_dirs
    refresh! if @num_dirs.nil?
    should_exist!("Cannot report number of directories.")
    @num_dirs
  end

  # Return the number of files contained at or below this path
  # on the HDFS.
  #
  # This value is cached.  Call +refresh!+ to refresh the cache
  # manually.
  #
  # @return [Fixnum]
  def num_files
    refresh! if @num_files.nil?
    should_exist!("Cannot report number of files.")
    @num_files
  end

  # Is this resource an HDFS directory?
  #
  # @return [true, false]
  def is_directory?
    exist? && num_dirs > 0
  end

  # Refresh the cached file properties by running
  # <tt>hadoop fs -count</tt> on this resource's path.
  #
  # When the path cannot be found the cached properties are set to
  # +false+ (not +nil+) so the lookup is not repeated on every
  # query.
  #
  # @return [IMW::Resource] this resource
  def refresh!
    response = HDFS.fs(:count, path)
    if response.blank? || response =~ /^Can not find listing for/
      @exist = false
      @num_dirs, @num_files, @size, @hdfs_path = false, false, false, false
    else
      @exist = true
      parts = response.split
      @num_dirs, @num_files, @size = parts[0..2].map(&:to_i)
      @hdfs_path = parts.last
    end
    self
  end

  # Execute +command+ with +args+ on the Hadoop Distributed
  # Filesystem (HDFS).
  #
  # If passed a block, yield each line of the output from the
  # command, else just return the output.
  #
  # Each argument is shell-escaped, so a path containing spaces or
  # shell metacharacters reaches Hadoop as a single, literal
  # argument.
  #
  # Try running `hadoop fs -help' for more information.
  #
  # @param [String, Symbol] command the command to run.
  # @param [String, Symbol] args the arguments to pass the command
  # @yield [String] each line of the command's output
  # @return [String, Array<String>] the command's output, or the
  #   array of output lines when a block is given
  def self.fs command, *args
    require 'shellwords'
    escaped_args   = args.compact.map { |arg| Shellwords.escape(arg.to_s) }
    command_string = "#{executable} fs -#{command} #{escaped_args.join(' ')}"
    # FIXME or else it just spams the screen when we do HDFS#refresh!
    command_string += " 2>&1" if command.to_s == 'count'
    output = `#{command_string}`.chomp
    if block_given?
      output.split("\n").each do |line|
        yield line
      end
    else
      output
    end
  end

  # Returns the path to the Hadoop executable.  The result of the
  # lookup is cached on the module.
  #
  # This is a module-level method, so a +protected+ marker would
  # have no effect on it; it is callable as HDFS.executable.
  #
  # @return [String]
  # @raise [IMW::Error] if no +hadoop+ command is found
  def self.executable
    @executable ||= begin
                      string = `which hadoop`.chomp
                      raise IMW::Error.new("Could not find hadoop command.  Is Hadoop installed?") if string.blank?
                      string
                    end
  end
end
169
+
170
# Defines methods for reading data from HDFS files.
#
# Objects extended with this module must provide +path+.
module HDFSFile

  # Return the contents of this HDFS file as a string.
  #
  # Be VERY careful how you use this!  The whole file is read
  # into memory.
  #
  # @return [String]
  def read
    HDFS.fs(:cat, path)
  end

  # Iterate through each line of this HDFS resource.
  #
  # @yield [String] each line of the file
  def each &block
    HDFS.fs(:cat, path, &block)
  end

  # Return a handle on a StringIO object representing the
  # content in this HDFS file.  The handle is created once and
  # then reused.
  #
  # Be VERY careful how you use this!  It is a StringIO object
  # so the whole HDFS file is read into a string before
  # returning the handle.
  #
  # @return [StringIO]
  def io
    require 'stringio'
    @io ||= StringIO.new(read)
  end

  # Map over the lines of this HDFS resource.
  #
  # @yield [String] each line of the file
  # @return [Array] the result of the block on each line
  def map &block
    # Plain accumulation: does not rely on ActiveSupport's
    # Object#returning.
    results = []
    HDFS.fs(:cat, path) do |line|
      results << block.call(line)
    end
    results
  end

end
214
+
215
# Defines methods for listing contents of HDFS directories.
#
# Objects extended with this module must provide +path+.
module HDFSDirectory

  # Return the paths of all files and directories directly below
  # this directory on the HDFS.
  #
  # The listing comes from <tt>hadoop fs -ls</tt>; the "Found N
  # items" header and blank lines are skipped, and the last
  # whitespace-separated field of every other line is taken as the
  # path.
  #
  # @return [Array<String>]
  def contents
    # Plain accumulation: does not rely on ActiveSupport's
    # Object#returning.
    paths = []
    HDFS.fs(:ls, path) do |line|
      next if line =~ /^Found.*items$/
      entry = line.split.last
      paths << entry unless entry.nil? # blank line => nothing to record
    end
    paths
  end

  # Return the resources directly below this directory on the
  # HDFS.
  #
  # @return [Array<IMW::Resource>]
  def resources
    contents.map { |child_path| IMW.open(child_path) }
  end

end
240
+ end
241
+ end
242
+ end
@@ -0,0 +1,161 @@
1
+ module IMW
2
+ module Resources
3
+ module Schemes
4
+
5
+ # Defines methods for accessing a resource over HTTP. Uses
6
+ # RestClient to implement the basic HTTP verbs (GET, POST, PUT,
7
+ # DELETE, HEAD).
8
+ module HTTP
9
+
10
+ # Is this resource being accessed via HTTP?
11
+ #
12
+ # @return [true, false]
13
+ def via_http?
14
+ true
15
+ end
16
+
17
+ # Copy this resource to the +new_uri+.
18
+ #
19
+ # @param [String, IMW::Resource] new_uri
20
+ # @return [IMW::Resource] the new resource
21
+ def cp new_uri
22
+ IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
23
+ end
24
+
25
+
26
+ # Return the basename of the URI or <tt>_index</tt> if it's
27
+ # blank, as in the case of <tt>http://www.google.com</tt>.
28
+ #
29
+ # @return [String]
30
+ def effective_basename
31
+ (basename.blank? || basename =~ %r{^/*$}) ? "_index" : basename
32
+ end
33
+
34
+ # Send a GET request to this resource's URI.
35
+ #
36
+ # If the response doesn't have HTTP code 2xx, a RestClient
37
+ # error will be raised.
38
+ #
39
+ # If a block is given then the response will be passed to the
40
+ # block, even in case of a non-2xx code.
41
+ #
42
+ # See the documentation for
43
+ # RestClient[http://rdoc.info/projects/archiloque/rest-client]
44
+ # for more information.
45
+ #
46
+ # @param [Hash] headers the headers to include in the request
47
+ # @yield [RestClient::Response] the response from the server
48
+ # @return [RestClient::Response] the response from the server
49
+ # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
50
+ def get headers={}, &block
51
+ make_restclient_request do
52
+ RestClient.get(uri.to_s, headers, &block)
53
+ end
54
+ end
55
+
56
+ # Send a POST request to this resource's URI with data
57
+ # +payload+.
58
+ #
59
+ # If the response doesn't have HTTP code 2xx, a RestClient
60
+ # error will be raised.
61
+ #
62
+ # If a block is given then the response will be passed to the
63
+ # block, even in case of a non-2xx code.
64
+ #
65
+ # See the documentation for
66
+ # RestClient[http://rdoc.info/projects/archiloque/rest-client]
67
+ # for more information.
68
+ #
69
+ # @param [Hash, String] payload the data to send
70
+ # @param [Hash] headers the headers to include in the request
71
+ # @yield [RestClient::Response] the response from the server
72
+ # @return [RestClient::Response] the response from the server
73
+ # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
74
+ def post payload, headers={}, &block
75
+ make_restclient_request do
76
+ RestClient.post(uri.to_s, payload, headers, &block)
77
+ end
78
+ end
79
+
80
+ # Send a PUT request to this resource's URI with data
81
+ # +payload+.
82
+ #
83
+ # If the response doesn't have HTTP code 2xx, a RestClient
84
+ # error will be raised.
85
+ #
86
+ # If a block is given then the response will be passed to the
87
+ # block, even in case of a non-2xx code.
88
+ #
89
+ # See the documentation for
90
+ # RestClient[http://rdoc.info/projects/archiloque/rest-client]
91
+ # for more information.
92
+ #
93
+ # @param [Hash, String] payload the data to send
94
+ # @param [Hash] headers the headers to include in the request
95
+ # @yield [RestClient::Response] the response from the server
96
+ # @return [RestClient::Response] the response from the server
97
+ # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
98
+ def put payload, headers={}, &block
99
+ make_restclient_request do
100
+ RestClient.put(uri.to_s, payload, headers, &block)
101
+ end
102
+ end
103
+
104
+ # Send a DELETE request to this resource's URI.
105
+ #
106
+ # If the response doesn't have HTTP code 2xx, a RestClient
107
+ # error will be raised.
108
+ #
109
+ # If a block is given then the response will be passed to the
110
+ # block, even in case of a non-2xx code.
111
+ #
112
+ # See the documentation for
113
+ # RestClient[http://rdoc.info/projects/archiloque/rest-client]
114
+ # for more information.
115
+ #
116
+ # @param [Hash] headers the headers to include in the request
117
+ # @yield [RestClient::Response] the response from the server
118
+ # @return [RestClient::Response] the response from the server
119
+ # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
120
+ def delete headers={}, &block
121
+ make_restclient_request do
122
+ RestClient.delete(uri.to_s, headers, &block)
123
+ end
124
+ end
125
+
126
+ # Send a HEAD request to this resource's URI.
127
+ #
128
+ # If the response doesn't have HTTP code 2xx, a RestClient
129
+ # error will be raised.
130
+ #
131
+ # If a block is given then the response will be passed to the
132
+ # block, even in case of a non-2xx code.
133
+ #
134
+ # See the documentation for
135
+ # RestClient[http://rdoc.info/projects/archiloque/rest-client]
136
+ # for more information.
137
+ #
138
+ # @param [Hash] headers the headers to include in the request
139
+ # @yield [RestClient::Response] the response from the server
140
+ # @return [RestClient::Response] the response from the server
141
+ # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
142
+ def head headers={}, &block
143
+ make_restclient_request do
144
+ RestClient.head(uri.to_s, headers, &block)
145
+ end
146
+ end
147
+
148
+ protected
149
+ def make_restclient_request &block # :nodoc
150
+ require 'restclient'
151
+ begin
152
+ yield
153
+ rescue RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed => e
154
+ raise IMW::NetworkError.new("#{e.class} -- #{e.message}")
155
+ end
156
+ end
157
+ end
158
+ end
159
+ end
160
+ end
161
+
@@ -0,0 +1,137 @@
1
+ module IMW
2
+ module Resources
3
+ module Schemes
4
+
5
+ # Defines methods for reading and writing data to {Amazon
6
+ # S3}[http://aws.amazon.com/s3] buckets.
7
+ #
8
+ # IMW.open('s3://my_bucket/path/to/some/file.csv')
9
+ #
10
+ # Learn more about {Amazon Web Services}[http://aws.amazon.com].
11
+ module S3
12
+
13
+ # For an S3 resource, the bucket is just the hostname.
14
+ #
15
+ # @return [String]
16
+ def bucket
17
+ host
18
+ end
19
+
20
+ # Is this resource an S3 resource?
21
+ #
22
+ # @return [true, false]
23
+ def on_s3?
24
+ true
25
+ end
26
+ alias_method :is_s3?, :on_s3?
27
+
28
+ # Copy this resource to the +new_uri+.
29
+ #
30
+ # @param [String, IMW::Resource] new_uri
31
+ # @return [IMW::Resource] the new resource
32
+ def cp new_uri
33
+ IMW::Transforms::Transferer.new(:cp, self, new_uri).transfer!
34
+ end
35
+
36
+ # The AWS::S3::S3Object corresponding to this resource.
37
+ def s3_object
38
+ self.class.make_connection!
39
+ @s3_object ||= AWS::S3::S3Object.new(path, bucket)
40
+ end
41
+
42
+ # Does this resource exist on S3?
43
+ #
44
+ # @return [true, false]
45
+ def exist?
46
+ s3_object.exists?
47
+ end
48
+ alias_method :exists?, :exist?
49
+
50
+ # Remove this resource from S3.
51
+ #
52
+ # @return [IMW::Resource] the deleted object
53
+ def rm
54
+ s3_object.delete
55
+ end
56
+ alias_method :rm!, :rm
57
+
58
+ # Return the S3N URL for this S3 object
59
+ #
60
+ # resource = IMW.open('s3://my_bucket/path/to/some/obj')
61
+ # resource.s3n_url
62
+ # => 's3n://my_bucket/path/to/some/obj'
63
+ #
64
+ # @return [String]
65
+ def s3n_url
66
+ uri.to_s.gsub(/^s3:/, 's3n:')
67
+ end
68
+
69
+ # Return the contents of this S3 object.
70
+ #
71
+ # @return [String]
72
+ def read
73
+ s3_object.value
74
+ end
75
+
76
+ # Store +source+ into +destination+.
77
+ #
78
+ # @param [String, IMW::Resource, #io] source
79
+ # @param [String, IMW::Resource, #path, #bucket] destination
80
+ # @return [IMW::Resource] the new S3 object
81
+ def self.put source, destination
82
+ source = IMW.open(source)
83
+ destintation = IMW.open(destination)
84
+ raise IMW::ArgumentError.new("destination must be on S3 -- #{destination.uri} given") unless destination.on_s3?
85
+ make_connection!
86
+ AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
87
+ destination
88
+ end
89
+
90
+ # Download +source+ from S3 into +destination+.
91
+ #
92
+ # @param [String, IMW::Resource, #path, #bucket] source
93
+ # @param [String, IMW::Resource, #write] destination
94
+ # @return [IMW::Resource] the new resource
95
+ def self.get source, destination
96
+ source = IMW.open(source)
97
+ destination = IMW.open(destination)
98
+ make_connection!
99
+ AWS::S3::Object.stream(source.path, source.bucket) do |chunk|
100
+ destination.write(chunk)
101
+ end
102
+ destination.close
103
+ destination.reopen
104
+ end
105
+
106
+ # Copy S3 resource +source+ to +destination+.
107
+ #
108
+ # @param [String, IMW::Resource, #path, #bucket] source
109
+ # @param [String, IMW::Resource, #path, #bucket] destination
110
+ # @return [IMW::Resource] the new resource
111
+ def self.copy source, destination
112
+ source = IMW.open(source)
113
+ destination = IMW.open(destination)
114
+ raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
115
+ make_connection!
116
+ AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
117
+ destination
118
+ end
119
+
120
+ protected
121
+ # Make an S3 connection.
122
+ #
123
+ # Uses settings defined in IMW::AWS_CREDENTIALS.
124
+ #
125
+ # @return [AWS
126
+ def self.make_connection!
127
+ return @connection if @connection
128
+ raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
129
+ require 'aws/s3'
130
+ @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
131
+ end
132
+
133
+ end
134
+ end
135
+ end
136
+ end
137
+
@@ -0,0 +1,19 @@
1
module IMW
  module Resources
    # Namespace for the modules which give a resource its
    # scheme-specific (S3, HTTP, HDFS) behavior, together with the
    # handlers used to pick the right module for a URI.
    module Schemes
      autoload :S3,    'imw/resources/schemes/s3'
      autoload :HTTP,  'imw/resources/schemes/http'
      # NOTE(review): imw/resources/schemes/http defines only the
      # HTTP module, so referencing Schemes::HTTPS raises NameError.
      # The autoload is kept for compatibility, but no handler below
      # refers to it.
      autoload :HTTPS, 'imw/resources/schemes/http'
      autoload :HDFS,  'imw/resources/schemes/hdfs'

      # Handlers which extend a resource with scheme specific methods.
      #
      # Each handler is a pair of a module name (resolved relative to
      # IMW::Resources) and a Regexp matched against the resource's
      # URI.  Both http:// and https:// URIs are served by the HTTP
      # module.
      SCHEME_HANDLERS = [
                         ["Schemes::S3",   %r{^s3://}   ],
                         ["Schemes::HTTP", %r{^http://} ],
                         ["Schemes::HTTP", %r{^https://}],
                         ["Schemes::HDFS", %r{^hdfs://} ]
                        ]
    end
  end
end
19
+
@@ -0,0 +1,118 @@
1
+ require 'imw/resources/formats'
2
+ require 'imw/resources/schemes'
3
+ require 'imw/resources/archives_and_compressed'
4
+
5
module IMW

  # IMW::Resources is a namespace in which all the modules which
  # define different kinds of behavior for IMW::Resource objects are
  # defined.
  #
  # When an IMW::Resource is instantiated it eventually calls
  # IMW::Resources#extend_resource! which will iterate through the
  # handlers in IMW::Resources#handlers, extending the resource with
  # modules whose handler conditions are satisfied.
  #
  # A handler is just an Array with two elements.  The first should be
  # a module or a string identifying a module.
  #
  # If the second element is a Regexp, the corresponding module will
  # be used if the regexp matches the resource's URI (as a string).
  #
  # If the second element is a Proc, it will be called with the
  # resource as its only argument and if it returns true then the
  # module will be used.
  #
  # If the second element is +true+ the module will always be used.
  #
  # You can define your own handlers by appending them to
  # IMW::Resources::USER_DEFINED_HANDLERS in your <tt>.imwrc</tt>
  # file.
  module Resources

    autoload :LocalObj,     'imw/resources/local'
    autoload :RemoteObj,    'imw/resources/remote'
    autoload :StringObj,    'imw/resources/string'
    autoload :Transferable, 'imw/resources/transferable'

    # Iterate through IMW::Resources#handlers and extend the given
    # +resource+ with modules whose handler conditions match the
    # resource.
    #
    # @param [IMW::Resource] resource the resource to extend
    # @return [IMW::Resource] the extended resource
    # @raise [IMW::TypeError] if a handler's condition is not a
    #   Regexp, a Proc, or +true+
    def self.extend_resource! resource
      handlers.each do |mod_name, handler|
        case handler
        when Regexp    then extend_resource_with_mod_or_string!(resource, mod_name) if handler =~ resource.uri.to_s
        when Proc      then extend_resource_with_mod_or_string!(resource, mod_name) if handler.call(resource)
        when TrueClass then extend_resource_with_mod_or_string!(resource, mod_name)
        else
          # Instantiate the error class; IMW::TypeError(...) would be
          # a call to a (non-existent) method named TypeError.
          raise IMW::TypeError.new("A handler must be Regexp, Proc, or true")
        end
      end
      resource
    end

    # Basic handlers to determine whether the resource is local,
    # remote, or a string.
    BASIC_HANDLERS = [
                      ["LocalObj",  Proc.new { |resource| resource.scheme == 'file' || resource.scheme.blank?    } ],
                      ["RemoteObj", Proc.new { |resource| resource.scheme != 'file' && resource.scheme.present?  } ],
                      ["StringObj", Proc.new { |resource| resource.is_stringio?                                  } ]
                     ]

    # Define this constant in your configuration file to add your own
    # handlers.
    USER_DEFINED_HANDLERS = [] unless defined?(USER_DEFINED_HANDLERS)

    # include handlers from other modules
    include IMW::Resources::Formats
    include IMW::Resources::Schemes

    # A list of handlers to try.  Define your own handlers in
    # IMW::Resources::USER_DEFINED_HANDLERS.
    #
    # @return [Array]
    def self.handlers
      # order here is important
      BASIC_HANDLERS + SCHEME_HANDLERS + ARCHIVE_AND_COMPRESSED_HANDLERS + FORMAT_HANDLERS + USER_DEFINED_HANDLERS
    end

    # Extend +resource+ with +mod_or_string+.  Will work hard to try
    # and interpret +mod_or_string+ as a module if it's a string.
    #
    # NOTE(review): a string is resolved with class_eval, i.e. it is
    # evaluated as Ruby code, so handler names must come from trusted
    # configuration only.
    #
    # This is a module-level method, so a +protected+ marker would
    # have no effect on it.
    #
    # @param [IMW::Resource] resource the resource to extend
    #
    # @param [Module, String] mod_or_string the module or string
    # representing a module to extend the resource with
    def self.extend_resource_with_mod_or_string! resource, mod_or_string
      if mod_or_string.is_a?(Module)
        resource.extend(mod_or_string)
      else
        # Given a string "Mod::SubMod::SubSubMod" first split it into
        # its parts ["Mod", "SubMod", "SubSubMod"] and then begin
        # class_eval'ing them in order.  Every part is first looked up
        # from this namespace; only if that fails is it looked up in
        # the scope of the part before it.
        #
        # There is almost certainly a better way to do this.
        mod_names = mod_or_string.to_s.split('::')
        mods      = []
        mod_names.each_with_index do |name, index|
          if index == 0
            mods << class_eval(name)
          else
            begin
              mods << class_eval(name)
            rescue NameError
              mods << mods[index - 1].class_eval(name)
            end
          end
        end
        resource.extend(mods.last)
      end
    end
  end
end
117
+
118
+