swineherd 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +201 -0
- data/README.textile +207 -0
- data/Rakefile +30 -0
- data/VERSION +1 -0
- data/bin/hadoop-stream +35 -0
- data/bin/hdp-tree +26 -0
- data/examples/pagerank/data/seinfeld_network.tsv +429 -0
- data/examples/pagerank/pagerank.rb +99 -0
- data/examples/pagerank/scripts/cut_off_list.rb +16 -0
- data/examples/pagerank/scripts/histogram.R +5 -0
- data/examples/pagerank/scripts/pagerank.pig +20 -0
- data/examples/pagerank/scripts/pagerank_initialize.pig +24 -0
- data/lib/swineherd.rb +11 -0
- data/lib/swineherd/filesystem.rb +26 -0
- data/lib/swineherd/filesystem/README_filesystem.textile +47 -0
- data/lib/swineherd/filesystem/basefilesystem.rb +125 -0
- data/lib/swineherd/filesystem/filesystems.rb +103 -0
- data/lib/swineherd/filesystem/hadoopfilesystem.rb +263 -0
- data/lib/swineherd/filesystem/localfilesystem.rb +83 -0
- data/lib/swineherd/filesystem/localfs.rb +11 -0
- data/lib/swineherd/filesystem/s3filesystem.rb +249 -0
- data/lib/swineherd/script.rb +74 -0
- data/lib/swineherd/script/hadoop_script.rb +59 -0
- data/lib/swineherd/script/pig_script.rb +46 -0
- data/lib/swineherd/script/r_script.rb +14 -0
- data/lib/swineherd/script/wukong_script.rb +31 -0
- data/lib/swineherd/template.rb +45 -0
- data/lib/swineherd/workflow.rb +53 -0
- data/lib/swineherd/workflow/job.rb +60 -0
- data/notes.txt +20 -0
- data/swineherd.gemspec +97 -0
- data/tests/test_filesystem.rb +105 -0
- data/tests/test_s3_filesystem.rb +132 -0
- data/tests/testcfg.yaml +7 -0
- metadata +204 -0
data/lib/swineherd/filesystem/hadoopfilesystem.rb
@@ -0,0 +1,263 @@
module Swineherd

  #
  # Methods for dealing with hadoop distributed file system (hdfs). This class
  # requires that you run with JRuby as it makes use of the native java hadoop
  # libraries.
  #
  class HadoopFileSystem

    include Swineherd::BaseFileSystem

    attr_accessor :conf, :hdfs

    #
    # Initialize a new hadoop file system, needs path to hadoop configuration
    #
    def initialize *args
      check_and_set_environment
      @conf = Java::org.apache.hadoop.conf.Configuration.new
      @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf)
    end

    #
    # Make sure environment is sane then set up environment for use
    #
    def check_and_set_environment
      check_env
      set_env
    end

    def open path, mode="r", &blk
      HadoopFile.new(path,mode,self,&blk)
    end

    def rm path
      @hdfs.delete(Path.new(path), true)
      [path]
    end

    def exists? path
      @hdfs.exists(Path.new(path))
    end

    def mv srcpath, dstpath
      @hdfs.rename(Path.new(srcpath), Path.new(dstpath))
    end

    def cp srcpath, dstpath
      FileUtil.copy(@hdfs, Path.new(srcpath), @hdfs, Path.new(dstpath), false, @conf)
    end

    def mkpath path
      @hdfs.mkdirs(Path.new(path))
      path
    end

    def type path
      return "unknown" unless exists? path
      status = @hdfs.get_file_status(Path.new(path))
      return "directory" if status.is_dir?
      "file"
      # case
      # when status.isFile then
      #   return "file"
      # when status.is_directory? then
      #   return "directory"
      # when status.is_symlink? then
      #   return "symlink"
      # end
    end

    def entries dirpath
      return unless type(dirpath) == "directory"
      list = @hdfs.list_status(Path.new(dirpath))
      list.map{|path| path.get_path.to_s} rescue []
    end

    #
    # Merge all part files in a directory into one file.
    #
    def merge srcdir, dstfile
      FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "")
    end

    #
    # This is hackety. Use with caution.
    #
    def stream input, output
      require 'uri'
      input_fs_scheme = URI.parse(input).scheme
      output_fs_scheme = URI.parse(output).scheme
      system("#{@hadoop_home}/bin/hadoop \\
        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
        -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\
        -D mapred.min.split.size=1000000000 \\
        -D mapred.reduce.tasks=0 \\
        -mapper \"/bin/cat\" \\
        -input \"#{input}\" \\
        -output \"#{output}\"")
    end

    #
    # BZIP
    #
    def bzip input, output
      system("#{@hadoop_home}/bin/hadoop \\
        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
        -D mapred.output.compress=true \\
        -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
        -D mapred.reduce.tasks=1 \\
        -mapper \"/bin/cat\" \\
        -reducer \"/bin/cat\" \\
        -input \"#{input}\" \\
        -output \"#{output}\"")
    end

    #
    # Copy hdfs file to local filesystem
    #
    def copy_to_local srcfile, dstfile
      @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile))
    end

    #
    # Copy local file to hdfs filesystem
    #
    def copy_from_local srcfile, dstfile
      @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
    end

    def close *args
      @hdfs.close
    end

    class HadoopFile
      attr_accessor :path, :handle, :hdfs

      #
      # In order to open input and output streams we must pass around the hadoop fs object itself
      #
      def initialize path, mode, fs, &blk
        @fs = fs
        @path = Path.new(path)
        case mode
        when "r" then
          raise "#{@fs.type(path)} is not a readable file - #{path}" unless @fs.type(path) == "file"
          @handle = @fs.hdfs.open(@path).to_io(&blk)
        when "w" then
          # Open path for writing
          raise "Path #{path} is a directory." unless (@fs.type(path) == "file") || (@fs.type(path) == "unknown")
          @handle = @fs.hdfs.create(@path).to_io.to_outputstream
          if block_given?
            yield self
            self.close # muy muy importante
          end
        end
      end

      def read
        @handle.read
      end

      def readline
        @handle.readline
      end

      def write string
        @handle.write(string.to_java_string.get_bytes)
      end

      def puts string
        write(string+"\n")
      end

      def close
        @handle.close
      end

    end

    # #
    # # Distributed streaming from input to output
    # #
    #
    # #
    # # Given an array of input dirs, stream all into output dir and remove duplicate records.
    # # Reasonable default hadoop streaming options are chosen.
    # #
    # def self.merge inputs, output, options = {}
    #   options[:reduce_tasks] ||= 25
    #   options[:partition_fields] ||= 2
    #   options[:sort_fields] ||= 2
    #   options[:field_separator] ||= '/t'
    #   names = inputs.map{|inp| File.basename(inp)}.join(',')
    #   cmd = "${HADOOP_HOME}/bin/hadoop \\
    #     jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \\
    #     -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
    #     -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
    #     -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
    #     -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
    #     -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
    #     -D mapred.min.split.size=1000000000 \\
    #     -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
    #     -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
    #     -mapper \"/bin/cat\" \\
    #     -reducer \"/usr/bin/uniq\" \\
    #     -input \"#{inputs.join(',')}\" \\
    #     -output \"#{output}\""
    #   puts cmd
    #   system cmd
    # end
    #
    # #
    # # Concatenates a hadoop dir or file into a local file
    # #
    # def self.cat_to_local src, dest
    #   system %Q{hadoop fs -cat #{src}/[^_]* > #{dest}} unless File.exist?(dest)
    # end
    #

    #
    # Check that we are running with jruby, check for hadoop home. hadoop_home
    # is preferentially set to the HADOOP_HOME environment variable if it's set,
    # '/usr/local/share/hadoop' if HADOOP_HOME isn't defined, and
    # '/usr/lib/hadoop' if '/usr/local/share/hadoop' doesn't exist. If all else
    # fails inform the user that HADOOP_HOME really should be set.
    #
    def check_env
      begin
        require 'java'
      rescue LoadError => e
        raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message
      end
      @hadoop_home = (ENV['HADOOP_HOME'] || '/usr/local/share/hadoop')
      @hadoop_home = '/usr/lib/hadoop' unless File.exist? @hadoop_home
      raise "\nHadoop installation not found, try setting HADOOP_HOME\n" unless File.exist? @hadoop_home
    end

    #
    # Place hadoop jars in class path, require appropriate jars, set hadoop conf
    #
    def set_env
      require 'java'
      @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
      @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
      $CLASSPATH << @hadoop_conf
      Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}

      java_import 'org.apache.hadoop.conf.Configuration'
      java_import 'org.apache.hadoop.fs.Path'
      java_import 'org.apache.hadoop.fs.FileSystem'
      java_import 'org.apache.hadoop.fs.FileUtil'
      java_import 'org.apache.hadoop.mapreduce.lib.input.FileInputFormat'
      java_import 'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat'
      java_import 'org.apache.hadoop.fs.FSDataOutputStream'
      java_import 'org.apache.hadoop.fs.FSDataInputStream'

    end

  end

end
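For reference, a minimal usage sketch of HadoopFileSystem (not part of the published files). It assumes a JRuby runtime with HADOOP_HOME pointing at a Hadoop install, that `require 'swineherd'` loads the filesystem classes, and the /tmp HDFS path shown is purely illustrative:

    # Run under JRuby; initialize loads the Hadoop jars via check_and_set_environment.
    require 'swineherd'

    fs = Swineherd::HadoopFileSystem.new

    # The block form of open writes and then closes the HDFS output stream.
    fs.open("/tmp/swineherd_example.txt", "w") do |f|
      f.puts "hello from swineherd"
    end

    puts fs.type("/tmp/swineherd_example.txt") # => "file"

    reader = fs.open("/tmp/swineherd_example.txt")
    puts reader.read
    reader.close

    fs.rm("/tmp/swineherd_example.txt")
    fs.close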
data/lib/swineherd/filesystem/localfilesystem.rb
@@ -0,0 +1,83 @@
require 'fileutils'
module Swineherd

  class LocalFileSystem

    include Swineherd::BaseFileSystem

    def initialize *args
    end

    def open path, mode="r", &blk
      return LocalFile.new path, mode, &blk
    end

    def rm path
      FileUtils.rm_r path
    end

    def exists? path
      File.exists?(path)
    end

    def mv srcpath, dstpath
      FileUtils.mv(srcpath,dstpath)
    end

    def cp srcpath, dstpath
      FileUtils.cp_r(srcpath,dstpath)
    end

    def mkpath path
      FileUtils.mkpath path
    end

    def type path
      case
      when File.symlink?(path) then
        return "symlink"
      when File.directory?(path) then
        return "directory"
      when File.file?(path) then
        return "file"
      end
      "unknown"
    end

    def entries dirpath
      return unless (type(dirpath) == "directory")
      Dir.entries(dirpath)
    end

    class LocalFile
      attr_accessor :path, :scheme, :handle, :mode

      def initialize path, mode="r", &blk
        @path = path
        @mode = mode
        @handle = File.open(path,mode,&blk)
      end

      def open path, mode="r", &blk
        initialize(path,mode,&blk)
      end

      def read
        @handle.read
      end

      def readline
        @handle.gets
      end

      def write string
        @handle.write(string)
      end

      def close
        @handle.close
      end
    end

  end
end
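The local implementation mirrors the same interface on top of File and FileUtils. A small sketch, under the same assumption that `require 'swineherd'` loads the class (paths are illustrative):

    require 'swineherd'

    fs = Swineherd::LocalFileSystem.new
    fs.mkpath("/tmp/swineherd_local")

    # LocalFile hands the block straight to File.open, so the block receives a File handle.
    fs.open("/tmp/swineherd_local/example.txt", "w") do |f|
      f.write("hello from swineherd\n")
    end

    puts fs.type("/tmp/swineherd_local/example.txt") # => "file"
    puts fs.entries("/tmp/swineherd_local").inspect  # => [".", "..", "example.txt"] (order may vary)

    fs.rm("/tmp/swineherd_local")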
data/lib/swineherd/filesystem/localfs.rb
@@ -0,0 +1,11 @@
module Swineherd
  class LocalFS
    def self.check_paths paths
      exist_count = 0 # no outputs exist
      paths.each{|path| exist_count += 1 if File.exist?(path) }
      raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
      return true if exist_count == 0
      false
    end
  end
end
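check_paths answers whether a job's outputs still need to be produced: true when none of the given paths exist, false when all of them exist, and an exception when only some exist. A quick illustration (run_job is a hypothetical launcher, not part of the gem):

    require 'swineherd'

    outputs = ["/tmp/job_out/part-00000", "/tmp/job_out/part-00001"]

    if Swineherd::LocalFS.check_paths(outputs)
      run_job(outputs) # no outputs yet -- safe to (re)run
    else
      # every output already exists -- nothing to do
    end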
data/lib/swineherd/filesystem/s3filesystem.rb
@@ -0,0 +1,249 @@
require 'tempfile'
module Swineherd

  #
  # Methods for interacting with Amazon's Simple Storage Service (S3).
  #
  class S3FileSystem

    include Swineherd::BaseFileSystem

    attr_accessor :s3

    #
    # Initialize a new s3 file system, needs path to aws keys
    #
    def initialize aws_access_key_id, aws_secret_access_key
      require 'right_aws'
      @s3 = RightAws::S3.new(aws_access_key_id, aws_secret_access_key)
    end

    def open path, mode="r", &blk
      S3File.new(path,mode,self,&blk)
    end

    def rm path
      bkt = bucket(path)
      key = key_path(path)
      if key.empty? # only the bucket was passed in, delete it
        @s3.interface.force_delete_bucket(bkt)
      else
        case type(path)
        when "directory" then
          keys_to_delete = lr(path)
          keys_to_delete.each do |k|
            key_to_delete = key_path(k)
            @s3.interface.delete(bkt, key_to_delete)
          end
          keys_to_delete
        when "file" then
          @s3.interface.delete(bkt, key)
          [path]
        end
      end
    end

    def bucket path
      uri = URI.parse(path)
      uri.path.split('/').reject{|x| x.empty?}.first
    end

    def key_path path
      uri = URI.parse(path)
      File.join(uri.path.split('/').reject{|x| x.empty?}[1..-1])
    end

    def needs_trailing_slash pre
      has_trailing_slash = pre.end_with? '/'
      is_empty_prefix = pre.empty?
      !(has_trailing_slash || is_empty_prefix)
    end

    def full_contents path
      bkt = bucket(path)
      pre = key_path(path)
      pre += '/' if needs_trailing_slash(pre)
      contents = []
      s3.interface.incrementally_list_bucket(bkt, {'prefix' => pre, 'delimiter' => '/'}) do |res|
        contents += res[:common_prefixes].map{|c| File.join(bkt,c)}
        contents += res[:contents].map{|c| File.join(bkt, c[:key])}
      end
      contents
    end

    def exists? path
      object = File.basename(path)
      search_dir = File.dirname(path)
      case search_dir
      when '.' then # only a bucket was passed in
        begin
          (full_contents(object).size > 0)
        rescue RightAws::AwsError => e
          if e.message =~ /nosuchbucket/i
            false
          else
            raise e
          end
        end
      else
        search_dir_contents = full_contents(search_dir).map{|c| File.basename(c).gsub(/\//, '')}
        search_dir_contents.include?(object)
      end
    end

    def mv srcpath, dstpath
      src_bucket = bucket(srcpath)
      dst_bucket = bucket(dstpath)
      dst_key_path = key_path(dstpath)
      mkpath(dstpath)
      case type(srcpath)
      when "directory" then
        paths_to_copy = lr(srcpath)
        common_dir = common_directory(paths_to_copy)
        paths_to_copy.each do |path|
          src_key = key_path(path)
          dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
          @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key)
        end
      when "file" then
        @s3.interface.move(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
      end
    end

    def cp srcpath, dstpath
      src_bucket = bucket(srcpath)
      dst_bucket = bucket(dstpath)
      dst_key_path = key_path(dstpath)
      mkpath(dstpath)
      case type(srcpath)
      when "directory" then
        paths_to_copy = lr(srcpath)
        common_dir = common_directory(paths_to_copy)
        paths_to_copy.each do |path|
          src_key = key_path(path)
          dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
          @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key)
        end
      when "file" then
        @s3.interface.copy(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
      end
    end

    #
    # This is a bit funny, there's actually no need to create a 'path' since
    # s3 is nothing more than a glorified key-value store. When you create a
    # 'file' (key) the 'path' will be created for you. All we do here is create
    # the bucket unless it already exists.
    #
    def mkpath path
      bkt = bucket(path)
      key = key_path(path)
      if key.empty?
        @s3.interface.create_bucket(bkt)
      else
        @s3.interface.create_bucket(bkt) unless exists? bkt
      end
      path
    end

    def type path
      return "unknown" unless exists? path
      return "directory" if full_contents(path).size > 0
      "file"
    end

    def entries dirpath
      return unless type(dirpath) == "directory"
      full_contents(dirpath)
    end

    # Recursively list paths
    def lr path
      paths = entries(path)
      if paths
        paths.map{|e| lr(e)}.flatten
      else
        path
      end
    end

    #
    # Ick.
    #
    def common_directory paths
      dirs = paths.map{|path| path.split('/')}
      min_size = dirs.map{|splits| splits.size}.min
      dirs.map!{|splits| splits[0...min_size]}
      uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
      dirs[0][0...uncommon_idx].join('/')
    end

    def close *args
    end

    class S3File
      attr_accessor :path, :handle, :fs

      #
      # In order to open input and output streams we must pass around the s3 fs object itself
      #
      def initialize path, mode, fs, &blk
        @fs = fs
        @path = path
        case mode
        when "r" then
          raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file"
        when "w" then
          raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown")
          @handle = Tempfile.new('s3filestream')
          if block_given?
            yield self
            close
          end
        end
      end

      #
      # Faster than iterating
      #
      def read
        resp = fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path))
        resp
      end

      #
      # This is a little hackety. That is, once you call (.each) on the object the full object starts
      # downloading...
      #
      def readline
        @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
        begin
          @handle.next
        rescue StopIteration, NoMethodError
          @handle = nil
          raise EOFError.new("end of file reached")
        end
      end

      def write string
        @handle.write(string)
      end

      def puts string
        write(string+"\n")
      end

      def close
        if @handle
          @handle.read
          fs.s3.interface.put(fs.bucket(path), fs.key_path(path), File.open(@handle.path, 'r'))
          @handle.close
        end
        @handle = nil
      end

    end

  end

end
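A usage sketch for the S3 adapter (credentials are read from the environment here; the bucket and key names are illustrative and the bucket is assumed to already exist). Note that bucket and key_path both parse the path component, so the bucket name is written as the first path segment rather than as a URI host:

    require 'swineherd'

    s3fs = Swineherd::S3FileSystem.new(ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])

    # "w" mode buffers into a Tempfile and uploads it when the handle is closed.
    s3fs.open("example-bucket/swineherd/example.txt", "w") do |f|
      f.puts "hello from swineherd"
    end

    puts s3fs.exists?("example-bucket/swineherd/example.txt") # => true
    puts s3fs.open("example-bucket/swineherd/example.txt").read

    s3fs.rm("example-bucket/swineherd/example.txt")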