swineherd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
+ module Swineherd
+
+   #
+   # Methods for dealing with the Hadoop Distributed File System (HDFS). This
+   # class requires that you run with JRuby as it makes use of the native java
+   # hadoop libraries.
+   #
+   class HadoopFileSystem
+
+     include Swineherd::BaseFileSystem
+
+     attr_accessor :conf, :hdfs
+
+     #
+     # Initialize a new hadoop file system. The hadoop configuration is picked
+     # up from the environment (HADOOP_HOME, HADOOP_CONF_DIR) via
+     # check_and_set_environment.
+     #
+     def initialize *args
+       check_and_set_environment
+       @conf = Java::org.apache.hadoop.conf.Configuration.new
+       @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf)
+     end
+
+     #
+     # Make sure environment is sane then set up environment for use
+     #
+     def check_and_set_environment
+       check_env
+       set_env
+     end
+
+     def open path, mode="r", &blk
+       HadoopFile.new(path,mode,self,&blk)
+     end
+
+
+     def rm path
+       @hdfs.delete(Path.new(path), true)
+       [path]
+     end
+
+     def exists? path
+       @hdfs.exists(Path.new(path))
+     end
+
+     def mv srcpath, dstpath
+       @hdfs.rename(Path.new(srcpath), Path.new(dstpath))
+     end
+
+     def cp srcpath, dstpath
+       FileUtil.copy(@hdfs, Path.new(srcpath), @hdfs, Path.new(dstpath), false, @conf)
+     end
+
+     def mkpath path
+       @hdfs.mkdirs(Path.new(path))
+       path
+     end
+
+     def type path
+       return "unknown" unless exists? path
+       status = @hdfs.get_file_status(Path.new(path))
+       return "directory" if status.is_dir?
+       "file"
+       # case
+       # when status.isFile then
+       #   return "file"
+       # when status.is_directory? then
+       #   return "directory"
+       # when status.is_symlink? then
+       #   return "symlink"
+       # end
+     end
+
+     def entries dirpath
+       return unless type(dirpath) == "directory"
+       list = @hdfs.list_status(Path.new(dirpath))
+       list.map{|path| path.get_path.to_s} rescue []
+     end
+
+     #
+     # Merge all part files in a directory into one file.
+     #
+     def merge srcdir, dstfile
+       FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "")
+     end
+
+     #
+     # This is hackety. Use with caution.
+     #
+     def stream input, output
+       require 'uri'
+       input_fs_scheme  = URI.parse(input).scheme
+       output_fs_scheme = URI.parse(output).scheme
+       system("#{@hadoop_home}/bin/hadoop \\
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+        -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\
+        -D mapred.min.split.size=1000000000 \\
+        -D mapred.reduce.tasks=0 \\
+        -mapper \"/bin/cat\" \\
+        -input \"#{input}\" \\
+        -output \"#{output}\"")
+     end
+
+     #
+     # BZIP
+     #
+     def bzip input, output
+       system("#{@hadoop_home}/bin/hadoop \\
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+        -D mapred.output.compress=true \\
+        -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
+        -D mapred.reduce.tasks=1 \\
+        -mapper \"/bin/cat\" \\
+        -reducer \"/bin/cat\" \\
+        -input \"#{input}\" \\
+        -output \"#{output}\"")
+     end
+
+     #
+     # Copy hdfs file to local filesystem
+     #
+     def copy_to_local srcfile, dstfile
+       @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile))
+     end
+
+     #
+     # Copy local file to hdfs filesystem
+     #
+     def copy_from_local srcfile, dstfile
+       @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
+     end
+
+     def close *args
+       @hdfs.close
+     end
+
+     class HadoopFile
+       attr_accessor :path, :handle, :hdfs
+
+       #
+       # In order to open input and output streams we must pass around the
+       # hadoop fs object itself
+       #
+       def initialize path, mode, fs, &blk
+         @fs   = fs
+         @path = Path.new(path)
+         case mode
+         when "r" then
+           raise "#{@fs.type(path)} is not a readable file - #{path}" unless @fs.type(path) == "file"
+           @handle = @fs.hdfs.open(@path).to_io(&blk)
+         when "w" then
+           # Open path for writing
+           raise "Path #{path} is a directory." unless (@fs.type(path) == "file") || (@fs.type(path) == "unknown")
+           @handle = @fs.hdfs.create(@path).to_io.to_outputstream
+           if block_given?
+             yield self
+             self.close # very, very important
+           end
+         end
+       end
+
+       def read
+         @handle.read
+       end
+
+       def readline
+         @handle.readline
+       end
+
+       def write string
+         @handle.write(string.to_java_string.get_bytes)
+       end
+
+       def puts string
+         write(string+"\n")
+       end
+
+       def close
+         @handle.close
+       end
+
+     end
+
+     # #
+     # # Distributed streaming from input to output
+     # #
+     #
+     # #
+     # # Given an array of input dirs, stream all into output dir and remove duplicate records.
+     # # Reasonable default hadoop streaming options are chosen.
+     # #
+     # def self.merge inputs, output, options = {}
+     #   options[:reduce_tasks]     ||= 25
+     #   options[:partition_fields] ||= 2
+     #   options[:sort_fields]      ||= 2
+     #   options[:field_separator]  ||= '/t'
+     #   names = inputs.map{|inp| File.basename(inp)}.join(',')
+     #   cmd = "${HADOOP_HOME}/bin/hadoop \\
+     #    jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \\
+     #    -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+     #    -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+     #    -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+     #    -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+     #    -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+     #    -D mapred.min.split.size=1000000000 \\
+     #    -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+     #    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+     #    -mapper \"/bin/cat\" \\
+     #    -reducer \"/usr/bin/uniq\" \\
+     #    -input \"#{inputs.join(',')}\" \\
+     #    -output \"#{output}\""
+     #   puts cmd
+     #   system cmd
+     # end
+     #
+     # #
+     # # Concatenates a hadoop dir or file into a local file
+     # #
+     # def self.cat_to_local src, dest
+     #   system %Q{hadoop fs -cat #{src}/[^_]* > #{dest}} unless File.exist?(dest)
+     # end
+     #
+
+     #
+     # Check that we are running with JRuby and locate the hadoop install.
+     # hadoop_home is set to the HADOOP_HOME environment variable when it is
+     # defined, to '/usr/local/share/hadoop' otherwise, and to '/usr/lib/hadoop'
+     # if '/usr/local/share/hadoop' doesn't exist. If all else fails, inform the
+     # user that HADOOP_HOME really should be set.
+     #
+     def check_env
+       begin
+         require 'java'
+       rescue LoadError => e
+         raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message
+       end
+       @hadoop_home = (ENV['HADOOP_HOME'] || '/usr/local/share/hadoop')
+       @hadoop_home = '/usr/lib/hadoop' unless File.exist? @hadoop_home
+       raise "\nHadoop installation not found, try setting HADOOP_HOME\n" unless File.exist? @hadoop_home
+     end
+
+     #
+     # Place hadoop jars in class path, require appropriate jars, set hadoop conf
+     #
+     def set_env
+       require 'java'
+       @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
+       @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
+       $CLASSPATH << @hadoop_conf
+       Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}
+
+       java_import 'org.apache.hadoop.conf.Configuration'
+       java_import 'org.apache.hadoop.fs.Path'
+       java_import 'org.apache.hadoop.fs.FileSystem'
+       java_import 'org.apache.hadoop.fs.FileUtil'
+       java_import 'org.apache.hadoop.mapreduce.lib.input.FileInputFormat'
+       java_import 'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat'
+       java_import 'org.apache.hadoop.fs.FSDataOutputStream'
+       java_import 'org.apache.hadoop.fs.FSDataInputStream'
+
+     end
+
+   end
+
+ end
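
The class above only works under JRuby with a local Hadoop install on the classpath. A minimal usage sketch (paths are illustrative, and it assumes require 'swineherd' loads Swineherd::BaseFileSystem together with this class):

  require 'swineherd'   # run under JRuby with HADOOP_HOME set

  fs = Swineherd::HadoopFileSystem.new

  # write a file, then read it back
  fs.open("/tmp/swineherd_example.txt", "w") { |f| f.puts "hello hdfs" }
  puts fs.type("/tmp/swineherd_example.txt")        # => "file"
  puts fs.open("/tmp/swineherd_example.txt").read   # => "hello hdfs\n"

  # list a directory, then clean up
  p fs.entries("/tmp")
  fs.rm "/tmp/swineherd_example.txt"
  fs.close
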
@@ -0,0 +1,83 @@
+ require 'fileutils'
+ module Swineherd
+
+   class LocalFileSystem
+
+     include Swineherd::BaseFileSystem
+
+     def initialize *args
+     end
+
+     def open path, mode="r", &blk
+       return LocalFile.new path, mode, &blk
+     end
+
+     def rm path
+       FileUtils.rm_r path
+     end
+
+     def exists? path
+       File.exist?(path)
+     end
+
+     def mv srcpath, dstpath
+       FileUtils.mv(srcpath,dstpath)
+     end
+
+     def cp srcpath, dstpath
+       FileUtils.cp_r(srcpath,dstpath)
+     end
+
+     def mkpath path
+       FileUtils.mkpath path
+     end
+
+     def type path
+       case
+       when File.symlink?(path) then
+         return "symlink"
+       when File.directory?(path) then
+         return "directory"
+       when File.file?(path) then
+         return "file"
+       end
+       "unknown"
+     end
+
+     def entries dirpath
+       return unless (type(dirpath) == "directory")
+       Dir.entries(dirpath)
+     end
+
52
+ class LocalFile
53
+ attr_accessor :path, :scheme, :handle, :mode
54
+
55
+ def initialize path, mode="r", &blk
56
+ @path = path
57
+ @mode = mode
58
+ @handle = File.open(path,mode,&blk)
59
+ end
60
+
61
+ def open path, mode="r", &blk
62
+ initialize(path,mode,&blk)
63
+ end
64
+
65
+ def read
66
+ @handle.read
67
+ end
68
+
69
+ def readline
70
+ @handle.gets
71
+ end
72
+
73
+ def write string
74
+ @handle.write(string)
75
+ end
76
+
77
+ def close
78
+ @handle.close
79
+ end
80
+ end
81
+
82
+ end
83
+ end
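
A minimal usage sketch of the local variant above (file names are illustrative; only a standard Ruby with the gem loaded is needed):

  require 'swineherd'

  fs = Swineherd::LocalFileSystem.new
  fs.mkpath "/tmp/swineherd_local"

  file = fs.open("/tmp/swineherd_local/example.txt", "w")
  file.write "hello local fs\n"
  file.close

  puts fs.type("/tmp/swineherd_local/example.txt")   # => "file"
  puts fs.open("/tmp/swineherd_local/example.txt").read
  fs.rm "/tmp/swineherd_local"
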
@@ -0,0 +1,11 @@
+ module Swineherd
+   class LocalFS
+     #
+     # Returns true when none of the given paths exist yet (safe to run),
+     # false when they all exist, and raises when only some of them exist.
+     #
+     def self.check_paths paths
+       exist_count = 0 # number of paths that already exist
+       paths.each{|path| exist_count += 1 if File.exist?(path) }
+       raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
+       return true if exist_count == 0
+       false
+     end
+   end
+ end
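
A sketch of how this guard might be used (the paths and the run_job method are hypothetical): it answers "should this step run?" and refuses to guess when only some of the expected outputs are present.

  outputs = ["/tmp/job/part-00000", "/tmp/job/part-00001"]

  if Swineherd::LocalFS.check_paths(outputs)
    run_job(outputs)                  # hypothetical: none of the outputs exist, safe to run
  else
    puts "outputs present, skipping"  # all outputs already exist
  end
  # if only some outputs exist, check_paths raises "Indeterminate output state"
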
@@ -0,0 +1,249 @@
+ require 'tempfile'
+ require 'uri'
+ module Swineherd
+
+   #
+   # Methods for interacting with Amazon's Simple Storage Service (S3).
+   #
+   class S3FileSystem
+
+     include Swineherd::BaseFileSystem
+
+     attr_accessor :s3
+
+     #
+     # Initialize a new s3 file system; needs an AWS access key id and secret
+     # access key.
+     #
+     def initialize aws_access_key_id, aws_secret_access_key
+       require 'right_aws'
+       @s3 = RightAws::S3.new(aws_access_key_id, aws_secret_access_key)
+     end
+
21
+ def open path, mode="r", &blk
22
+ S3File.new(path,mode,self,&blk)
23
+ end
24
+
25
+ def rm path
26
+ bkt = bucket(path)
27
+ key = key_path(path)
28
+ if key.empty? # only the bucket was passed in, delete it
29
+ @s3.interface.force_delete_bucket(bkt)
30
+ else
31
+ case type(path)
32
+ when "directory" then
33
+ keys_to_delete = lr(path)
34
+ keys_to_delete.each do |k|
35
+ key_to_delete = key_path(k)
36
+ @s3.interface.delete(bkt, key_to_delete)
37
+ end
38
+ keys_to_delete
39
+ when "file" then
40
+ @s3.interface.delete(bkt, key)
41
+ [path]
42
+ end
43
+ end
44
+ end
45
+
46
+ def bucket path
47
+ uri = URI.parse(path)
48
+ uri.path.split('/').reject{|x| x.empty?}.first
49
+ end
50
+
51
+ def key_path path
52
+ uri = URI.parse(path)
53
+ File.join(uri.path.split('/').reject{|x| x.empty?}[1..-1])
54
+ end
55
+
56
+ def needs_trailing_slash pre
57
+ has_trailing_slash = pre.end_with? '/'
58
+ is_empty_prefix = pre.empty?
59
+ !(has_trailing_slash || is_empty_prefix)
60
+ end
61
+
62
+ def full_contents path
63
+ bkt = bucket(path)
64
+ pre = key_path(path)
65
+ pre += '/' if needs_trailing_slash(pre)
66
+ contents = []
67
+ s3.interface.incrementally_list_bucket(bkt, {'prefix' => pre, 'delimiter' => '/'}) do |res|
68
+ contents += res[:common_prefixes].map{|c| File.join(bkt,c)}
69
+ contents += res[:contents].map{|c| File.join(bkt, c[:key])}
70
+ end
71
+ contents
72
+ end
73
+
74
+ def exists? path
75
+ object = File.basename(path)
76
+ search_dir = File.dirname(path)
77
+ case search_dir
78
+ when '.' then # only a bucket was passed in
79
+ begin
80
+ (full_contents(object).size > 0)
81
+ rescue RightAws::AwsError => e
82
+ if e.message =~ /nosuchbucket/i
83
+ false
84
+ else
85
+ raise e
86
+ end
87
+ end
88
+ else
89
+ search_dir_contents = full_contents(search_dir).map{|c| File.basename(c).gsub(/\//, '')}
90
+ search_dir_contents.include?(object)
91
+ end
92
+ end
+
+     def mv srcpath, dstpath
+       src_bucket   = bucket(srcpath)
+       dst_bucket   = bucket(dstpath)
+       dst_key_path = key_path(dstpath)
+       mkpath(dstpath)
+       case type(srcpath)
+       when "directory" then
+         paths_to_copy = lr(srcpath)
+         common_dir    = common_directory(paths_to_copy)
+         paths_to_copy.each do |path|
+           src_key = key_path(path)
+           dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
+           @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key)
+         end
+       when "file" then
+         @s3.interface.move(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
+       end
+     end
+
+     def cp srcpath, dstpath
+       src_bucket   = bucket(srcpath)
+       dst_bucket   = bucket(dstpath)
+       dst_key_path = key_path(dstpath)
+       mkpath(dstpath)
+       case type(srcpath)
+       when "directory" then
+         paths_to_copy = lr(srcpath)
+         common_dir    = common_directory(paths_to_copy)
+         paths_to_copy.each do |path|
+           src_key = key_path(path)
+           dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
+           @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key)
+         end
+       when "file" then
+         @s3.interface.copy(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
+       end
+     end
+
+     #
+     # This is a bit funny, there's actually no need to create a 'path' since
+     # s3 is nothing more than a glorified key-value store. When you create a
+     # 'file' (key) the 'path' will be created for you. All we do here is create
+     # the bucket unless it already exists.
+     #
+     def mkpath path
+       bkt = bucket(path)
+       key = key_path(path)
+       if key.empty?
+         @s3.interface.create_bucket(bkt)
+       else
+         @s3.interface.create_bucket(bkt) unless exists? bkt
+       end
+       path
+     end
+
+     def type path
+       return "unknown" unless exists? path
+       return "directory" if full_contents(path).size > 0
+       "file"
+     end
+
+     def entries dirpath
+       return unless type(dirpath) == "directory"
+       full_contents(dirpath)
+     end
+
+     # Recursively list paths
+     def lr path
+       paths = entries(path)
+       if paths
+         paths.map{|e| lr(e)}.flatten
+       else
+         path
+       end
+     end
+
+     #
+     # Ick.
+     #
+     def common_directory paths
+       dirs     = paths.map{|path| path.split('/')}
+       min_size = dirs.map{|splits| splits.size}.min
+       dirs.map!{|splits| splits[0...min_size]}
+       uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
+       dirs[0][0...uncommon_idx].join('/')
+     end
+
+     def close *args
+     end
+
+     class S3File
+       attr_accessor :path, :handle, :fs
+
+       #
+       # In order to open input and output streams we must pass around the s3
+       # fs object itself
+       #
+       def initialize path, mode, fs, &blk
+         @fs   = fs
+         @path = path
+         case mode
+         when "r" then
+           raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file"
+         when "w" then
+           raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown")
+           @handle = Tempfile.new('s3filestream')
+           if block_given?
+             yield self
+             close
+           end
+         end
+       end
+
+       #
+       # Faster than iterating
+       #
+       def read
+         resp = fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path))
+         resp
+       end
+
+       #
+       # This is a little hackety. That is, once you call (.each) on the object
+       # the full object starts downloading...
+       #
+       def readline
+         @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+         begin
+           @handle.next
+         rescue StopIteration, NoMethodError
+           @handle = nil
+           raise EOFError.new("end of file reached")
+         end
+       end
+
+       def write string
+         @handle.write(string)
+       end
+
+       def puts string
+         write(string+"\n")
+       end
+
+       def close
+         if @handle
+           @handle.read # switch the tempfile from writing to reading, flushing buffered writes
+           fs.s3.interface.put(fs.bucket(path), fs.key_path(path), File.open(@handle.path, 'r'))
+           @handle.close
+         end
+         @handle = nil
+       end
+
+     end
+
+   end
+
+ end
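
A minimal usage sketch for the S3 variant (the bucket name is a placeholder, credentials come from the environment, and the right_aws gem must be installed). Note that paths are plain "bucket/key" strings rather than s3:// URLs, since bucket() takes the first path segment as the bucket name:

  require 'swineherd'

  fs = Swineherd::S3FileSystem.new(ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])

  fs.mkpath "example-bucket"    # creates the bucket unless it already exists
  fs.open("example-bucket/tmp/example.txt", "w") { |f| f.puts "hello s3" }

  puts fs.exists?("example-bucket/tmp/example.txt")     # => true
  puts fs.open("example-bucket/tmp/example.txt").read   # => "hello s3\n"
  fs.rm "example-bucket/tmp/example.txt"
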