swineherd 0.0.1

@@ -0,0 +1,263 @@
+ module Swineherd
+
+   #
+   # Methods for dealing with the Hadoop distributed file system (HDFS). This class
+   # requires that you run under JRuby as it makes use of the native Java Hadoop
+   # libraries.
+   #
+   class HadoopFileSystem
+
+     include Swineherd::BaseFileSystem
+
+     attr_accessor :conf, :hdfs
+
+     #
+     # Initialize a new Hadoop file system. Requires a sane Hadoop environment
+     # (see check_and_set_environment).
+     #
+     def initialize *args
+       check_and_set_environment
+       @conf = Java::org.apache.hadoop.conf.Configuration.new
+       @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf)
+     end
+
+     #
+     # Make sure the environment is sane, then set it up for use
+     #
+     def check_and_set_environment
+       check_env
+       set_env
+     end
+
+     def open path, mode="r", &blk
+       HadoopFile.new(path, mode, self, &blk)
+     end
+
+     def rm path
+       @hdfs.delete(Path.new(path), true)
+       [path]
+     end
+
+     def exists? path
+       @hdfs.exists(Path.new(path))
+     end
+
+     def mv srcpath, dstpath
+       @hdfs.rename(Path.new(srcpath), Path.new(dstpath))
+     end
+
+     def cp srcpath, dstpath
+       FileUtil.copy(@hdfs, Path.new(srcpath), @hdfs, Path.new(dstpath), false, @conf)
+     end
+
+     def mkpath path
+       @hdfs.mkdirs(Path.new(path))
+       path
+     end
+
+     def type path
+       return "unknown" unless exists? path
+       status = @hdfs.get_file_status(Path.new(path))
+       return "directory" if status.is_dir?
+       "file"
+       # case
+       # when status.isFile then
+       #   return "file"
+       # when status.is_directory? then
+       #   return "directory"
+       # when status.is_symlink? then
+       #   return "symlink"
+       # end
+     end
+
+     def entries dirpath
+       return unless type(dirpath) == "directory"
+       list = @hdfs.list_status(Path.new(dirpath))
+       list.map{|path| path.get_path.to_s} rescue []
+     end
+
+     #
+     # Merge all part files in a directory into one file.
+     #
+     def merge srcdir, dstfile
+       FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "")
+     end
+
+     #
+     # This is hackety. Use with caution.
+     #
+     def stream input, output
+       require 'uri'
+       input_fs_scheme  = URI.parse(input).scheme
+       output_fs_scheme = URI.parse(output).scheme
+       system("#{@hadoop_home}/bin/hadoop \\
+         jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+         -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\
+         -D mapred.min.split.size=1000000000 \\
+         -D mapred.reduce.tasks=0 \\
+         -mapper  \"/bin/cat\" \\
+         -input   \"#{input}\" \\
+         -output  \"#{output}\"")
+     end
+
+     #
+     # BZIP
+     #
+     def bzip input, output
+       system("#{@hadoop_home}/bin/hadoop \\
+         jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+         -D mapred.output.compress=true \\
+         -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
+         -D mapred.reduce.tasks=1 \\
+         -mapper  \"/bin/cat\" \\
+         -reducer \"/bin/cat\" \\
+         -input   \"#{input}\" \\
+         -output  \"#{output}\"")
+     end
+
+     #
+     # Copy an HDFS file to the local filesystem
+     #
+     def copy_to_local srcfile, dstfile
+       @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile))
+     end
+
+     #
+     # Copy a local file to HDFS
+     #
+     def copy_from_local srcfile, dstfile
+       @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
+     end
+
+     def close *args
+       @hdfs.close
+     end
+
+     class HadoopFile
+       attr_accessor :path, :handle, :hdfs
+
+       #
+       # In order to open input and output streams we must pass around the Hadoop fs object itself
+       #
+       def initialize path, mode, fs, &blk
+         @fs   = fs
+         @path = Path.new(path)
+         case mode
+         when "r" then
+           raise "#{@fs.type(path)} is not a readable file - #{path}" unless @fs.type(path) == "file"
+           @handle = @fs.hdfs.open(@path).to_io(&blk)
+         when "w" then
+           # Open path for writing
+           raise "Path #{path} is a directory." unless (@fs.type(path) == "file") || (@fs.type(path) == "unknown")
+           @handle = @fs.hdfs.create(@path).to_io.to_outputstream
+           if block_given?
+             yield self
+             self.close # muy muy importante
+           end
+         end
+       end
+
+       def read
+         @handle.read
+       end
+
+       def readline
+         @handle.readline
+       end
+
+       def write string
+         @handle.write(string.to_java_string.get_bytes)
+       end
+
+       def puts string
+         write(string + "\n")
+       end
+
+       def close
+         @handle.close
+       end
+
+     end
+
+     # #
+     # # Distributed streaming from input to output
+     # #
+     #
+     # #
+     # # Given an array of input dirs, stream all into the output dir and remove duplicate records.
+     # # Reasonable default hadoop streaming options are chosen.
+     # #
+     # def self.merge inputs, output, options = {}
+     #   options[:reduce_tasks]     ||= 25
+     #   options[:partition_fields] ||= 2
+     #   options[:sort_fields]      ||= 2
+     #   options[:field_separator]  ||= '/t'
+     #   names = inputs.map{|inp| File.basename(inp)}.join(',')
+     #   cmd = "${HADOOP_HOME}/bin/hadoop \\
+     #     jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \\
+     #     -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+     #     -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+     #     -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+     #     -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+     #     -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+     #     -D mapred.min.split.size=1000000000 \\
+     #     -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+     #     -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+     #     -mapper  \"/bin/cat\" \\
+     #     -reducer \"/usr/bin/uniq\" \\
+     #     -input   \"#{inputs.join(',')}\" \\
+     #     -output  \"#{output}\""
+     #   puts cmd
+     #   system cmd
+     # end
+     #
+     # #
+     # # Concatenates a hadoop dir or file into a local file
+     # #
+     # def self.cat_to_local src, dest
+     #   system %Q{hadoop fs -cat #{src}/[^_]* > #{dest}} unless File.exist?(dest)
+     # end
+     #
+
+     #
+     # Check that we are running under JRuby and locate the Hadoop home directory.
+     # @hadoop_home is set to the HADOOP_HOME environment variable if it's set,
+     # to '/usr/local/share/hadoop' if HADOOP_HOME isn't defined, and to
+     # '/usr/lib/hadoop' if '/usr/local/share/hadoop' doesn't exist. If all else
+     # fails, inform the user that HADOOP_HOME really should be set.
+     #
+     def check_env
+       begin
+         require 'java'
+       rescue LoadError => e
+         raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message
+       end
+       @hadoop_home = (ENV['HADOOP_HOME'] || '/usr/local/share/hadoop')
+       @hadoop_home = '/usr/lib/hadoop' unless File.exist? @hadoop_home
+       raise "\nHadoop installation not found, try setting HADOOP_HOME\n" unless File.exist? @hadoop_home
+     end
+
+     #
+     # Place the Hadoop jars on the classpath, require them, and add the Hadoop conf dir
+     #
+     def set_env
+       require 'java'
+       @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
+       @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
+       $CLASSPATH << @hadoop_conf
+       Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}
+
+       java_import 'org.apache.hadoop.conf.Configuration'
+       java_import 'org.apache.hadoop.fs.Path'
+       java_import 'org.apache.hadoop.fs.FileSystem'
+       java_import 'org.apache.hadoop.fs.FileUtil'
+       java_import 'org.apache.hadoop.mapreduce.lib.input.FileInputFormat'
+       java_import 'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat'
+       java_import 'org.apache.hadoop.fs.FSDataOutputStream'
+       java_import 'org.apache.hadoop.fs.FSDataInputStream'
+
+     end
+
+   end
+
+ end
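
For orientation, here is a minimal usage sketch of the class above. It is not part of the gem source: it assumes a JRuby process with HADOOP_HOME pointing at a working Hadoop install, that `require 'swineherd'` is the gem's entry point, and that the example path is writable on your cluster.

# Usage sketch (assumptions noted above; paths are illustrative).
require 'swineherd'   # assumed entry point for the gem

hdfs = Swineherd::HadoopFileSystem.new

# Writing: the block form closes the output stream for us.
hdfs.open("/tmp/swineherd_example.txt", "w") do |f|
  f.puts "hello from swineherd"
end

# Reading and simple metadata queries.
puts hdfs.open("/tmp/swineherd_example.txt").read
puts hdfs.type("/tmp/swineherd_example.txt")   # => "file"
puts hdfs.entries("/tmp").inspect              # full hdfs paths of the directory's children

hdfs.close
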
@@ -0,0 +1,83 @@
+ require 'fileutils'
+ module Swineherd
+
+   class LocalFileSystem
+
+     include Swineherd::BaseFileSystem
+
+     def initialize *args
+     end
+
+     def open path, mode="r", &blk
+       return LocalFile.new path, mode, &blk
+     end
+
+     def rm path
+       FileUtils.rm_r path
+     end
+
+     def exists? path
+       File.exists?(path)
+     end
+
+     def mv srcpath, dstpath
+       FileUtils.mv(srcpath, dstpath)
+     end
+
+     def cp srcpath, dstpath
+       FileUtils.cp_r(srcpath, dstpath)
+     end
+
+     def mkpath path
+       FileUtils.mkpath path
+     end
+
+     def type path
+       case
+       when File.symlink?(path) then
+         return "symlink"
+       when File.directory?(path) then
+         return "directory"
+       when File.file?(path) then
+         return "file"
+       end
+       "unknown"
+     end
+
+     def entries dirpath
+       return unless (type(dirpath) == "directory")
+       Dir.entries(dirpath)
+     end
+
+     class LocalFile
+       attr_accessor :path, :scheme, :handle, :mode
+
+       def initialize path, mode="r", &blk
+         @path   = path
+         @mode   = mode
+         @handle = File.open(path, mode, &blk)
+       end
+
+       def open path, mode="r", &blk
+         initialize(path, mode, &blk)
+       end
+
+       def read
+         @handle.read
+       end
+
+       def readline
+         @handle.gets
+       end
+
+       def write string
+         @handle.write(string)
+       end
+
+       def close
+         @handle.close
+       end
+     end
+
+   end
+ end
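
LocalFileSystem mirrors the same BaseFileSystem interface on the local disk, which makes it an easy stand-in for tests. A small sketch using only the Ruby standard library (directory and file names are illustrative):

require 'tmpdir'

localfs = Swineherd::LocalFileSystem.new

Dir.mktmpdir do |dir|
  path = File.join(dir, "example.txt")

  file = localfs.open(path, "w")     # LocalFile wraps a plain File handle
  file.write("one\n")
  file.write("two\n")
  file.close

  localfs.cp(path, File.join(dir, "copy.txt"))
  puts localfs.type(path)            # => "file"
  puts localfs.entries(dir).inspect  # Dir.entries, so includes "." and ".."
  puts localfs.open(path).read       # => "one\ntwo\n"
end
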
@@ -0,0 +1,11 @@
+ module Swineherd
+   class LocalFS
+     def self.check_paths paths
+       exist_count = 0 # number of paths that already exist
+       paths.each{|path| exist_count += 1 if File.exist?(path) }
+       raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
+       return true if exist_count == 0
+       false
+     end
+   end
+ end
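
check_paths is an all-or-nothing guard for job outputs: it returns true when none of the given paths exist yet, false when they all exist, and raises when only some exist, which usually means an earlier run died partway. A brief, hypothetical sketch of a caller:

outputs = ["/tmp/job_output/part-00000", "/tmp/job_output/part-00001"]  # illustrative paths

if Swineherd::LocalFS.check_paths(outputs)
  puts "no outputs yet, running the job"   # launch the job here
else
  puts "all outputs present, skipping"
end
# If only some of the outputs exist, check_paths raises
# "Indeterminate output state" so you can clean up before rerunning.
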
@@ -0,0 +1,249 @@
+ require 'tempfile'
+ require 'uri'
+ module Swineherd
+
+   #
+   # Methods for interacting with Amazon's Simple Storage Service (S3).
+   #
+   class S3FileSystem
+
+     include Swineherd::BaseFileSystem
+
+     attr_accessor :s3
+
+     #
+     # Initialize a new S3 file system; requires AWS credentials
+     #
+     def initialize aws_access_key_id, aws_secret_access_key
+       require 'right_aws'
+       @s3 = RightAws::S3.new(aws_access_key_id, aws_secret_access_key)
+     end
+
+     def open path, mode="r", &blk
+       S3File.new(path, mode, self, &blk)
+     end
+
+     def rm path
+       bkt = bucket(path)
+       key = key_path(path)
+       if key.empty? # only the bucket was passed in, delete it
+         @s3.interface.force_delete_bucket(bkt)
+       else
+         case type(path)
+         when "directory" then
+           keys_to_delete = lr(path)
+           keys_to_delete.each do |k|
+             key_to_delete = key_path(k)
+             @s3.interface.delete(bkt, key_to_delete)
+           end
+           keys_to_delete
+         when "file" then
+           @s3.interface.delete(bkt, key)
+           [path]
+         end
+       end
+     end
+
+     def bucket path
+       uri = URI.parse(path)
+       uri.path.split('/').reject{|x| x.empty?}.first
+     end
+
+     def key_path path
+       uri = URI.parse(path)
+       File.join(uri.path.split('/').reject{|x| x.empty?}[1..-1])
+     end
+
+     def needs_trailing_slash pre
+       has_trailing_slash = pre.end_with? '/'
+       is_empty_prefix    = pre.empty?
+       !(has_trailing_slash || is_empty_prefix)
+     end
+
+     def full_contents path
+       bkt = bucket(path)
+       pre = key_path(path)
+       pre += '/' if needs_trailing_slash(pre)
+       contents = []
+       s3.interface.incrementally_list_bucket(bkt, {'prefix' => pre, 'delimiter' => '/'}) do |res|
+         contents += res[:common_prefixes].map{|c| File.join(bkt, c)}
+         contents += res[:contents].map{|c| File.join(bkt, c[:key])}
+       end
+       contents
+     end
+
+     def exists? path
+       object     = File.basename(path)
+       search_dir = File.dirname(path)
+       case search_dir
+       when '.' then # only a bucket was passed in
+         begin
+           (full_contents(object).size > 0)
+         rescue RightAws::AwsError => e
+           if e.message =~ /nosuchbucket/i
+             false
+           else
+             raise e
+           end
+         end
+       else
+         search_dir_contents = full_contents(search_dir).map{|c| File.basename(c).gsub(/\//, '')}
+         search_dir_contents.include?(object)
+       end
+     end
+
+     def mv srcpath, dstpath
+       src_bucket   = bucket(srcpath)
+       dst_bucket   = bucket(dstpath)
+       dst_key_path = key_path(dstpath)
+       mkpath(dstpath)
+       case type(srcpath)
+       when "directory" then
+         paths_to_copy = lr(srcpath)
+         common_dir    = common_directory(paths_to_copy)
+         paths_to_copy.each do |path|
+           src_key = key_path(path)
+           dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
+           @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key)
+         end
+       when "file" then
+         @s3.interface.move(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
+       end
+     end
+
+     def cp srcpath, dstpath
+       src_bucket   = bucket(srcpath)
+       dst_bucket   = bucket(dstpath)
+       dst_key_path = key_path(dstpath)
+       mkpath(dstpath)
+       case type(srcpath)
+       when "directory" then
+         paths_to_copy = lr(srcpath)
+         common_dir    = common_directory(paths_to_copy)
+         paths_to_copy.each do |path|
+           src_key = key_path(path)
+           dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
+           @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key)
+         end
+       when "file" then
+         @s3.interface.copy(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
+       end
+     end
+
+     #
+     # This is a bit funny: there's actually no need to create a 'path', since
+     # S3 is nothing more than a glorified key-value store. When you create a
+     # 'file' (key) the 'path' is created for you. All we do here is create
+     # the bucket unless it already exists.
+     #
+     def mkpath path
+       bkt = bucket(path)
+       key = key_path(path)
+       if key.empty?
+         @s3.interface.create_bucket(bkt)
+       else
+         @s3.interface.create_bucket(bkt) unless exists? bkt
+       end
+       path
+     end
+
+     def type path
+       return "unknown" unless exists? path
+       return "directory" if full_contents(path).size > 0
+       "file"
+     end
+
+     def entries dirpath
+       return unless type(dirpath) == "directory"
+       full_contents(dirpath)
+     end
+
+     # Recursively list paths
+     def lr path
+       paths = entries(path)
+       if paths
+         paths.map{|e| lr(e)}.flatten
+       else
+         path
+       end
+     end
+
+     #
+     # Ick.
+     #
+     def common_directory paths
+       dirs     = paths.map{|path| path.split('/')}
+       min_size = dirs.map{|splits| splits.size}.min
+       dirs.map!{|splits| splits[0...min_size]}
+       uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
+       dirs[0][0...uncommon_idx].join('/')
+     end
+
+     def close *args
+     end
+
+     class S3File
+       attr_accessor :path, :handle, :fs
+
+       #
+       # In order to open input and output streams we must pass around the S3 fs object itself
+       #
+       def initialize path, mode, fs, &blk
+         @fs   = fs
+         @path = path
+         case mode
+         when "r" then
+           raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file"
+         when "w" then
+           raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown")
+           @handle = Tempfile.new('s3filestream')
+           if block_given?
+             yield self
+             close
+           end
+         end
+       end
+
+       #
+       # Faster than iterating
+       #
+       def read
+         resp = fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path))
+         resp
+       end
+
+       #
+       # This is a little hackety. That is, once you call (.each) on the object,
+       # the full object starts downloading...
+       #
+       def readline
+         @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+         begin
+           @handle.next
+         rescue StopIteration, NoMethodError
+           @handle = nil
+           raise EOFError.new("end of file reached")
+         end
+       end
+
+       def write string
+         @handle.write(string)
+       end
+
+       def puts string
+         write(string + "\n")
+       end
+
+       def close
+         if @handle
+           @handle.read
+           fs.s3.interface.put(fs.bucket(path), fs.key_path(path), File.open(@handle.path, 'r'))
+           @handle.close
+         end
+         @handle = nil
+       end
+
+     end
+
+   end
+
+ end
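
Finally, a usage sketch for the S3 filesystem. It assumes the right_aws gem is installed, that valid credentials are present in the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables, and that the bucket name below is one you control (it is purely illustrative). Paths are passed as plain "bucket/key" strings, which is the form bucket() and key_path() expect.

s3fs = Swineherd::S3FileSystem.new(ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])

s3fs.mkpath("example-bucket/greetings")   # creates the bucket if it doesn't exist yet

# Writing buffers into a Tempfile and uploads it when the block closes the handle.
s3fs.open("example-bucket/greetings/hello.txt", "w") do |f|
  f.puts "hello from swineherd"
end

puts s3fs.exists?("example-bucket/greetings/hello.txt")  # => true
puts s3fs.type("example-bucket/greetings")               # => "directory" (a shared key prefix)
puts s3fs.open("example-bucket/greetings/hello.txt").read

s3fs.rm("example-bucket/greetings/hello.txt")
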