swineherd 0.0.2 → 0.0.4
- data/VERSION +1 -1
- data/lib/swineherd/filesystem/hadoopfilesystem.rb +33 -6
- data/lib/swineherd/filesystem/s3filesystem.rb +22 -2
- data/swineherd.gemspec +4 -8
- metadata +8 -10
- data/lib/swineherd/foo +0 -1
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.2
+0.0.4
data/lib/swineherd/filesystem/hadoopfilesystem.rb
CHANGED
@@ -116,19 +116,46 @@ module Swineherd
     end

     #
-    # BZIP
+    # BZIP
     #
     def bzip input, output
       system("#{@hadoop_home}/bin/hadoop \\
-        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
         -D mapred.output.compress=true \\
         -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
         -D mapred.reduce.tasks=1 \\
         -mapper \"/bin/cat\" \\
-        -reducer
+        -reducer \"/bin/cat\" \\
         -input \"#{input}\" \\
         -output \"#{output}\"")
-    end
+    end
+
+    #
+    # Merges many input files into :reduce_tasks amount of output files
+    #
+    def dist_merge inputs, output, options = {}
+      options[:reduce_tasks] ||= 25
+      options[:partition_fields] ||= 2
+      options[:sort_fields] ||= 2
+      options[:field_separator] ||= '/t'
+      names = inputs.map{|inp| File.basename(inp)}.join(',')
+      cmd = "#{@hadoop_home}/bin/hadoop \\
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+        -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+        -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+        -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+        -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+        -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+        -D mapred.min.split.size=1000000000 \\
+        -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+        -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+        -mapper \"/bin/cat\" \\
+        -reducer \"/usr/bin/uniq\" \\
+        -input \"#{inputs.join(',')}\" \\
+        -output \"#{output}\""
+      puts cmd
+      system cmd
+    end

     #
     # Copy hdfs file to local filesystem
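A minimal usage sketch of the new dist_merge (and the repaired bzip invocation): the filesystem constructor arguments, input paths, and reducer count below are invented for illustration, not part of this release.

    require 'swineherd'

    fs = Swineherd::HadoopFileSystem.new

    # Merge many part files into 10 outputs, partitioning and sorting on the
    # first two fields (defaults: 25 reducers, 2 partition/sort fields).
    fs.dist_merge ['/data/logs/part-00000', '/data/logs/part-00001'],
                  '/data/logs_merged', :reduce_tasks => 10

    # Single-reducer BZip2 compression via Hadoop streaming.
    fs.bzip '/data/logs_merged', '/data/logs_merged_bz2'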
@@ -138,7 +165,7 @@ module Swineherd
     end

     #
-    #
+    # Copy local file to hdfs filesystem
     #
     def copy_from_local srcfile, dstfile
       @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
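The hunk above only restores a missing doc comment; for reference, a hedged sketch of the call it documents (paths invented):

    fs.copy_from_local '/tmp/users.tsv', '/data/users.tsv'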
@@ -259,7 +286,7 @@ module Swineherd
       require 'java'
       @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
       @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
-      $CLASSPATH << @hadoop_conf
+      $CLASSPATH << @hadoop_conf
       Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}

       java_import 'org.apache.hadoop.conf.Configuration'
data/lib/swineherd/filesystem/s3filesystem.rb
CHANGED
@@ -147,7 +147,17 @@ module Swineherd
       end
     end

-    #
+    # right now this only works on single files
+    def copy_to_local srcpath, dstpath
+      src_bucket   = bucket(srcpath)
+      src_key_path = key_path(srcpath)
+      dstfile = File.new(dstpath, 'w')
+      @s3.interface.get(src_bucket, src_key_path) do |chunk|
+        dstfile.write(chunk)
+      end
+      dstfile.close
+    end
+
     # This is a bit funny, there's actually no need to create a 'path' since
     # s3 is nothing more than a glorified key-value store. When you create a
     # 'file' (key) the 'path' will be created for you. All we do here is create
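A hedged sketch of the new copy_to_local, which per its own comment handles single files only; the constructor arguments, bucket, and paths are assumptions:

    # Hypothetical: credentials and paths invented for illustration.
    s3fs = Swineherd::S3FileSystem.new
    s3fs.copy_to_local 's3://example-bucket/logs/2011-06-21.tsv', '/tmp/2011-06-21.tsv'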
@@ -195,6 +205,16 @@ module Swineherd
       uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
       dirs[0][0...uncommon_idx].join('/')
     end
+
+    def put srcpath, destpath
+      dest_bucket = bucket(destpath)
+      if File.directory? srcpath
+        # handle Dir later
+      else
+        key = srcpath
+      end
+      @s3.interface.put(dest_bucket, key, File.open(srcpath))
+    end

     def close *args
     end
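A hedged sketch of the new put; the bucket and file are invented. Note that as written the object key is the full local srcpath, and directory sources are stubbed out ("handle Dir later"):

    s3fs.put '/tmp/report.tsv', 's3://example-bucket'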
@@ -234,7 +254,7 @@ module Swineherd
     # downloading...
     #
     def readline
-      @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+      @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
       begin
         @handle.next
       rescue StopIteration, NoMethodError
data/swineherd.gemspec
CHANGED
@@ -5,11 +5,11 @@

 Gem::Specification.new do |s|
   s.name = %q{swineherd}
-  s.version = "0.0.2"
+  s.version = "0.0.4"

   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jacob Perkins"]
-  s.date = %q{2011-06-
+  s.date = %q{2011-06-22}
   s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
   s.email = %q{jacob.a.perkins@gmail.com}
   s.executables = ["hdp-tree", "hadoop-stream"]
@@ -39,7 +39,6 @@ Gem::Specification.new do |s|
     "lib/swineherd/filesystem/localfilesystem.rb",
     "lib/swineherd/filesystem/localfs.rb",
     "lib/swineherd/filesystem/s3filesystem.rb",
-    "lib/swineherd/foo",
     "lib/swineherd/script.rb",
     "lib/swineherd/script/hadoop_script.rb",
     "lib/swineherd/script/pig_script.rb",
@@ -57,14 +56,11 @@ Gem::Specification.new do |s|
   s.homepage = %q{http://github.com/Ganglion/swineherd}
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.
+  s.rubygems_version = %q{1.3.7}
   s.summary = %q{Flexible data workflow glue.}
-  s.test_files = [
-    "examples/pagerank/pagerank.rb",
-    "examples/pagerank/scripts/cut_off_list.rb"
-  ]

   if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
     s.specification_version = 3

     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: swineherd
 version: !ruby/object:Gem::Version
-  hash:
-  prerelease:
+  hash: 23
+  prerelease: false
   segments:
   - 0
   - 0
-  - 2
-  version: 0.0.2
+  - 4
+  version: 0.0.4
 platform: ruby
 authors:
 - Jacob Perkins
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2011-06-
+date: 2011-06-22 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -152,7 +152,6 @@ files:
 - lib/swineherd/filesystem/localfilesystem.rb
 - lib/swineherd/filesystem/localfs.rb
 - lib/swineherd/filesystem/s3filesystem.rb
-- lib/swineherd/foo
 - lib/swineherd/script.rb
 - lib/swineherd/script/hadoop_script.rb
 - lib/swineherd/script/pig_script.rb
@@ -196,10 +195,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []

 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
 summary: Flexible data workflow glue.
-test_files:
-- examples/pagerank/pagerank.rb
-- examples/pagerank/scripts/cut_off_list.rb
+test_files: []
+
data/lib/swineherd/foo
DELETED
@@ -1 +0,0 @@
-@('_')@