swineherd 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/swineherd/filesystem/hadoopfilesystem.rb +33 -6
- data/lib/swineherd/filesystem/s3filesystem.rb +22 -2
- data/swineherd.gemspec +4 -8
- metadata +8 -10
- data/lib/swineherd/foo +0 -1
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.4
|
@@ -116,19 +116,46 @@ module Swineherd
|
|
116
116
|
end
|
117
117
|
|
118
118
|
#
|
119
|
-
# BZIP
|
119
|
+
# BZIP
|
120
120
|
#
|
121
121
|
def bzip input, output
|
122
122
|
system("#{@hadoop_home}/bin/hadoop \\
|
123
|
-
jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar
|
123
|
+
jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
|
124
124
|
-D mapred.output.compress=true \\
|
125
125
|
-D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
|
126
126
|
-D mapred.reduce.tasks=1 \\
|
127
127
|
-mapper \"/bin/cat\" \\
|
128
|
-
-reducer
|
128
|
+
-reducer \"/bin/cat\" \\
|
129
129
|
-input \"#{input}\" \\
|
130
130
|
-output \"#{output}\"")
|
131
|
-
end
|
131
|
+
end
|
132
|
+
|
133
|
+
#
|
134
|
+
# Merges many input files into :reduce_tasks amount of output files
|
135
|
+
#
|
136
|
+
def dist_merge inputs, output, options = {}
|
137
|
+
options[:reduce_tasks] ||= 25
|
138
|
+
options[:partition_fields] ||= 2
|
139
|
+
options[:sort_fields] ||= 2
|
140
|
+
options[:field_separator] ||= '/t'
|
141
|
+
names = inputs.map{|inp| File.basename(inp)}.join(',')
|
142
|
+
cmd = "#{@hadoop_home}/bin/hadoop \\
|
143
|
+
jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
|
144
|
+
-D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
|
145
|
+
-D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
|
146
|
+
-D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
|
147
|
+
-D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
|
148
|
+
-D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
|
149
|
+
-D mapred.min.split.size=1000000000 \\
|
150
|
+
-D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
|
151
|
+
-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
|
152
|
+
-mapper \"/bin/cat\" \\
|
153
|
+
-reducer \"/usr/bin/uniq\" \\
|
154
|
+
-input \"#{inputs.join(',')}\" \\
|
155
|
+
-output \"#{output}\""
|
156
|
+
puts cmd
|
157
|
+
system cmd
|
158
|
+
end
|
132
159
|
|
133
160
|
#
|
134
161
|
# Copy hdfs file to local filesystem
|
@@ -138,7 +165,7 @@ module Swineherd
|
|
138
165
|
end
|
139
166
|
|
140
167
|
#
|
141
|
-
#
|
168
|
+
# Copy local file to hdfs filesystem
|
142
169
|
#
|
143
170
|
def copy_from_local srcfile, dstfile
|
144
171
|
@hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
|
@@ -259,7 +286,7 @@ module Swineherd
|
|
259
286
|
require 'java'
|
260
287
|
@hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
|
261
288
|
@hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
|
262
|
-
$CLASSPATH << @hadoop_conf
|
289
|
+
$CLASSPATH << @hadoop_conf
|
263
290
|
Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}
|
264
291
|
|
265
292
|
java_import 'org.apache.hadoop.conf.Configuration'
|
@@ -147,7 +147,17 @@ module Swineherd
|
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
150
|
-
#
|
150
|
+
# right now this only works on single files
|
151
|
+
def copy_to_local srcpath, dstpath
|
152
|
+
src_bucket = bucket(srcpath)
|
153
|
+
src_key_path = key_path(srcpath)
|
154
|
+
dstfile = File.new(dstpath, 'w')
|
155
|
+
@s3.interface.get(src_bucket, src_key_path) do |chunk|
|
156
|
+
dstfile.write(chunk)
|
157
|
+
end
|
158
|
+
dstfile.close
|
159
|
+
end
|
160
|
+
|
151
161
|
# This is a bit funny, there's actually no need to create a 'path' since
|
152
162
|
# s3 is nothing more than a glorified key-value store. When you create a
|
153
163
|
# 'file' (key) the 'path' will be created for you. All we do here is create
|
@@ -195,6 +205,16 @@ module Swineherd
|
|
195
205
|
uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
|
196
206
|
dirs[0][0...uncommon_idx].join('/')
|
197
207
|
end
|
208
|
+
|
209
|
+
def put srcpath, destpath
|
210
|
+
dest_bucket = bucket(destpath)
|
211
|
+
if File.directory? srcpath
|
212
|
+
# handle Dir later
|
213
|
+
else
|
214
|
+
key = srcpath
|
215
|
+
end
|
216
|
+
@s3.interface.put(dest_bucket, key, File.open(srcpath))
|
217
|
+
end
|
198
218
|
|
199
219
|
def close *args
|
200
220
|
end
|
@@ -234,7 +254,7 @@ module Swineherd
|
|
234
254
|
# downloading...
|
235
255
|
#
|
236
256
|
def readline
|
237
|
-
@handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
|
257
|
+
@handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
|
238
258
|
begin
|
239
259
|
@handle.next
|
240
260
|
rescue StopIteration, NoMethodError
|
data/swineherd.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{swineherd}
|
8
|
-
s.version = "0.0.2"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jacob Perkins"]
|
12
|
-
s.date = %q{2011-06-
|
12
|
+
s.date = %q{2011-06-22}
|
13
13
|
s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
|
14
14
|
s.email = %q{jacob.a.perkins@gmail.com}
|
15
15
|
s.executables = ["hdp-tree", "hadoop-stream"]
|
@@ -39,7 +39,6 @@ Gem::Specification.new do |s|
|
|
39
39
|
"lib/swineherd/filesystem/localfilesystem.rb",
|
40
40
|
"lib/swineherd/filesystem/localfs.rb",
|
41
41
|
"lib/swineherd/filesystem/s3filesystem.rb",
|
42
|
-
"lib/swineherd/foo",
|
43
42
|
"lib/swineherd/script.rb",
|
44
43
|
"lib/swineherd/script/hadoop_script.rb",
|
45
44
|
"lib/swineherd/script/pig_script.rb",
|
@@ -57,14 +56,11 @@ Gem::Specification.new do |s|
|
|
57
56
|
s.homepage = %q{http://github.com/Ganglion/swineherd}
|
58
57
|
s.licenses = ["MIT"]
|
59
58
|
s.require_paths = ["lib"]
|
60
|
-
s.rubygems_version = %q{1.
|
59
|
+
s.rubygems_version = %q{1.3.7}
|
61
60
|
s.summary = %q{Flexible data workflow glue.}
|
62
|
-
s.test_files = [
|
63
|
-
"examples/pagerank/pagerank.rb",
|
64
|
-
"examples/pagerank/scripts/cut_off_list.rb"
|
65
|
-
]
|
66
61
|
|
67
62
|
if s.respond_to? :specification_version then
|
63
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
68
64
|
s.specification_version = 3
|
69
65
|
|
70
66
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: swineherd
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 23
|
5
|
+
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jacob Perkins
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-06-
|
18
|
+
date: 2011-06-22 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -152,7 +152,6 @@ files:
|
|
152
152
|
- lib/swineherd/filesystem/localfilesystem.rb
|
153
153
|
- lib/swineherd/filesystem/localfs.rb
|
154
154
|
- lib/swineherd/filesystem/s3filesystem.rb
|
155
|
-
- lib/swineherd/foo
|
156
155
|
- lib/swineherd/script.rb
|
157
156
|
- lib/swineherd/script/hadoop_script.rb
|
158
157
|
- lib/swineherd/script/pig_script.rb
|
@@ -196,10 +195,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
196
195
|
requirements: []
|
197
196
|
|
198
197
|
rubyforge_project:
|
199
|
-
rubygems_version: 1.
|
198
|
+
rubygems_version: 1.3.7
|
200
199
|
signing_key:
|
201
200
|
specification_version: 3
|
202
201
|
summary: Flexible data workflow glue.
|
203
|
-
test_files:
|
204
|
-
|
205
|
-
- examples/pagerank/scripts/cut_off_list.rb
|
202
|
+
test_files: []
|
203
|
+
|
data/lib/swineherd/foo
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
@('_')@
|