swineherd 0.0.2 → 0.0.4

data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.2
+ 0.0.4
@@ -116,19 +116,46 @@ module Swineherd
  end

  #
- # BZIP
+ # BZIP
  #
  def bzip input, output
  system("#{@hadoop_home}/bin/hadoop \\
- jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+ jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
  -D mapred.output.compress=true \\
  -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
  -D mapred.reduce.tasks=1 \\
  -mapper \"/bin/cat\" \\
- -reducer \"/bin/cat\" \\
+ -reducer \"/bin/cat\" \\
  -input \"#{input}\" \\
  -output \"#{output}\"")
- end
+ end
+
+ #
+ # Merges many input files into :reduce_tasks amount of output files
+ #
+ def dist_merge inputs, output, options = {}
+ options[:reduce_tasks] ||= 25
+ options[:partition_fields] ||= 2
+ options[:sort_fields] ||= 2
+ options[:field_separator] ||= '/t'
+ names = inputs.map{|inp| File.basename(inp)}.join(',')
+ cmd = "#{@hadoop_home}/bin/hadoop \\
+ jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+ -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+ -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+ -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+ -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+ -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+ -D mapred.min.split.size=1000000000 \\
+ -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+ -mapper \"/bin/cat\" \\
+ -reducer \"/usr/bin/uniq\" \\
+ -input \"#{inputs.join(',')}\" \\
+ -output \"#{output}\""
+ puts cmd
+ system cmd
+ end

  #
  # Copy hdfs file to local filesystem
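The new dist_merge shells out to Hadoop Streaming with a KeyFieldBasedPartitioner so that many part files are collapsed into :reduce_tasks output files, partitioned and sorted on the leading fields. One thing to watch: the :field_separator default of '/t' is a literal slash-t, where a tab ("\t") was probably intended. A minimal usage sketch follows; the Swineherd::FileSystem.get(:hdfs) factory call is an assumption about how the gem exposes this filesystem, so substitute whatever entry point you actually use.

    # Hypothetical usage sketch -- FileSystem.get(:hdfs) is assumed to return
    # the Hadoop filesystem object that defines dist_merge.
    require 'swineherd'

    hdfs = Swineherd::FileSystem.get(:hdfs)

    # Collapse three daily directories of part files into 10 merged outputs,
    # partitioning and sorting on the first two fields (the defaults).
    hdfs.dist_merge(
      ["/data/logs/2011-06-20", "/data/logs/2011-06-21", "/data/logs/2011-06-22"],
      "/data/logs/merged",
      :reduce_tasks => 10
    )
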
@@ -138,7 +165,7 @@ module Swineherd
  end

  #
- # Copyy local file to hdfs filesystem
+ # Copy local file to hdfs filesystem
  #
  def copy_from_local srcfile, dstfile
  @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
@@ -259,7 +286,7 @@ module Swineherd
  require 'java'
  @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
  @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
- $CLASSPATH << @hadoop_conf
+ $CLASSPATH << @hadoop_conf
  Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}

  java_import 'org.apache.hadoop.conf.Configuration'
@@ -147,7 +147,17 @@ module Swineherd
  end
  end

- #
+ # right now this only works on single files
+ def copy_to_local srcpath, dstpath
+ src_bucket = bucket(srcpath)
+ src_key_path = key_path(srcpath)
+ dstfile = File.new(dstpath, 'w')
+ @s3.interface.get(src_bucket, src_key_path) do |chunk|
+ dstfile.write(chunk)
+ end
+ dstfile.close
+ end
+
  # This is a bit funny, there's actually no need to create a 'path' since
  # s3 is nothing more than a glorified key-value store. When you create a
  # 'file' (key) the 'path' will be created for you. All we do here is create
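The added copy_to_local streams a single S3 object to a local file in chunks through right_aws's block form of get, so large objects never have to fit in memory; as the comment says, it only handles single keys, not directories. A hedged usage sketch (the FileSystem.get(:s3) factory and the bucket-then-key path layout are assumptions; credentials are expected to be configured elsewhere):

    # Hypothetical usage sketch -- the path format must be whatever
    # bucket() and key_path() expect (bucket first, then the key).
    s3fs = Swineherd::FileSystem.get(:s3)
    s3fs.copy_to_local("my-bucket/path/to/object.tsv", "/tmp/object.tsv")
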
@@ -195,6 +205,16 @@ module Swineherd
  uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
  dirs[0][0...uncommon_idx].join('/')
  end
+
+ def put srcpath, destpath
+ dest_bucket = bucket(destpath)
+ if File.directory? srcpath
+ # handle Dir later
+ else
+ key = srcpath
+ end
+ @s3.interface.put(dest_bucket, key, File.open(srcpath))
+ end

  def close *args
  end
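put uploads a single local file with right_aws's S3Interface#put; the File.directory? branch is an acknowledged stub ("handle Dir later"), and key is set to the full local srcpath, so that is the name the object gets inside dest_bucket. A minimal sketch of the single-file case, with the same hedges as above:

    # Hypothetical usage sketch -- single files only for now.
    s3fs = Swineherd::FileSystem.get(:s3)
    s3fs.put("/tmp/report.tsv", "my-bucket/reports")
    # The resulting key is "/tmp/report.tsv" under my-bucket, because
    # key = srcpath in the current implementation.
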
@@ -234,7 +254,7 @@ module Swineherd
  # downloading...
  #
  def readline
- @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+ @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
  begin
  @handle.next
  rescue StopIteration, NoMethodError
@@ -5,11 +5,11 @@

  Gem::Specification.new do |s|
  s.name = %q{swineherd}
- s.version = "0.0.2"
+ s.version = "0.0.4"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jacob Perkins"]
- s.date = %q{2011-06-01}
+ s.date = %q{2011-06-22}
  s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
  s.email = %q{jacob.a.perkins@gmail.com}
  s.executables = ["hdp-tree", "hadoop-stream"]
@@ -39,7 +39,6 @@ Gem::Specification.new do |s|
  "lib/swineherd/filesystem/localfilesystem.rb",
  "lib/swineherd/filesystem/localfs.rb",
  "lib/swineherd/filesystem/s3filesystem.rb",
- "lib/swineherd/foo",
  "lib/swineherd/script.rb",
  "lib/swineherd/script/hadoop_script.rb",
  "lib/swineherd/script/pig_script.rb",
@@ -57,14 +56,11 @@ Gem::Specification.new do |s|
  s.homepage = %q{http://github.com/Ganglion/swineherd}
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
- s.rubygems_version = %q{1.4.2}
+ s.rubygems_version = %q{1.3.7}
  s.summary = %q{Flexible data workflow glue.}
- s.test_files = [
- "examples/pagerank/pagerank.rb",
- "examples/pagerank/scripts/cut_off_list.rb"
- ]

  if s.respond_to? :specification_version then
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
  s.specification_version = 3

  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: swineherd
  version: !ruby/object:Gem::Version
- hash: 27
- prerelease:
+ hash: 23
+ prerelease: false
  segments:
  - 0
  - 0
- - 2
- version: 0.0.2
+ - 4
+ version: 0.0.4
  platform: ruby
  authors:
  - Jacob Perkins
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-06-01 00:00:00 +00:00
+ date: 2011-06-22 00:00:00 -05:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -152,7 +152,6 @@ files:
  - lib/swineherd/filesystem/localfilesystem.rb
  - lib/swineherd/filesystem/localfs.rb
  - lib/swineherd/filesystem/s3filesystem.rb
- - lib/swineherd/foo
  - lib/swineherd/script.rb
  - lib/swineherd/script/hadoop_script.rb
  - lib/swineherd/script/pig_script.rb
@@ -196,10 +195,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements: []

  rubyforge_project:
- rubygems_version: 1.4.2
+ rubygems_version: 1.3.7
  signing_key:
  specification_version: 3
  summary: Flexible data workflow glue.
- test_files:
- - examples/pagerank/pagerank.rb
- - examples/pagerank/scripts/cut_off_list.rb
+ test_files: []
+
@@ -1 +0,0 @@
- @('_')@