swineherd 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.2
+ 0.0.4
@@ -116,19 +116,46 @@ module Swineherd
  end

  #
- # BZIP
+ # BZIP
  #
  def bzip input, output
  system("#{@hadoop_home}/bin/hadoop \\
- jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+ jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
  -D mapred.output.compress=true \\
  -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
  -D mapred.reduce.tasks=1 \\
  -mapper \"/bin/cat\" \\
- -reducer \"/bin/cat\" \\
+ -reducer \"/bin/cat\" \\
  -input \"#{input}\" \\
  -output \"#{output}\"")
- end
+ end
+
+ #
+ # Merges many input files into :reduce_tasks amount of output files
+ #
+ def dist_merge inputs, output, options = {}
+ options[:reduce_tasks] ||= 25
+ options[:partition_fields] ||= 2
+ options[:sort_fields] ||= 2
+ options[:field_separator] ||= '/t'
+ names = inputs.map{|inp| File.basename(inp)}.join(',')
+ cmd = "#{@hadoop_home}/bin/hadoop \\
+ jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+ -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+ -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+ -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+ -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+ -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+ -D mapred.min.split.size=1000000000 \\
+ -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+ -mapper \"/bin/cat\" \\
+ -reducer \"/usr/bin/uniq\" \\
+ -input \"#{inputs.join(',')}\" \\
+ -output \"#{output}\""
+ puts cmd
+ system cmd
+ end

  #
  # Copy hdfs file to local filesystem
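
A note on the new dist_merge above: it shells out to Hadoop streaming with a KeyFieldBasedPartitioner, /bin/cat as the mapper, and /usr/bin/uniq as the reducer, so many small input files are collapsed into :reduce_tasks merged output files. The sketch below shows how it might be called; the factory call used to obtain the filesystem object is an assumption (the file owning this hunk is not named in the diff), and only the dist_merge signature and option keys come from the code above.

# Hypothetical usage sketch -- Swineherd::FileSystem.get(:hdfs) is assumed,
# not taken from this diff; adjust to however the Hadoop filesystem is built.
require 'swineherd'

hdfs = Swineherd::FileSystem.get(:hdfs)
hdfs.dist_merge(
  ["/data/part-00000", "/data/part-00001", "/data/part-00002"], # many small inputs
  "/data/merged",                                               # merged output path
  :reduce_tasks     => 5,  # number of merged output files (defaults to 25)
  :partition_fields => 2,  # key fields used for partitioning
  :sort_fields      => 2   # key fields used for sorting
)
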
@@ -138,7 +165,7 @@ module Swineherd
  end

  #
- # Copyy local file to hdfs filesystem
+ # Copy local file to hdfs filesystem
  #
  def copy_from_local srcfile, dstfile
  @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
@@ -259,7 +286,7 @@ module Swineherd
  require 'java'
  @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
  @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
- $CLASSPATH << @hadoop_conf
+ $CLASSPATH << @hadoop_conf
  Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}

  java_import 'org.apache.hadoop.conf.Configuration'
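
This hunk touches the initialization that appends the Hadoop conf directory to JRuby's $CLASSPATH and requires the Hadoop jars; that setup is what lets the java_import calls below it resolve. A small sketch of what becomes possible afterwards (assumes JRuby and a local Hadoop install; the FileSystem lookup is standard Hadoop API, not part of this diff):

# Sketch only: run under JRuby after the classpath/jar setup shown above.
conf = Configuration.new                        # picks up core-site.xml from the conf dir on the classpath
fs   = org.apache.hadoop.fs.FileSystem.get(conf)
puts fs.get_uri.to_s                            # e.g. hdfs://namenode:8020
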
@@ -147,7 +147,17 @@ module Swineherd
  end
  end

- #
+ # right now this only works on single files
+ def copy_to_local srcpath, dstpath
+ src_bucket = bucket(srcpath)
+ src_key_path = key_path(srcpath)
+ dstfile = File.new(dstpath, 'w')
+ @s3.interface.get(src_bucket, src_key_path) do |chunk|
+ dstfile.write(chunk)
+ end
+ dstfile.close
+ end
+
  # This is a bit funny, there's actually no need to create a 'path' since
  # s3 is nothing more than a glorified key-value store. When you create a
  # 'file' (key) the 'path' will be created for you. All we do here is create
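
The new copy_to_local in this hunk streams an S3 object into a local file chunk by chunk via @s3.interface.get and, as its comment says, only handles single files. A minimal usage sketch, assuming the class is Swineherd::S3FileSystem (suggested by the s3filesystem.rb path elsewhere in this diff) and that it is constructed from AWS credentials:

# Hypothetical sketch -- the constructor arguments are an assumption, not from this diff.
s3fs = Swineherd::S3FileSystem.new(ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])
s3fs.copy_to_local("s3://my-bucket/logs/2011-06-22.tsv", "/tmp/2011-06-22.tsv")
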
@@ -195,6 +205,16 @@ module Swineherd
  uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
  dirs[0][0...uncommon_idx].join('/')
  end
+
+ def put srcpath, destpath
+ dest_bucket = bucket(destpath)
+ if File.directory? srcpath
+ # handle Dir later
+ else
+ key = srcpath
+ end
+ @s3.interface.put(dest_bucket, key, File.open(srcpath))
+ end

  def close *args
  end
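
The new put method is the upload direction: it resolves the destination bucket and hands an opened local file to @s3.interface.put. Note that the directory branch is left as a stub ("handle Dir later") and the S3 key is taken from the source path as written. Reusing the hypothetical s3fs handle from the previous sketch:

# Hypothetical sketch; single files only, per the stubbed directory branch above.
s3fs.put("/tmp/report.tsv", "s3://my-bucket/reports")
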
@@ -234,7 +254,7 @@ module Swineherd
  # downloading...
  #
  def readline
- @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+ @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
  begin
  @handle.next
  rescue StopIteration, NoMethodError
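
readline, shown above with only a whitespace change, memoizes an enumerator over the fetched object the first time it is called and returns the next line on each call, rescuing StopIteration once the object is exhausted. A hedged sketch of pulling a couple of lines (obtaining the handle via an open call on the filesystem is an assumption):

# Hypothetical sketch -- s3fs.open returning an object with this readline is assumed.
file      = s3fs.open("s3://my-bucket/logs/2011-06-22.tsv")
header    = file.readline
first_row = file.readline
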
@@ -5,11 +5,11 @@

  Gem::Specification.new do |s|
  s.name = %q{swineherd}
- s.version = "0.0.2"
+ s.version = "0.0.4"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jacob Perkins"]
- s.date = %q{2011-06-01}
+ s.date = %q{2011-06-22}
  s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
  s.email = %q{jacob.a.perkins@gmail.com}
  s.executables = ["hdp-tree", "hadoop-stream"]
@@ -39,7 +39,6 @@ Gem::Specification.new do |s|
  "lib/swineherd/filesystem/localfilesystem.rb",
  "lib/swineherd/filesystem/localfs.rb",
  "lib/swineherd/filesystem/s3filesystem.rb",
- "lib/swineherd/foo",
  "lib/swineherd/script.rb",
  "lib/swineherd/script/hadoop_script.rb",
  "lib/swineherd/script/pig_script.rb",
@@ -57,14 +56,11 @@ Gem::Specification.new do |s|
  s.homepage = %q{http://github.com/Ganglion/swineherd}
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
- s.rubygems_version = %q{1.4.2}
+ s.rubygems_version = %q{1.3.7}
  s.summary = %q{Flexible data workflow glue.}
- s.test_files = [
- "examples/pagerank/pagerank.rb",
- "examples/pagerank/scripts/cut_off_list.rb"
- ]

  if s.respond_to? :specification_version then
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
  s.specification_version = 3

  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: swineherd
  version: !ruby/object:Gem::Version
- hash: 27
- prerelease:
+ hash: 23
+ prerelease: false
  segments:
  - 0
  - 0
- - 2
- version: 0.0.2
+ - 4
+ version: 0.0.4
  platform: ruby
  authors:
  - Jacob Perkins
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-06-01 00:00:00 +00:00
+ date: 2011-06-22 00:00:00 -05:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -152,7 +152,6 @@ files:
  - lib/swineherd/filesystem/localfilesystem.rb
  - lib/swineherd/filesystem/localfs.rb
  - lib/swineherd/filesystem/s3filesystem.rb
- - lib/swineherd/foo
  - lib/swineherd/script.rb
  - lib/swineherd/script/hadoop_script.rb
  - lib/swineherd/script/pig_script.rb
@@ -196,10 +195,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements: []

  rubyforge_project:
- rubygems_version: 1.4.2
+ rubygems_version: 1.3.7
  signing_key:
  specification_version: 3
  summary: Flexible data workflow glue.
- test_files:
- - examples/pagerank/pagerank.rb
- - examples/pagerank/scripts/cut_off_list.rb
+ test_files: []
+
@@ -1 +0,0 @@
- @('_')@