swineherd 0.0.2 → 0.0.4
- data/VERSION +1 -1
- data/lib/swineherd/filesystem/hadoopfilesystem.rb +33 -6
- data/lib/swineherd/filesystem/s3filesystem.rb +22 -2
- data/swineherd.gemspec +4 -8
- metadata +8 -10
- data/lib/swineherd/foo +0 -1
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.2
+0.0.4
data/lib/swineherd/filesystem/hadoopfilesystem.rb
CHANGED
@@ -116,19 +116,46 @@ module Swineherd
     end

     #
-    # BZIP
+    # BZIP
     #
     def bzip input, output
       system("#{@hadoop_home}/bin/hadoop \\
-        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
         -D mapred.output.compress=true \\
         -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
         -D mapred.reduce.tasks=1 \\
         -mapper \"/bin/cat\" \\
-        -reducer
+        -reducer \"/bin/cat\" \\
         -input \"#{input}\" \\
         -output \"#{output}\"")
-    end
+    end
+
+    #
+    # Merges many input files into :reduce_tasks amount of output files
+    #
+    def dist_merge inputs, output, options = {}
+      options[:reduce_tasks] ||= 25
+      options[:partition_fields] ||= 2
+      options[:sort_fields] ||= 2
+      options[:field_separator] ||= '/t'
+      names = inputs.map{|inp| File.basename(inp)}.join(',')
+      cmd = "#{@hadoop_home}/bin/hadoop \\
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+        -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+        -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+        -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+        -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+        -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+        -D mapred.min.split.size=1000000000 \\
+        -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+        -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+        -mapper \"/bin/cat\" \\
+        -reducer \"/usr/bin/uniq\" \\
+        -input \"#{inputs.join(',')}\" \\
+        -output \"#{output}\""
+      puts cmd
+      system cmd
+    end

     #
     # Copy hdfs file to local filesystem
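A minimal usage sketch of the new dist_merge (and the repaired bzip invocation): the filesystem constructor arguments, input paths, and reducer count below are invented for illustration, not part of this release.

    require 'swineherd'

    fs = Swineherd::HadoopFileSystem.new

    # Merge many part files into 10 outputs, partitioning and sorting on the
    # first two fields (defaults: 25 reducers, 2 partition/sort fields).
    fs.dist_merge ['/data/logs/part-00000', '/data/logs/part-00001'],
                  '/data/logs_merged', :reduce_tasks => 10

    # Single-reducer BZip2 compression via Hadoop streaming.
    fs.bzip '/data/logs_merged', '/data/logs_merged_bz2'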
@@ -138,7 +165,7 @@ module Swineherd
     end

     #
-    #
+    # Copy local file to hdfs filesystem
     #
     def copy_from_local srcfile, dstfile
       @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
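The hunk above only restores a missing doc comment; for reference, a hedged sketch of the call it documents (paths invented):

    fs.copy_from_local '/tmp/users.tsv', '/data/users.tsv'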
@@ -259,7 +286,7 @@ module Swineherd
       require 'java'
       @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
       @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
-      $CLASSPATH << @hadoop_conf
+      $CLASSPATH << @hadoop_conf
       Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}

       java_import 'org.apache.hadoop.conf.Configuration'
data/lib/swineherd/filesystem/s3filesystem.rb
CHANGED
@@ -147,7 +147,17 @@ module Swineherd
       end
     end

-    #
+    # right now this only works on single files
+    def copy_to_local srcpath, dstpath
+      src_bucket   = bucket(srcpath)
+      src_key_path = key_path(srcpath)
+      dstfile = File.new(dstpath, 'w')
+      @s3.interface.get(src_bucket, src_key_path) do |chunk|
+        dstfile.write(chunk)
+      end
+      dstfile.close
+    end
+
     # This is a bit funny, there's actually no need to create a 'path' since
     # s3 is nothing more than a glorified key-value store. When you create a
     # 'file' (key) the 'path' will be created for you. All we do here is create
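A hedged sketch of the new copy_to_local, which per its own comment handles single files only; the constructor arguments, bucket, and paths are assumptions:

    # Hypothetical: credentials and paths invented for illustration.
    s3fs = Swineherd::S3FileSystem.new
    s3fs.copy_to_local 's3://example-bucket/logs/2011-06-21.tsv', '/tmp/2011-06-21.tsv'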
@@ -195,6 +205,16 @@ module Swineherd
       uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
       dirs[0][0...uncommon_idx].join('/')
     end
+
+    def put srcpath, destpath
+      dest_bucket = bucket(destpath)
+      if File.directory? srcpath
+        # handle Dir later
+      else
+        key = srcpath
+      end
+      @s3.interface.put(dest_bucket, key, File.open(srcpath))
+    end

     def close *args
     end
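A hedged sketch of the new put; the bucket and file are invented. Note that as written the object key is the full local srcpath, and directory sources are stubbed out ("handle Dir later"):

    s3fs.put '/tmp/report.tsv', 's3://example-bucket'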
@@ -234,7 +254,7 @@ module Swineherd
     # downloading...
     #
     def readline
-      @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+      @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
       begin
         @handle.next
       rescue StopIteration, NoMethodError
data/swineherd.gemspec
CHANGED
@@ -5,11 +5,11 @@

 Gem::Specification.new do |s|
   s.name = %q{swineherd}
-  s.version = "0.0.2"
+  s.version = "0.0.4"

   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jacob Perkins"]
-  s.date = %q{2011-06-
+  s.date = %q{2011-06-22}
   s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
   s.email = %q{jacob.a.perkins@gmail.com}
   s.executables = ["hdp-tree", "hadoop-stream"]
@@ -39,7 +39,6 @@ Gem::Specification.new do |s|
     "lib/swineherd/filesystem/localfilesystem.rb",
     "lib/swineherd/filesystem/localfs.rb",
     "lib/swineherd/filesystem/s3filesystem.rb",
-    "lib/swineherd/foo",
     "lib/swineherd/script.rb",
     "lib/swineherd/script/hadoop_script.rb",
     "lib/swineherd/script/pig_script.rb",
@@ -57,14 +56,11 @@ Gem::Specification.new do |s|
   s.homepage = %q{http://github.com/Ganglion/swineherd}
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.
+  s.rubygems_version = %q{1.3.7}
   s.summary = %q{Flexible data workflow glue.}
-  s.test_files = [
-    "examples/pagerank/pagerank.rb",
-    "examples/pagerank/scripts/cut_off_list.rb"
-  ]

   if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
     s.specification_version = 3

     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: swineherd
 version: !ruby/object:Gem::Version
-  hash:
-  prerelease:
+  hash: 23
+  prerelease: false
   segments:
   - 0
   - 0
-  - 2
-  version: 0.0.2
+  - 4
+  version: 0.0.4
 platform: ruby
 authors:
 - Jacob Perkins
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2011-06-
+date: 2011-06-22 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -152,7 +152,6 @@ files:
 - lib/swineherd/filesystem/localfilesystem.rb
 - lib/swineherd/filesystem/localfs.rb
 - lib/swineherd/filesystem/s3filesystem.rb
-- lib/swineherd/foo
 - lib/swineherd/script.rb
 - lib/swineherd/script/hadoop_script.rb
 - lib/swineherd/script/pig_script.rb
@@ -196,10 +195,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []

 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
 summary: Flexible data workflow glue.
-test_files:
-- examples/pagerank/pagerank.rb
-- examples/pagerank/scripts/cut_off_list.rb
+test_files: []
+
data/lib/swineherd/foo
DELETED
@@ -1 +0,0 @@
-@('_')@