swineherd 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.2
+ 0.0.4
@@ -116,19 +116,46 @@ module Swineherd
  end

  #
- # BZIP
+ # BZIP
  #
  def bzip input, output
  system("#{@hadoop_home}/bin/hadoop \\
- jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+ jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
  -D mapred.output.compress=true \\
  -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
  -D mapred.reduce.tasks=1 \\
  -mapper \"/bin/cat\" \\
- -reducer \"/bin/cat\" \\
+ -reducer \"/bin/cat\" \\
  -input \"#{input}\" \\
  -output \"#{output}\"")
- end
+ end
+
+ #
+ # Merges many input files into :reduce_tasks amount of output files
+ #
+ def dist_merge inputs, output, options = {}
+ options[:reduce_tasks] ||= 25
+ options[:partition_fields] ||= 2
+ options[:sort_fields] ||= 2
+ options[:field_separator] ||= '/t'
+ names = inputs.map{|inp| File.basename(inp)}.join(',')
+ cmd = "#{@hadoop_home}/bin/hadoop \\
+ jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+ -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+ -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+ -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+ -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+ -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+ -D mapred.min.split.size=1000000000 \\
+ -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+ -mapper \"/bin/cat\" \\
+ -reducer \"/usr/bin/uniq\" \\
+ -input \"#{inputs.join(',')}\" \\
+ -output \"#{output}\""
+ puts cmd
+ system cmd
+ end

  #
  # Copy hdfs file to local filesystem
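
A note on the new dist_merge above: it shells out to Hadoop streaming with a KeyFieldBasedPartitioner, /bin/cat as the mapper, and /usr/bin/uniq as the reducer, so many small input files are collapsed into :reduce_tasks merged output files. The sketch below shows how it might be called; the factory call used to obtain the filesystem object is an assumption (the file owning this hunk is not named in the diff), and only the dist_merge signature and option keys come from the code above.

# Hypothetical usage sketch -- Swineherd::FileSystem.get(:hdfs) is assumed,
# not taken from this diff; adjust to however the Hadoop filesystem is built.
require 'swineherd'

hdfs = Swineherd::FileSystem.get(:hdfs)
hdfs.dist_merge(
  ["/data/part-00000", "/data/part-00001", "/data/part-00002"], # many small inputs
  "/data/merged",                                               # merged output path
  :reduce_tasks     => 5,  # number of merged output files (defaults to 25)
  :partition_fields => 2,  # key fields used for partitioning
  :sort_fields      => 2   # key fields used for sorting
)
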
@@ -138,7 +165,7 @@ module Swineherd
  end

  #
- # Copyy local file to hdfs filesystem
+ # Copy local file to hdfs filesystem
  #
  def copy_from_local srcfile, dstfile
  @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
@@ -259,7 +286,7 @@ module Swineherd
  require 'java'
  @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
  @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
- $CLASSPATH << @hadoop_conf
+ $CLASSPATH << @hadoop_conf
  Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}

  java_import 'org.apache.hadoop.conf.Configuration'
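
This hunk touches the initialization that appends the Hadoop conf directory to JRuby's $CLASSPATH and requires the Hadoop jars; that setup is what lets the java_import calls below it resolve. A small sketch of what becomes possible afterwards (assumes JRuby and a local Hadoop install; the FileSystem lookup is standard Hadoop API, not part of this diff):

# Sketch only: run under JRuby after the classpath/jar setup shown above.
conf = Configuration.new                        # picks up core-site.xml from the conf dir on the classpath
fs   = org.apache.hadoop.fs.FileSystem.get(conf)
puts fs.get_uri.to_s                            # e.g. hdfs://namenode:8020
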
@@ -147,7 +147,17 @@ module Swineherd
  end
  end

- #
+ # right now this only works on single files
+ def copy_to_local srcpath, dstpath
+ src_bucket = bucket(srcpath)
+ src_key_path = key_path(srcpath)
+ dstfile = File.new(dstpath, 'w')
+ @s3.interface.get(src_bucket, src_key_path) do |chunk|
+ dstfile.write(chunk)
+ end
+ dstfile.close
+ end
+
  # This is a bit funny, there's actually no need to create a 'path' since
  # s3 is nothing more than a glorified key-value store. When you create a
  # 'file' (key) the 'path' will be created for you. All we do here is create
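
The new copy_to_local in this hunk streams an S3 object into a local file chunk by chunk via @s3.interface.get and, as its comment says, only handles single files. A minimal usage sketch, assuming the class is Swineherd::S3FileSystem (suggested by the s3filesystem.rb path elsewhere in this diff) and that it is constructed from AWS credentials:

# Hypothetical sketch -- the constructor arguments are an assumption, not from this diff.
s3fs = Swineherd::S3FileSystem.new(ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])
s3fs.copy_to_local("s3://my-bucket/logs/2011-06-22.tsv", "/tmp/2011-06-22.tsv")
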
@@ -195,6 +205,16 @@ module Swineherd
  uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
  dirs[0][0...uncommon_idx].join('/')
  end
+
+ def put srcpath, destpath
+ dest_bucket = bucket(destpath)
+ if File.directory? srcpath
+ # handle Dir later
+ else
+ key = srcpath
+ end
+ @s3.interface.put(dest_bucket, key, File.open(srcpath))
+ end

  def close *args
  end
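
The new put method is the upload direction: it resolves the destination bucket and hands an opened local file to @s3.interface.put. Note that the directory branch is left as a stub ("handle Dir later") and the S3 key is taken from the source path as written. Reusing the hypothetical s3fs handle from the previous sketch:

# Hypothetical sketch; single files only, per the stubbed directory branch above.
s3fs.put("/tmp/report.tsv", "s3://my-bucket/reports")
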
@@ -234,7 +254,7 @@ module Swineherd
  # downloading...
  #
  def readline
- @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+ @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
  begin
  @handle.next
  rescue StopIteration, NoMethodError
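
readline, shown above with only a whitespace change, memoizes an enumerator over the fetched object the first time it is called and returns the next line on each call, rescuing StopIteration once the object is exhausted. A hedged sketch of pulling a couple of lines (obtaining the handle via an open call on the filesystem is an assumption):

# Hypothetical sketch -- s3fs.open returning an object with this readline is assumed.
file      = s3fs.open("s3://my-bucket/logs/2011-06-22.tsv")
header    = file.readline
first_row = file.readline
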
@@ -5,11 +5,11 @@

  Gem::Specification.new do |s|
  s.name = %q{swineherd}
- s.version = "0.0.2"
+ s.version = "0.0.4"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jacob Perkins"]
- s.date = %q{2011-06-01}
+ s.date = %q{2011-06-22}
  s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
  s.email = %q{jacob.a.perkins@gmail.com}
  s.executables = ["hdp-tree", "hadoop-stream"]
@@ -39,7 +39,6 @@ Gem::Specification.new do |s|
  "lib/swineherd/filesystem/localfilesystem.rb",
  "lib/swineherd/filesystem/localfs.rb",
  "lib/swineherd/filesystem/s3filesystem.rb",
- "lib/swineherd/foo",
  "lib/swineherd/script.rb",
  "lib/swineherd/script/hadoop_script.rb",
  "lib/swineherd/script/pig_script.rb",
@@ -57,14 +56,11 @@ Gem::Specification.new do |s|
  s.homepage = %q{http://github.com/Ganglion/swineherd}
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
- s.rubygems_version = %q{1.4.2}
+ s.rubygems_version = %q{1.3.7}
  s.summary = %q{Flexible data workflow glue.}
- s.test_files = [
- "examples/pagerank/pagerank.rb",
- "examples/pagerank/scripts/cut_off_list.rb"
- ]

  if s.respond_to? :specification_version then
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
  s.specification_version = 3

  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: swineherd
  version: !ruby/object:Gem::Version
- hash: 27
- prerelease:
+ hash: 23
+ prerelease: false
  segments:
  - 0
  - 0
- - 2
- version: 0.0.2
+ - 4
+ version: 0.0.4
  platform: ruby
  authors:
  - Jacob Perkins
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-06-01 00:00:00 +00:00
+ date: 2011-06-22 00:00:00 -05:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -152,7 +152,6 @@ files:
  - lib/swineherd/filesystem/localfilesystem.rb
  - lib/swineherd/filesystem/localfs.rb
  - lib/swineherd/filesystem/s3filesystem.rb
- - lib/swineherd/foo
  - lib/swineherd/script.rb
  - lib/swineherd/script/hadoop_script.rb
  - lib/swineherd/script/pig_script.rb
@@ -196,10 +195,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements: []

  rubyforge_project:
- rubygems_version: 1.4.2
+ rubygems_version: 1.3.7
  signing_key:
  specification_version: 3
  summary: Flexible data workflow glue.
- test_files:
- - examples/pagerank/pagerank.rb
- - examples/pagerank/scripts/cut_off_list.rb
+ test_files: []
+
@@ -1 +0,0 @@
- @('_')@