swineherd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
+ module Swineherd
+
+   #
+   # Methods for dealing with the Hadoop Distributed File System (HDFS). This
+   # class requires that you run with JRuby as it makes use of the native java
+   # hadoop libraries.
+   #
+   class HadoopFileSystem
+
+     include Swineherd::BaseFileSystem
+
+     attr_accessor :conf, :hdfs
+
+     #
+     # Initialize a new hadoop file system. The hadoop configuration is picked
+     # up from the environment (HADOOP_HOME, HADOOP_CONF_DIR) via
+     # check_and_set_environment.
+     #
+     def initialize *args
+       check_and_set_environment
+       @conf = Java::org.apache.hadoop.conf.Configuration.new
+       @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf)
+     end
+
+     #
+     # Make sure environment is sane then set up environment for use
+     #
+     def check_and_set_environment
+       check_env
+       set_env
+     end
+
+     def open path, mode="r", &blk
+       HadoopFile.new(path,mode,self,&blk)
+     end
+
+
+     def rm path
+       @hdfs.delete(Path.new(path), true)
+       [path]
+     end
+
+     def exists? path
+       @hdfs.exists(Path.new(path))
+     end
+
+     def mv srcpath, dstpath
+       @hdfs.rename(Path.new(srcpath), Path.new(dstpath))
+     end
+
+     def cp srcpath, dstpath
+       FileUtil.copy(@hdfs, Path.new(srcpath), @hdfs, Path.new(dstpath), false, @conf)
+     end
+
+     def mkpath path
+       @hdfs.mkdirs(Path.new(path))
+       path
+     end
+
+     def type path
+       return "unknown" unless exists? path
+       status = @hdfs.get_file_status(Path.new(path))
+       return "directory" if status.is_dir?
+       "file"
+       # case
+       # when status.isFile then
+       #   return "file"
+       # when status.is_directory? then
+       #   return "directory"
+       # when status.is_symlink? then
+       #   return "symlink"
+       # end
+     end
+
+     def entries dirpath
+       return unless type(dirpath) == "directory"
+       list = @hdfs.list_status(Path.new(dirpath))
+       list.map{|path| path.get_path.to_s} rescue []
+     end
+
+     #
+     # Merge all part files in a directory into one file.
+     #
+     def merge srcdir, dstfile
+       FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "")
+     end
+
+     #
+     # This is hackety. Use with caution.
+     #
+     def stream input, output
+       require 'uri'
+       input_fs_scheme  = URI.parse(input).scheme
+       output_fs_scheme = URI.parse(output).scheme
+       system("#{@hadoop_home}/bin/hadoop \\
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+        -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\
+        -D mapred.min.split.size=1000000000 \\
+        -D mapred.reduce.tasks=0 \\
+        -mapper \"/bin/cat\" \\
+        -input \"#{input}\" \\
+        -output \"#{output}\"")
+     end
+
+     #
+     # BZIP
+     #
+     def bzip input, output
+       system("#{@hadoop_home}/bin/hadoop \\
+        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
+        -D mapred.output.compress=true \\
+        -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
+        -D mapred.reduce.tasks=1 \\
+        -mapper \"/bin/cat\" \\
+        -reducer \"/bin/cat\" \\
+        -input \"#{input}\" \\
+        -output \"#{output}\"")
+     end
+
+     #
+     # Copy hdfs file to local filesystem
+     #
+     def copy_to_local srcfile, dstfile
+       @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile))
+     end
+
+     #
+     # Copy local file to hdfs filesystem
+     #
+     def copy_from_local srcfile, dstfile
+       @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
+     end
+
+     def close *args
+       @hdfs.close
+     end
+
+     class HadoopFile
+       attr_accessor :path, :handle, :hdfs
+
+       #
+       # In order to open input and output streams we must pass around the
+       # hadoop fs object itself
+       #
+       def initialize path, mode, fs, &blk
+         @fs   = fs
+         @path = Path.new(path)
+         case mode
+         when "r" then
+           raise "#{@fs.type(path)} is not a readable file - #{path}" unless @fs.type(path) == "file"
+           @handle = @fs.hdfs.open(@path).to_io(&blk)
+         when "w" then
+           # Open path for writing
+           raise "Path #{path} is a directory." unless (@fs.type(path) == "file") || (@fs.type(path) == "unknown")
+           @handle = @fs.hdfs.create(@path).to_io.to_outputstream
+           if block_given?
+             yield self
+             self.close # very, very important
+           end
+         end
+       end
+
+       def read
+         @handle.read
+       end
+
+       def readline
+         @handle.readline
+       end
+
+       def write string
+         @handle.write(string.to_java_string.get_bytes)
+       end
+
+       def puts string
+         write(string+"\n")
+       end
+
+       def close
+         @handle.close
+       end
+
+     end
+
+     # #
+     # # Distributed streaming from input to output
+     # #
+     #
+     # #
+     # # Given an array of input dirs, stream all into output dir and remove duplicate records.
+     # # Reasonable default hadoop streaming options are chosen.
+     # #
+     # def self.merge inputs, output, options = {}
+     #   options[:reduce_tasks]     ||= 25
+     #   options[:partition_fields] ||= 2
+     #   options[:sort_fields]      ||= 2
+     #   options[:field_separator]  ||= '/t'
+     #   names = inputs.map{|inp| File.basename(inp)}.join(',')
+     #   cmd = "${HADOOP_HOME}/bin/hadoop \\
+     #    jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \\
+     #    -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
+     #    -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
+     #    -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
+     #    -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
+     #    -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
+     #    -D mapred.min.split.size=1000000000 \\
+     #    -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
+     #    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+     #    -mapper \"/bin/cat\" \\
+     #    -reducer \"/usr/bin/uniq\" \\
+     #    -input \"#{inputs.join(',')}\" \\
+     #    -output \"#{output}\""
+     #   puts cmd
+     #   system cmd
+     # end
+     #
+     # #
+     # # Concatenates a hadoop dir or file into a local file
+     # #
+     # def self.cat_to_local src, dest
+     #   system %Q{hadoop fs -cat #{src}/[^_]* > #{dest}} unless File.exist?(dest)
+     # end
+     #
+
+     #
+     # Check that we are running with JRuby and locate the hadoop install.
+     # hadoop_home is set to the HADOOP_HOME environment variable when it is
+     # defined, to '/usr/local/share/hadoop' otherwise, and to '/usr/lib/hadoop'
+     # if '/usr/local/share/hadoop' doesn't exist. If all else fails, inform the
+     # user that HADOOP_HOME really should be set.
+     #
+     def check_env
+       begin
+         require 'java'
+       rescue LoadError => e
+         raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message
+       end
+       @hadoop_home = (ENV['HADOOP_HOME'] || '/usr/local/share/hadoop')
+       @hadoop_home = '/usr/lib/hadoop' unless File.exist? @hadoop_home
+       raise "\nHadoop installation not found, try setting HADOOP_HOME\n" unless File.exist? @hadoop_home
+     end
+
+     #
+     # Place hadoop jars in class path, require appropriate jars, set hadoop conf
+     #
+     def set_env
+       require 'java'
+       @hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
+       @hadoop_conf += "/" unless @hadoop_conf.end_with? "/"
+       $CLASSPATH << @hadoop_conf
+       Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}
+
+       java_import 'org.apache.hadoop.conf.Configuration'
+       java_import 'org.apache.hadoop.fs.Path'
+       java_import 'org.apache.hadoop.fs.FileSystem'
+       java_import 'org.apache.hadoop.fs.FileUtil'
+       java_import 'org.apache.hadoop.mapreduce.lib.input.FileInputFormat'
+       java_import 'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat'
+       java_import 'org.apache.hadoop.fs.FSDataOutputStream'
+       java_import 'org.apache.hadoop.fs.FSDataInputStream'
+
+     end
+
+   end
+
+ end
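
The class above only works under JRuby with a local Hadoop install on the classpath. A minimal usage sketch (paths are illustrative, and it assumes require 'swineherd' loads Swineherd::BaseFileSystem together with this class):

  require 'swineherd'   # run under JRuby with HADOOP_HOME set

  fs = Swineherd::HadoopFileSystem.new

  # write a file, then read it back
  fs.open("/tmp/swineherd_example.txt", "w") { |f| f.puts "hello hdfs" }
  puts fs.type("/tmp/swineherd_example.txt")        # => "file"
  puts fs.open("/tmp/swineherd_example.txt").read   # => "hello hdfs\n"

  # list a directory, then clean up
  p fs.entries("/tmp")
  fs.rm "/tmp/swineherd_example.txt"
  fs.close
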
@@ -0,0 +1,83 @@
+ require 'fileutils'
+ module Swineherd
+
+   class LocalFileSystem
+
+     include Swineherd::BaseFileSystem
+
+     def initialize *args
+     end
+
+     def open path, mode="r", &blk
+       return LocalFile.new path, mode, &blk
+     end
+
+     def rm path
+       FileUtils.rm_r path
+     end
+
+     def exists? path
+       File.exist?(path)
+     end
+
+     def mv srcpath, dstpath
+       FileUtils.mv(srcpath,dstpath)
+     end
+
+     def cp srcpath, dstpath
+       FileUtils.cp_r(srcpath,dstpath)
+     end
+
+     def mkpath path
+       FileUtils.mkpath path
+     end
+
+     def type path
+       case
+       when File.symlink?(path) then
+         return "symlink"
+       when File.directory?(path) then
+         return "directory"
+       when File.file?(path) then
+         return "file"
+       end
+       "unknown"
+     end
+
+     def entries dirpath
+       return unless (type(dirpath) == "directory")
+       Dir.entries(dirpath)
+     end
+
52
+ class LocalFile
53
+ attr_accessor :path, :scheme, :handle, :mode
54
+
55
+ def initialize path, mode="r", &blk
56
+ @path = path
57
+ @mode = mode
58
+ @handle = File.open(path,mode,&blk)
59
+ end
60
+
61
+ def open path, mode="r", &blk
62
+ initialize(path,mode,&blk)
63
+ end
64
+
65
+ def read
66
+ @handle.read
67
+ end
68
+
69
+ def readline
70
+ @handle.gets
71
+ end
72
+
73
+ def write string
74
+ @handle.write(string)
75
+ end
76
+
77
+ def close
78
+ @handle.close
79
+ end
80
+ end
81
+
82
+ end
83
+ end
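
A minimal usage sketch of the local variant above (file names are illustrative; only a standard Ruby with the gem loaded is needed):

  require 'swineherd'

  fs = Swineherd::LocalFileSystem.new
  fs.mkpath "/tmp/swineherd_local"

  file = fs.open("/tmp/swineherd_local/example.txt", "w")
  file.write "hello local fs\n"
  file.close

  puts fs.type("/tmp/swineherd_local/example.txt")   # => "file"
  puts fs.open("/tmp/swineherd_local/example.txt").read
  fs.rm "/tmp/swineherd_local"
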
@@ -0,0 +1,11 @@
+ module Swineherd
+   class LocalFS
+     #
+     # Returns true when none of the given paths exist yet (safe to run),
+     # false when they all exist, and raises when only some of them exist.
+     #
+     def self.check_paths paths
+       exist_count = 0 # number of paths that already exist
+       paths.each{|path| exist_count += 1 if File.exist?(path) }
+       raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
+       return true if exist_count == 0
+       false
+     end
+   end
+ end
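
A sketch of how this guard might be used (the paths and the run_job method are hypothetical): it answers "should this step run?" and refuses to guess when only some of the expected outputs are present.

  outputs = ["/tmp/job/part-00000", "/tmp/job/part-00001"]

  if Swineherd::LocalFS.check_paths(outputs)
    run_job(outputs)                  # hypothetical: none of the outputs exist, safe to run
  else
    puts "outputs present, skipping"  # all outputs already exist
  end
  # if only some outputs exist, check_paths raises "Indeterminate output state"
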
@@ -0,0 +1,249 @@
+ require 'tempfile'
+ require 'uri'
+ module Swineherd
+
+   #
+   # Methods for interacting with Amazon's Simple Storage Service (S3).
+   #
+   class S3FileSystem
+
+     include Swineherd::BaseFileSystem
+
+     attr_accessor :s3
+
+     #
+     # Initialize a new s3 file system; needs an AWS access key id and secret
+     # access key.
+     #
+     def initialize aws_access_key_id, aws_secret_access_key
+       require 'right_aws'
+       @s3 = RightAws::S3.new(aws_access_key_id, aws_secret_access_key)
+     end
+
21
+ def open path, mode="r", &blk
22
+ S3File.new(path,mode,self,&blk)
23
+ end
24
+
25
+ def rm path
26
+ bkt = bucket(path)
27
+ key = key_path(path)
28
+ if key.empty? # only the bucket was passed in, delete it
29
+ @s3.interface.force_delete_bucket(bkt)
30
+ else
31
+ case type(path)
32
+ when "directory" then
33
+ keys_to_delete = lr(path)
34
+ keys_to_delete.each do |k|
35
+ key_to_delete = key_path(k)
36
+ @s3.interface.delete(bkt, key_to_delete)
37
+ end
38
+ keys_to_delete
39
+ when "file" then
40
+ @s3.interface.delete(bkt, key)
41
+ [path]
42
+ end
43
+ end
44
+ end
45
+
46
+ def bucket path
47
+ uri = URI.parse(path)
48
+ uri.path.split('/').reject{|x| x.empty?}.first
49
+ end
50
+
51
+ def key_path path
52
+ uri = URI.parse(path)
53
+ File.join(uri.path.split('/').reject{|x| x.empty?}[1..-1])
54
+ end
55
+
56
+ def needs_trailing_slash pre
57
+ has_trailing_slash = pre.end_with? '/'
58
+ is_empty_prefix = pre.empty?
59
+ !(has_trailing_slash || is_empty_prefix)
60
+ end
61
+
62
+ def full_contents path
63
+ bkt = bucket(path)
64
+ pre = key_path(path)
65
+ pre += '/' if needs_trailing_slash(pre)
66
+ contents = []
67
+ s3.interface.incrementally_list_bucket(bkt, {'prefix' => pre, 'delimiter' => '/'}) do |res|
68
+ contents += res[:common_prefixes].map{|c| File.join(bkt,c)}
69
+ contents += res[:contents].map{|c| File.join(bkt, c[:key])}
70
+ end
71
+ contents
72
+ end
73
+
74
+ def exists? path
75
+ object = File.basename(path)
76
+ search_dir = File.dirname(path)
77
+ case search_dir
78
+ when '.' then # only a bucket was passed in
79
+ begin
80
+ (full_contents(object).size > 0)
81
+ rescue RightAws::AwsError => e
82
+ if e.message =~ /nosuchbucket/i
83
+ false
84
+ else
85
+ raise e
86
+ end
87
+ end
88
+ else
89
+ search_dir_contents = full_contents(search_dir).map{|c| File.basename(c).gsub(/\//, '')}
90
+ search_dir_contents.include?(object)
91
+ end
92
+ end
+
+     def mv srcpath, dstpath
+       src_bucket   = bucket(srcpath)
+       dst_bucket   = bucket(dstpath)
+       dst_key_path = key_path(dstpath)
+       mkpath(dstpath)
+       case type(srcpath)
+       when "directory" then
+         paths_to_copy = lr(srcpath)
+         common_dir    = common_directory(paths_to_copy)
+         paths_to_copy.each do |path|
+           src_key = key_path(path)
+           dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
+           @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key)
+         end
+       when "file" then
+         @s3.interface.move(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
+       end
+     end
+
+     def cp srcpath, dstpath
+       src_bucket   = bucket(srcpath)
+       dst_bucket   = bucket(dstpath)
+       dst_key_path = key_path(dstpath)
+       mkpath(dstpath)
+       case type(srcpath)
+       when "directory" then
+         paths_to_copy = lr(srcpath)
+         common_dir    = common_directory(paths_to_copy)
+         paths_to_copy.each do |path|
+           src_key = key_path(path)
+           dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
+           @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key)
+         end
+       when "file" then
+         @s3.interface.copy(src_bucket, key_path(srcpath), dst_bucket, dst_key_path)
+       end
+     end
+
+     #
+     # This is a bit funny, there's actually no need to create a 'path' since
+     # s3 is nothing more than a glorified key-value store. When you create a
+     # 'file' (key) the 'path' will be created for you. All we do here is create
+     # the bucket unless it already exists.
+     #
+     def mkpath path
+       bkt = bucket(path)
+       key = key_path(path)
+       if key.empty?
+         @s3.interface.create_bucket(bkt)
+       else
+         @s3.interface.create_bucket(bkt) unless exists? bkt
+       end
+       path
+     end
+
+     def type path
+       return "unknown" unless exists? path
+       return "directory" if full_contents(path).size > 0
+       "file"
+     end
+
+     def entries dirpath
+       return unless type(dirpath) == "directory"
+       full_contents(dirpath)
+     end
+
+     # Recursively list paths
+     def lr path
+       paths = entries(path)
+       if paths
+         paths.map{|e| lr(e)}.flatten
+       else
+         path
+       end
+     end
+
+     #
+     # Ick.
+     #
+     def common_directory paths
+       dirs     = paths.map{|path| path.split('/')}
+       min_size = dirs.map{|splits| splits.size}.min
+       dirs.map!{|splits| splits[0...min_size]}
+       uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
+       dirs[0][0...uncommon_idx].join('/')
+     end
+
+     def close *args
+     end
+
+     class S3File
+       attr_accessor :path, :handle, :fs
+
+       #
+       # In order to open input and output streams we must pass around the s3
+       # fs object itself
+       #
+       def initialize path, mode, fs, &blk
+         @fs   = fs
+         @path = path
+         case mode
+         when "r" then
+           raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file"
+         when "w" then
+           raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown")
+           @handle = Tempfile.new('s3filestream')
+           if block_given?
+             yield self
+             close
+           end
+         end
+       end
+
+       #
+       # Faster than iterating
+       #
+       def read
+         resp = fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path))
+         resp
+       end
+
+       #
+       # This is a little hackety. That is, once you call (.each) on the object
+       # the full object starts downloading...
+       #
+       def readline
+         @handle ||= fs.s3.interface.get_object(fs.bucket(path), fs.key_path(path)).each
+         begin
+           @handle.next
+         rescue StopIteration, NoMethodError
+           @handle = nil
+           raise EOFError.new("end of file reached")
+         end
+       end
+
+       def write string
+         @handle.write(string)
+       end
+
+       def puts string
+         write(string+"\n")
+       end
+
+       def close
+         if @handle
+           @handle.read # switch the tempfile from writing to reading, flushing buffered writes
+           fs.s3.interface.put(fs.bucket(path), fs.key_path(path), File.open(@handle.path, 'r'))
+           @handle.close
+         end
+         @handle = nil
+       end
+
+     end
+
+   end
+
+ end
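
A minimal usage sketch for the S3 variant (the bucket name is a placeholder, credentials come from the environment, and the right_aws gem must be installed). Note that paths are plain "bucket/key" strings rather than s3:// URLs, since bucket() takes the first path segment as the bucket name:

  require 'swineherd'

  fs = Swineherd::S3FileSystem.new(ENV['AWS_ACCESS_KEY_ID'], ENV['AWS_SECRET_ACCESS_KEY'])

  fs.mkpath "example-bucket"    # creates the bucket unless it already exists
  fs.open("example-bucket/tmp/example.txt", "w") { |f| f.puts "hello s3" }

  puts fs.exists?("example-bucket/tmp/example.txt")     # => true
  puts fs.open("example-bucket/tmp/example.txt").read   # => "hello s3\n"
  fs.rm "example-bucket/tmp/example.txt"
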