swineherd 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +201 -0
- data/README.textile +207 -0
- data/Rakefile +30 -0
- data/VERSION +1 -0
- data/bin/hadoop-stream +35 -0
- data/bin/hdp-tree +26 -0
- data/examples/pagerank/data/seinfeld_network.tsv +429 -0
- data/examples/pagerank/pagerank.rb +99 -0
- data/examples/pagerank/scripts/cut_off_list.rb +16 -0
- data/examples/pagerank/scripts/histogram.R +5 -0
- data/examples/pagerank/scripts/pagerank.pig +20 -0
- data/examples/pagerank/scripts/pagerank_initialize.pig +24 -0
- data/lib/swineherd.rb +11 -0
- data/lib/swineherd/filesystem.rb +26 -0
- data/lib/swineherd/filesystem/README_filesystem.textile +47 -0
- data/lib/swineherd/filesystem/basefilesystem.rb +125 -0
- data/lib/swineherd/filesystem/filesystems.rb +103 -0
- data/lib/swineherd/filesystem/hadoopfilesystem.rb +263 -0
- data/lib/swineherd/filesystem/localfilesystem.rb +83 -0
- data/lib/swineherd/filesystem/localfs.rb +11 -0
- data/lib/swineherd/filesystem/s3filesystem.rb +249 -0
- data/lib/swineherd/script.rb +74 -0
- data/lib/swineherd/script/hadoop_script.rb +59 -0
- data/lib/swineherd/script/pig_script.rb +46 -0
- data/lib/swineherd/script/r_script.rb +14 -0
- data/lib/swineherd/script/wukong_script.rb +31 -0
- data/lib/swineherd/template.rb +45 -0
- data/lib/swineherd/workflow.rb +53 -0
- data/lib/swineherd/workflow/job.rb +60 -0
- data/notes.txt +20 -0
- data/swineherd.gemspec +97 -0
- data/tests/test_filesystem.rb +105 -0
- data/tests/test_s3_filesystem.rb +132 -0
- data/tests/testcfg.yaml +7 -0
- metadata +204 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
module Swineherd
  module Script

    autoload :WukongScript, 'swineherd/script/wukong_script'
    autoload :PigScript,    'swineherd/script/pig_script'
    autoload :RScript,      'swineherd/script/r_script'

    #
    # Shared plumbing for every script type: holds the script source path,
    # its input/output paths, runner options, and template attributes.
    #
    module Common
      attr_accessor :input, :output, :options, :attributes

      # source     -- path to the (possibly templated) script file
      # input      -- array of input paths
      # output     -- array of output paths
      # options    -- hash of command-line options for the runner
      # attributes -- hash of values substituted into the template
      def initialize(source, input = [], output = [], options = {}, attributes = {})
        @source     = source
        @input      = input
        @output     = output
        @options    = options
        @attributes = attributes
      end

      #
      # Allows for setting the environment the script will be ran in
      #
      def env
        ENV
      end

      # Lazily render the template into a runnable file; memoized until
      # 'refresh!' is called.
      def script
        @script ||= Template.new(@source, @attributes).substitute!
      end

      #
      # Reset rendered script and io lists so the same object can be reused.
      #
      def refresh!
        @script = nil
        @output = []
        @input  = []
      end

      #
      # Command line for hadoop mode; depends on the type of script.
      #
      def cmd
        raise "Override this in subclass!"
      end

      #
      # Override this in subclass to decide how the script runs in 'local'
      # mode. Best practice is that it needs to be able to run on a laptop
      # w/o hadoop.
      #
      def local_cmd
        raise "Override this in subclass!"
      end

      #
      # Execute the script via rake's 'sh'; default is to run with hadoop.
      # Raises if the launched command exits non-zero.
      #
      def run mode = :hadoop
        command, label =
          case mode
          when :local  then [local_cmd, "Local"]
          when :hadoop then [cmd,       "Hadoop"]
          end
        return unless command
        sh command do |res, ok|
          Log.info("Exit status was #{ok}")
          raise "#{label} mode script failed with exit status #{ok}" if ok != 0
        end
      end

    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Swineherd::Script

  #
  # Runner for native Java map-reduce jobs ('hadoop jar ...').
  #
  class HadoopScript
    include Common
    attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars
    attr_writer :hadoop_home

    def initialize *args
      super(*args)
      # Nested options like {:mapred => {:reduce => {:tasks => 0}}} need a
      # hash whose missing keys default to sub-hashes. Merge in whatever
      # options were passed to the constructor instead of discarding them
      # (the previous version silently clobbered them).
      @options = Hash.new { |h, k| h[k] = {} }.merge!(@options || {})
    end

    # Hadoop installation directory used to locate the 'hadoop' launcher.
    # Was previously never defined (NoMethodError when 'cmd' was called);
    # falls back to $HADOOP_HOME when not set explicitly.
    def hadoop_home
      @hadoop_home || ENV['HADOOP_HOME']
    end

    #
    # Converts an arbitrarily nested hash to flattened arguments
    # for passing to java program. For example:
    #
    #   {:mapred => {:reduce => {:tasks => 0}}}
    #
    # will transform to:
    #
    #   '-Dmapred.reduce.tasks=0'
    #
    def java_args args
      to_dotted_args(args).map { |arg| "-D#{arg}" }
    end

    #
    # Uses recursion to take an arbitrarily nested hash and flatten it
    # into dotted 'a.b.c=v' strings. See 'java_args'.
    #
    def to_dotted_args args
      args.map do |k, v|
        if v.is_a?(Hash)
          to_dotted_args(v).map do |s|
            [k, s].join(".")
          end
        else
          "#{k}=#{v}"
        end
      end.flatten
    end

    # Full 'hadoop jar' command line for this job, one argument per line
    # (joined with escaped newlines for readability in logs).
    def cmd
      [
        "HADOOP_CLASSPATH=#{hadoop_classpath}",
        "#{hadoop_home}/bin/hadoop jar #{run_jar}",
        main_class,
        java_args(options),
        "-libjars #{libjars}",
        "#{input.join(',')}",
        "#{output.join(',')}"
      ].flatten.compact.join(" \t\\\n ")
    end

  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Swineherd::Script
  # Runner for Apache Pig scripts, in both local and hadoop mode.
  class PigScript
    include Common

    #
    # Best-effort mapping from Avro type names to Pig type names.
    # Not guaranteeing anything.
    #
    AVRO_PIG_MAPPING = {
      'string' => 'chararray',
      'int'    => 'int',
      'long'   => 'long',
      'float'  => 'float',
      'double' => 'double',
      'bytes'  => 'bytearray',
      'fixed'  => 'bytearray'
    }

    #
    # Simple utility function for mapping avro types to pig types
    # (nil for unknown avro types).
    #
    def self.avro_to_pig avro_type
      AVRO_PIG_MAPPING[avro_type]
    end

    #
    # Convert a generic hash of options {:foo => 'bar'} into
    # command line options for pig '-p FOO=bar'
    #
    def pig_args options
      options.map { |opt, val| "-p #{opt.to_s.upcase}=#{val}" }.join(' ')
    end

    # Command line for running on a laptop, no hadoop required.
    def local_cmd
      Log.info("Launching Pig script in local mode")
      "pig -x local #{pig_args(@options)} #{script}"
    end

    # Command line for running against the configured hadoop cluster.
    def cmd
      Log.info("Launching Pig script in hadoop mode")
      "pig #{pig_args(@options)} #{script}"
    end

  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Swineherd::Script
  # Runner for Wukong (ruby streaming map-reduce) scripts.
  class WukongScript
    include Common

    # Turn an options hash into '--key=value' flags.
    def wukong_args options
      options.map { |param, val| "--#{param}=#{val}" }.join(' ')
    end

    #
    # Don't treat wukong scripts as templates; they run as-is.
    #
    def script
      @source
    end

    # Command line for hadoop mode; requires at least one input.
    def cmd
      raise "No wukong input specified" if input.empty?
      Log.info("Launching Wukong script in hadoop mode")
      "ruby #{script} #{wukong_args(@options)} --run #{input.join(',')} #{output.join(',')}"
    end

    # Command line for local mode; directory inputs are globbed so wukong
    # sees the files inside them.
    def local_cmd
      globbed = input.map { |path| File.directory?(path) ? "#{path}/*" : path }.join(',')
      Log.info("Launching Wukong script in local mode")
      "ruby #{script} #{wukong_args(@options)} --run=local #{globbed} #{output.join(',')}"
    end

  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'erubis'
|
2
|
+
require 'tempfile'
|
3
|
+
|
4
|
+
|
5
|
+
# Template.new(script_path, attributes).substitute!
|
6
|
+
|
7
|
+
module Swineherd

  #
  # Renders an Erubis (eruby) template file into a tempfile and hands back
  # the tempfile's path. Usage:
  #
  #   Template.new(script_path, attributes).substitute!
  #
  class Template
    attr_accessor :source_template, :attributes

    # source_template -- path of the eruby template to render
    # attributes      -- hash of variables made available to the template
    def initialize source_template, attributes
      @source_template = source_template
      @attributes      = attributes
    end

    # Render the template into the destination tempfile; returns the
    # tempfile.
    def compile!
      dest << Erubis::Eruby.new(source).result(attributes)
      dest << "\n"
      dest
    end

    # Render and return the path of the rendered tempfile. Flushes the
    # tempfile first so callers (which typically shell out to the path)
    # see the full rendered content on disk; the previous version never
    # flushed and did a dead 'dest.read' from EOF.
    def substitute!
      compile!
      dest.flush
      dest.path
    end

    protected

    # Raw template text. File.read avoids leaking the file handle that
    # 'File.open(...).read' left open.
    def source
      File.read(source_template)
    end

    # Lazily-created tempfile that receives the rendered output.
    def dest
      @dest ||= Tempfile.new(basename)
    end

    # Tempfile name prefix, taken from the template's basename.
    def basename
      File.basename(source_template)
    end

  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Swineherd
  # A Workflow groups rake tasks under a namespace (the flow id) and keeps
  # track of the output paths each task has produced so far.
  class Workflow
    attr_accessor :workdir, :outputs, :output_counts

    #
    # Create a new workflow and a new rake namespace for it; the given
    # block is instance_eval'd inside that namespace so the tasks it
    # defines are scoped to this flow.
    #
    def initialize flow_id, &blk
      @flow_id       = flow_id
      @output_counts = Hash.new { |h, k| h[k] = 0 }
      @outputs       = Hash.new { |h, k| h[k] = [] }
      namespace(@flow_id) { self.instance_eval(&blk) }
    end

    #
    # Mint the next logical output path for taskname
    # ('<workdir>/<flow_id>/<taskname>-<n>') by incrementing an internal
    # counter, and return it.
    #
    def next_output taskname
      raise "No working directory specified." unless @workdir
      @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}"
      @output_counts[taskname] += 1
      latest_output(taskname)
    end

    #
    # Most recently minted output path for taskname.
    #
    def latest_output taskname
      @outputs[taskname].last
    end

    #
    # Runs the workflow starting with taskname by invoking the rake task
    # '<flow_id>:<taskname>'.
    #
    def run taskname
      Log.info "Launching workflow task #{@flow_id}:#{taskname} ..."
      Rake::Task["#{@flow_id}:#{taskname}"].invoke
      Log.info "Workflow task #{@flow_id}:#{taskname} finished"
    end

    #
    # Logs the dependency tree of all rake tasks belonging to this flow.
    #
    def describe
      Rake::Task.tasks.each do |t|
        Log.info("Task: #{t.name} [#{t.inspect}]") if t.name =~ /#{@flow_id}/
      end
    end

  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Swineherd

  #
  # A Job is, at its core, a rake task: a DSL block configures it, and it
  # is then compiled into a rake task with optional dependencies.
  #
  class Job

    #
    # Capture settings from the block, then create the rake task and wire
    # up its dependencies.
    #
    def initialize job_id, &blk
      @job_id       = job_id
      @name         = ''
      @dependencies = []
      @script       = ''
      self.instance_eval(&blk)
      raketask
      handle_dependencies
    end

    #
    # Combined getter/setter; will be the name of the rake task.
    #
    def name name = nil
      name ? @name = name : @name
    end

    # Combined getter/setter for the script object this job runs.
    def script script = nil
      script ? @script = script : @script
    end

    #
    # Combined getter/setter; an array of job names as dependencies.
    #
    def dependencies dependencies = nil
      dependencies ? @dependencies = dependencies : @dependencies
    end

    # Declare the rake dependency edge, if any dependencies were given.
    def handle_dependencies
      task(name => dependencies) unless dependencies.empty?
    end

    # Command line of the underlying script.
    def cmd
      @script.cmd
    end

    #
    # Every job is compiled into a rake task that runs its script.
    #
    def raketask
      task name do
        @script.run
      end
    end
  end
end
|
data/notes.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Logging:
|
2
|
+
|
3
|
+
1. All output from the launched workflow should go to a workflow log file
|
4
|
+
2. Hadoop output is special and should be pulled down from the jobtracker
|
5
|
+
- jobconf.xml
|
6
|
+
- job details page
|
7
|
+
|
8
|
+
Workflow should specify a logdir, defaults to workdir + '/logs'
|
9
|
+
|
10
|
+
Fetching hadoop job stats:
|
11
|
+
|
12
|
+
1. Get job id
|
13
|
+
2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/"
|
14
|
+
3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....)
|
15
|
+
4. Fetch the two urls we care about and dump into the workflow's log dir.
|
16
|
+
5. Possibly parse the results into an ongoing workflow-statistics.tsv file
|
17
|
+
|
18
|
+
Other output:
|
19
|
+
|
20
|
+
Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well.
|
data/swineherd.gemspec
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
  s.name    = "swineherd"
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors     = ["Jacob Perkins"]
  s.date        = "2011-04-20"
  s.description = "Swineherd is for running scripts and workflows on filesystems."
  s.email       = "jacob.a.perkins@gmail.com"
  s.executables = ["hdp-tree", "hadoop-stream"]

  s.extra_rdoc_files = ["LICENSE", "README.textile"]

  s.files = %w[
    LICENSE
    README.textile
    Rakefile
    VERSION
    bin/hadoop-stream
    bin/hdp-tree
    examples/pagerank/data/seinfeld_network.tsv
    examples/pagerank/pagerank.rb
    examples/pagerank/scripts/cut_off_list.rb
    examples/pagerank/scripts/histogram.R
    examples/pagerank/scripts/pagerank.pig
    examples/pagerank/scripts/pagerank_initialize.pig
    lib/swineherd.rb
    lib/swineherd/filesystem.rb
    lib/swineherd/filesystem/README_filesystem.textile
    lib/swineherd/filesystem/basefilesystem.rb
    lib/swineherd/filesystem/filesystems.rb
    lib/swineherd/filesystem/hadoopfilesystem.rb
    lib/swineherd/filesystem/localfilesystem.rb
    lib/swineherd/filesystem/localfs.rb
    lib/swineherd/filesystem/s3filesystem.rb
    lib/swineherd/script.rb
    lib/swineherd/script/hadoop_script.rb
    lib/swineherd/script/pig_script.rb
    lib/swineherd/script/r_script.rb
    lib/swineherd/script/wukong_script.rb
    lib/swineherd/template.rb
    lib/swineherd/workflow.rb
    lib/swineherd/workflow/job.rb
    notes.txt
    swineherd.gemspec
    tests/test_filesystem.rb
    tests/test_s3_filesystem.rb
    tests/testcfg.yaml
  ]

  s.homepage         = "http://github.com/Ganglion/swineherd"
  s.licenses         = ["MIT"]
  s.require_paths    = ["lib"]
  s.rubygems_version = "1.3.7"
  s.summary          = "Flexible data workflow glue."
  s.test_files = [
    "examples/pagerank/pagerank.rb",
    "examples/pagerank/scripts/cut_off_list.rb"
  ]

  # Declare dependencies once; declaration order matches the generated
  # original (development deps first, then runtime deps).
  dev_deps = [
    ["yard",    ["~> 0.6.0"]],
    ["jeweler", ["~> 1.5.2"]],
    ["rcov",    [">= 0"]]
  ]
  runtime_deps = [
    ["configliere", [">= 0"]],
    ["gorillib",    [">= 0"]],
    ["erubis",      [">= 0"]],
    ["right_aws",   [">= 0"]]
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      dev_deps.each     { |name, req| s.add_development_dependency(name, req) }
      runtime_deps.each { |name, req| s.add_runtime_dependency(name, req) }
    else
      # Rubygems too old to distinguish dependency types.
      (dev_deps + runtime_deps).each { |name, req| s.add_dependency(name, req) }
    end
  else
    # Very old rubygems: flat dependency list only.
    (dev_deps + runtime_deps).each { |name, req| s.add_dependency(name, req) }
  end
end
|
97
|
+
|