RubyGems - swineherd - Versions diffs - 0.0.1 - Mend

swineherd 0.0.1

Files changed (35) hide show

data/LICENSE +201 -0
data/README.textile +207 -0
data/Rakefile +30 -0
data/VERSION +1 -0
data/bin/hadoop-stream +35 -0
data/bin/hdp-tree +26 -0
data/examples/pagerank/data/seinfeld_network.tsv +429 -0
data/examples/pagerank/pagerank.rb +99 -0
data/examples/pagerank/scripts/cut_off_list.rb +16 -0
data/examples/pagerank/scripts/histogram.R +5 -0
data/examples/pagerank/scripts/pagerank.pig +20 -0
data/examples/pagerank/scripts/pagerank_initialize.pig +24 -0
data/lib/swineherd.rb +11 -0
data/lib/swineherd/filesystem.rb +26 -0
data/lib/swineherd/filesystem/README_filesystem.textile +47 -0
data/lib/swineherd/filesystem/basefilesystem.rb +125 -0
data/lib/swineherd/filesystem/filesystems.rb +103 -0
data/lib/swineherd/filesystem/hadoopfilesystem.rb +263 -0
data/lib/swineherd/filesystem/localfilesystem.rb +83 -0
data/lib/swineherd/filesystem/localfs.rb +11 -0
data/lib/swineherd/filesystem/s3filesystem.rb +249 -0
data/lib/swineherd/script.rb +74 -0
data/lib/swineherd/script/hadoop_script.rb +59 -0
data/lib/swineherd/script/pig_script.rb +46 -0
data/lib/swineherd/script/r_script.rb +14 -0
data/lib/swineherd/script/wukong_script.rb +31 -0
data/lib/swineherd/template.rb +45 -0
data/lib/swineherd/workflow.rb +53 -0
data/lib/swineherd/workflow/job.rb +60 -0
data/notes.txt +20 -0
data/swineherd.gemspec +97 -0
data/tests/test_filesystem.rb +105 -0
data/tests/test_s3_filesystem.rb +132 -0
data/tests/testcfg.yaml +7 -0
metadata +204 -0

@@ -0,0 +1,74 @@
+module Swineherd
+  module Script
+    autoload :WukongScript, 'swineherd/script/wukong_script'
+    autoload :PigScript,    'swineherd/script/pig_script'
+    autoload :RScript,      'swineherd/script/r_script'
+    module Common
+      attr_accessor :input, :output, :options, :attributes
+      def initialize(source, input = [], output = [], options = {}, attributes ={})
+        @source     = source
+        @input      = input
+        @output     = output
+        @options    = options
+        @attributes = attributes
+      end
+      #
+      # Allows for setting the environment the script will be ran in
+      #
+      def env
+        ENV
+      end
+      def script
+        @script ||= Template.new(@source, @attributes).substitute!
+      end
+      #
+      # So we can reuse ourselves
+      #
+      def refresh!
+        @script = nil
+        @output = []
+        @input  = []
+      end
+      #
+      # This depends on the type of script
+      #
+      def cmd
+        raise "Override this in subclass!"
+      end
+      #
+      # Override this in subclass to decide how script runs in 'local' mode
+      # Best practice is that it needs to be able to run on a laptop w/o
+      # hadoop.
+      #
+      def local_cmd
+        raise "Override this in subclass!"
+      end
+      #
+      # Default is to run with hadoop
+      #
+      def run mode=:hadoop
+        case mode
+        when :local then
+          sh local_cmd do |res, ok|
+            Log.info("Exit status was #{ok}")
+            raise "Local mode script failed with exit status #{ok}" if ok != 0
+          end
+        when :hadoop then
+          sh cmd do |res, ok|
+            Log.info("Exit status was #{ok}")
+            raise "Hadoop mode script failed with exit status #{ok}" if ok != 0
+          end
+        end
+      end
+    end
+  end
+end

data/lib/swineherd/script/hadoop_script.rb ADDED

@@ -0,0 +1,59 @@
+module Swineherd::Script
+  #
+  # native Java map-reduce
+  #
+  class HadoopScript
+    include Common
+    attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars
+    def initialize *args
+      super(*args)
+      @options = Hash.new{|h,k| h[k] = {}} # need to support nested options for this
+    end
+    #
+    # Converts an arbitrarily nested hash to flattened arguments
+    # for passing to java program. For example:
+    #
+    # {:mapred => {:reduce => {:tasks => 0}}}
+    #
+    # will transform to:
+    #
+    # '-Dmapred.reduce.tasks=0'
+    #
+    def java_args args
+      to_dotted_args(args).map{|arg| "-D#{arg}"}
+    end
+    #
+    # Uses recursion to take an arbitrarily nested hash and
+    # flatten it into dotted args. See 'to_java_args'. Can
+    # you do it any better?
+    #
+    def to_dotted_args args
+      args.map do |k,v|
+        if v.is_a?(Hash)
+          to_dotted_args(v).map do |s|
+            [k,s].join(".")
+          end
+        else
+          "#{k}=#{v}"
+        end
+      end.flatten
+    end
+    def cmd
+      [
+        "HADOOP_CLASSPATH=#{hadoop_classpath}",
+        "#{hadoop_home}/bin/hadoop jar #{run_jar}",
+        main_class,
+        java_args(options),
+        "-libjars #{libjars}",
+        "#{input.join(',')}",
+        "#{output.join(',')}"
+      ].flatten.compact.join(" \t\\\n  ")
+    end
+  end
+end

data/lib/swineherd/script/pig_script.rb ADDED

@@ -0,0 +1,46 @@
+module Swineherd::Script
+  class PigScript
+    include Common
+    #
+    # Not guaranteeing anything.
+    #
+    AVRO_PIG_MAPPING = {
+      'string' => 'chararray',
+      'int'    => 'int',
+      'long'   => 'long',
+      'float'  => 'float',
+      'double' => 'double',
+      'bytes'  => 'bytearray',
+      'fixed'  => 'bytearray'
+    }
+    #
+    # Simple utility function for mapping avro types to pig types
+    #
+    def self.avro_to_pig avro_type
+      AVRO_PIG_MAPPING[avro_type]
+    end
+    #
+    # Convert a generic hash of options {:foo => 'bar'} into
+    # command line options for pig '-p FOO=bar'
+    #
+    def pig_args options
+      options.map{|opt,val| "-p #{opt.to_s.upcase}=#{val}" }.join(' ')
+    end
+    def local_cmd
+      Log.info("Launching Pig script in local mode")
+      "pig -x local #{pig_args(@options)} #{script}"
+    end
+    def cmd
+      Log.info("Launching Pig script in hadoop mode")
+      "pig #{pig_args(@options)} #{script}"
+    end
+  end
+end

data/lib/swineherd/script/r_script.rb ADDED

@@ -0,0 +1,14 @@
+module Swineherd::Script
+  class RScript
+    include Common
+    def local_cmd
+      "/usr/bin/Rscript --vanilla #{script}"
+    end
+    def cmd
+      local_cmd
+    end
+  end
+end

data/lib/swineherd/script/wukong_script.rb ADDED

@@ -0,0 +1,31 @@
+require 'pathname'
+module Swineherd::Script
+  class WukongScript
+    include Common
+    def wukong_args options
+      options.map{|param,val| "--#{param}=#{val}" }.join(' ')
+    end
+    #
+    # Don't treat wukong scripts as templates
+    #
+    def script
+      @source
+    end
+    def cmd
+      raise "No wukong input specified" if input.empty?
+      Log.info("Launching Wukong script in hadoop mode")
+      "ruby #{script} #{wukong_args(@options)} --run #{input.join(',')} #{output.join(',')}"
+    end
+    def local_cmd
+      inputs = input.map{|path| path += File.directory?(path) ? "/*" : ""}.join(',')
+      Log.info("Launching Wukong script in local mode")
+      "ruby #{script} #{wukong_args(@options)} --run=local #{inputs} #{output.join(',')}"
+    end
+  end
+end

data/lib/swineherd/template.rb ADDED

@@ -0,0 +1,45 @@
+require 'erubis'
+require 'tempfile'
+# Template.new(script_path, attributes).substitute!
+module Swineherd
+  class Template
+    attr_accessor :source_template, :attributes
+    def initialize source_template, attributes
+      @source_template = source_template
+      @attributes      = attributes
+    end
+    def compile!
+      dest << Erubis::Eruby.new(source).result(attributes)
+      dest << "\n"
+      dest
+    end
+    def substitute!
+      compile!
+      dest.read
+      dest.path
+    end
+    protected
+    def source
+      File.open(source_template).read
+    end
+    def dest
+      return @dest if @dest
+      @dest ||= Tempfile.new(basename)
+    end
+    def basename
+      File.basename(source_template)
+    end
+  end
+end

data/lib/swineherd/workflow.rb ADDED

@@ -0,0 +1,53 @@
+module Swineherd
+  class Workflow
+    attr_accessor :workdir, :outputs, :output_counts
+    #
+    # Create a new workflow and new namespace for this workflow
+    #
+    def initialize flow_id, &blk
+      @flow_id = flow_id
+      @output_counts = Hash.new{|h,k| h[k] = 0}
+      @outputs       = Hash.new{|h,k| h[k] = []}
+      namespace @flow_id do
+        self.instance_eval(&blk)
+      end
+    end
+    #
+    # Get next logical output of taskname by incrementing internal counter
+    #
+    def next_output taskname
+      raise "No working directory specified." unless @workdir
+      @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}"
+      @output_counts[taskname] += 1
+      latest_output(taskname)
+    end
+    #
+    # Get latest output of taskname
+    #
+    def latest_output taskname
+      @outputs[taskname].last
+    end
+    #
+    # Runs workflow starting with taskname
+    #
+    def run taskname
+      Log.info "Launching workflow task #{@flow_id}:#{taskname} ..."
+      Rake::Task["#{@flow_id}:#{taskname}"].invoke
+      Log.info "Workflow task #{@flow_id}:#{taskname} finished"
+    end
+    #
+    # Describes the dependency tree of all tasks belonging to self
+    #
+    def describe
+      Rake::Task.tasks.each do |t|
+        Log.info("Task: #{t.name} [#{t.inspect}]") if t.name =~ /#{@flow_id}/
+      end
+    end
+  end
+end

data/lib/swineherd/workflow/job.rb ADDED

@@ -0,0 +1,60 @@
+module Swineherd
+  #
+  # Job class is at its core a rake task
+  #
+  class Job
+    #
+    # Initialize job, fill variables, and create rake task
+    #
+    def initialize job_id, &blk
+      @job_id       = job_id
+      @name         = ''
+      @dependencies = []
+      @script       = ''
+      self.instance_eval(&blk)
+      raketask
+      handle_dependencies
+    end
+    #
+    # Will be the name of the rake task
+    #
+    def name name = nil
+      return @name unless name
+      @name = name
+    end
+    def script script = nil
+      return @script unless script
+      @script = script
+    end
+    #
+    # An array of job names as dependencies
+    #
+    def dependencies dependencies = nil
+      return @dependencies unless dependencies
+      @dependencies = dependencies
+    end
+    def handle_dependencies
+      return if dependencies.empty?
+      task name => dependencies
+    end
+    def cmd
+      @script.cmd
+    end
+    #
+    # Every job is compiled into a rake task
+    #
+    def raketask
+      task name do
+        @script.run
+      end
+    end
+  end
+end

data/notes.txt ADDED

@@ -0,0 +1,20 @@
+Logging:
+1. All output from the launched workflow should go to a workflow log file
+2. Hadoop output is special and should be pulled down from the jobtracker
+   - jobconf.xml
+   - job details page
+Workflow should specify a logdir, defaults to workdir + '/logs'
+Fetching hadoop job stats:
+1. Get job id
+2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/"
+3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....)
+4. Fetch the two urls we care about and dump into the workflow's log dir.
+5. Possibly parse the results into an ongoing workflow-statistics.tsv file
+Other output:
+Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well.

data/swineherd.gemspec ADDED

@@ -0,0 +1,97 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+# Gem specification for swineherd 0.0.1: metadata, the list of packaged
+# files, and the gem's dependencies.
+Gem::Specification.new do |s|
+  s.name = %q{swineherd}
+  s.version = "0.0.1"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Jacob Perkins"]
+  s.date = %q{2011-04-20}
+  s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
+  s.email = %q{jacob.a.perkins@gmail.com}
+  s.executables = ["hdp-tree", "hadoop-stream"]
+  s.extra_rdoc_files = [
+    "LICENSE",
+    "README.textile"
+  ]
+  # Every file shipped in the packaged gem.
+  s.files = [
+    "LICENSE",
+    "README.textile",
+    "Rakefile",
+    "VERSION",
+    "bin/hadoop-stream",
+    "bin/hdp-tree",
+    "examples/pagerank/data/seinfeld_network.tsv",
+    "examples/pagerank/pagerank.rb",
+    "examples/pagerank/scripts/cut_off_list.rb",
+    "examples/pagerank/scripts/histogram.R",
+    "examples/pagerank/scripts/pagerank.pig",
+    "examples/pagerank/scripts/pagerank_initialize.pig",
+    "lib/swineherd.rb",
+    "lib/swineherd/filesystem.rb",
+    "lib/swineherd/filesystem/README_filesystem.textile",
+    "lib/swineherd/filesystem/basefilesystem.rb",
+    "lib/swineherd/filesystem/filesystems.rb",
+    "lib/swineherd/filesystem/hadoopfilesystem.rb",
+    "lib/swineherd/filesystem/localfilesystem.rb",
+    "lib/swineherd/filesystem/localfs.rb",
+    "lib/swineherd/filesystem/s3filesystem.rb",
+    "lib/swineherd/script.rb",
+    "lib/swineherd/script/hadoop_script.rb",
+    "lib/swineherd/script/pig_script.rb",
+    "lib/swineherd/script/r_script.rb",
+    "lib/swineherd/script/wukong_script.rb",
+    "lib/swineherd/template.rb",
+    "lib/swineherd/workflow.rb",
+    "lib/swineherd/workflow/job.rb",
+    "notes.txt",
+    "swineherd.gemspec",
+    "tests/test_filesystem.rb",
+    "tests/test_s3_filesystem.rb",
+    "tests/testcfg.yaml"
+  ]
+  s.homepage = %q{http://github.com/Ganglion/swineherd}
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.7}
+  s.summary = %q{Flexible data workflow glue.}
+  # NOTE(review): these are the pagerank example scripts, not the files
+  # under tests/ listed in s.files above -- confirm this is intended.
+  s.test_files = [
+    "examples/pagerank/pagerank.rb",
+    "examples/pagerank/scripts/cut_off_list.rb"
+  ]
+  # The same seven dependencies are declared in each of the three branches
+  # below. Only the first branch (specification_version is supported and
+  # RubyGems is at least 1.2.0) separates development from runtime
+  # dependencies; the other two declare everything with add_dependency.
+  if s.respond_to? :specification_version then
+    # NOTE: current_version is assigned but not read anywhere in this file.
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<yard>, ["~> 0.6.0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_development_dependency(%q<rcov>, [">= 0"])
+      s.add_runtime_dependency(%q<configliere>, [">= 0"])
+      s.add_runtime_dependency(%q<gorillib>, [">= 0"])
+      s.add_runtime_dependency(%q<erubis>, [">= 0"])
+      s.add_runtime_dependency(%q<right_aws>, [">= 0"])
+    else
+      s.add_dependency(%q<yard>, ["~> 0.6.0"])
+      s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_dependency(%q<rcov>, [">= 0"])
+      s.add_dependency(%q<configliere>, [">= 0"])
+      s.add_dependency(%q<gorillib>, [">= 0"])
+      s.add_dependency(%q<erubis>, [">= 0"])
+      s.add_dependency(%q<right_aws>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<yard>, ["~> 0.6.0"])
+    s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+    s.add_dependency(%q<rcov>, [">= 0"])
+    s.add_dependency(%q<configliere>, [">= 0"])
+    s.add_dependency(%q<gorillib>, [">= 0"])
+    s.add_dependency(%q<erubis>, [">= 0"])
+    s.add_dependency(%q<right_aws>, [">= 0"])
+  end
+end