swineherd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
module Swineherd
  module Script

    autoload :WukongScript, 'swineherd/script/wukong_script'
    autoload :PigScript,    'swineherd/script/pig_script'
    autoload :RScript,      'swineherd/script/r_script'
    autoload :HadoopScript, 'swineherd/script/hadoop_script'

    #
    # Behaviour shared by every script type. Including classes are
    # expected to override 'cmd' and 'local_cmd'; 'run' then hands the
    # resulting command line to 'sh'.
    #
    module Common
      attr_accessor :input, :output, :options, :attributes

      #
      # source     - path of the script (or script template) to run
      # input      - list of input paths
      # output     - list of output paths
      # options    - options handed to the script on the command line
      # attributes - values handed to the template when rendering 'source'
      #
      def initialize(source, input = [], output = [], options = {}, attributes ={})
        @source     = source
        @input      = input
        @output     = output
        @options    = options
        @attributes = attributes
      end

      #
      # Allows for setting the environment the script will be run in
      #
      def env
        ENV
      end

      #
      # Path of the rendered script. The template is rendered once and
      # the result is cached until 'refresh!' is called.
      #
      def script
        @script ||= Template.new(@source, @attributes).substitute!
      end

      #
      # So we can reuse ourselves: drops the rendered script and clears
      # the input and output lists.
      #
      def refresh!
        @script = nil
        @output = []
        @input  = []
      end

      #
      # This depends on the type of script
      #
      def cmd
        raise "Override this in subclass!"
      end

      #
      # Override this in subclass to decide how script runs in 'local' mode
      # Best practice is that it needs to be able to run on a laptop w/o
      # hadoop.
      #
      def local_cmd
        raise "Override this in subclass!"
      end

      #
      # Runs the script with 'sh'. Default is to run with hadoop.
      #
      # mode - :hadoop (uses 'cmd') or :local (uses 'local_cmd')
      #
      # Raises ArgumentError for any other mode and RuntimeError when the
      # command does not succeed.
      #
      def run mode=:hadoop
        command, label = case mode
                         when :local  then [local_cmd, "Local"]
                         when :hadoop then [cmd, "Hadoop"]
                         else
                           raise ArgumentError, "Unknown run mode #{mode.inspect}, expected :local or :hadoop"
                         end
        # Rake's sh yields (ok, status): a success flag and the Process::Status
        sh command do |ok, status|
          Log.info("Exit status was #{status.exitstatus}")
          raise "#{label} mode script failed with exit status #{status.exitstatus}" unless ok
        end
      end

    end
  end
end
@@ -0,0 +1,59 @@
1
module Swineherd::Script

  #
  # native Java map-reduce
  #
  class HadoopScript
    include Common
    attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars
    attr_writer   :hadoop_home

    #
    # Takes the same arguments as Common#initialize. Options are kept in
    # a hash whose missing keys default to a new empty hash, so nested
    # options can be set directly; options passed in by the caller are
    # copied into it.
    #
    def initialize *args
      super(*args)
      nested = Hash.new{|h,k| h[k] = {}} # need to support nested options for this
      @options = nested.update(@options || {})
    end

    #
    # Hadoop installation directory used to locate bin/hadoop. Uses the
    # value assigned through 'hadoop_home=' when there is one, otherwise
    # any 'hadoop_home' defined further up the ancestor chain, otherwise
    # the HADOOP_HOME environment variable.
    # NOTE(review): the original called 'hadoop_home' without defining it
    # in this file - confirm where it was expected to come from.
    #
    def hadoop_home
      return @hadoop_home if @hadoop_home
      defined?(super) ? super : ENV['HADOOP_HOME']
    end

    #
    # Converts an arbitrarily nested hash to flattened arguments
    # for passing to java program. For example:
    #
    # {:mapred => {:reduce => {:tasks => 0}}}
    #
    # will transform to:
    #
    # '-Dmapred.reduce.tasks=0'
    #
    def java_args args
      to_dotted_args(args).map{|arg| "-D#{arg}"}
    end

    #
    # Uses recursion to take an arbitrarily nested hash and
    # flatten it into dotted args. See 'java_args'. Returns an
    # array of 'dotted.key=value' strings.
    #
    def to_dotted_args args
      args.map do |k,v|
        if v.is_a?(Hash)
          to_dotted_args(v).map do |s|
            [k,s].join(".")
          end
        else
          "#{k}=#{v}"
        end
      end.flatten
    end

    #
    # Builds the 'hadoop jar' command line. The classpath prefix and the
    # -libjars flag are only emitted when they have been set, so an unset
    # value never leaves a dangling flag in front of the input paths.
    #
    def cmd
      [
        (hadoop_classpath ? "HADOOP_CLASSPATH=#{hadoop_classpath}" : nil),
        "#{hadoop_home}/bin/hadoop jar #{run_jar}",
        main_class,
        java_args(options),
        (libjars ? "-libjars #{libjars}" : nil),
        "#{input.join(',')}",
        "#{output.join(',')}"
      ].flatten.compact.join(" \t\\\n ")
    end

  end
end
@@ -0,0 +1,46 @@
1
module Swineherd::Script
  class PigScript
    include Common

    #
    # Avro type name => Pig type name. Not guaranteeing anything.
    #
    AVRO_PIG_MAPPING = {
      'string' => 'chararray',
      'int'    => 'int',
      'long'   => 'long',
      'float'  => 'float',
      'double' => 'double',
      'bytes'  => 'bytearray',
      'fixed'  => 'bytearray'
    }

    class << self
      #
      # Looks up the pig type for an avro type name; nil when the
      # name is not in AVRO_PIG_MAPPING.
      #
      def avro_to_pig avro_type
        AVRO_PIG_MAPPING[avro_type]
      end
    end

    #
    # Turns a hash of options such as {:foo => 'bar'} into pig
    # parameter flags: '-p FOO=bar', space separated.
    #
    def pig_args options
      params = options.map do |opt, val|
        "-p #{opt.to_s.upcase}=#{val}"
      end
      params.join(' ')
    end

    #
    # Command line for running the script with 'pig -x local'
    #
    def local_cmd
      Log.info("Launching Pig script in local mode")
      params = pig_args(@options)
      "pig -x local #{params} #{script}"
    end

    #
    # Command line for running the script with plain 'pig'
    #
    def cmd
      Log.info("Launching Pig script in hadoop mode")
      params = pig_args(@options)
      "pig #{params} #{script}"
    end

  end
end
@@ -0,0 +1,14 @@
1
module Swineherd::Script
  class RScript
    include Common

    #
    # Same command line as local mode
    #
    def cmd
      local_cmd
    end

    #
    # Runs the rendered script through /usr/bin/Rscript with --vanilla
    #
    def local_cmd
      rendered = script
      "/usr/bin/Rscript --vanilla #{rendered}"
    end

  end
end
@@ -0,0 +1,31 @@
1
+ require 'pathname'
2
+
3
module Swineherd::Script
  class WukongScript
    include Common

    #
    # Convert a hash of options {:foo => 'bar'} into wukong
    # command line flags '--foo=bar', space separated.
    #
    def wukong_args options
      options.map{|param,val| "--#{param}=#{val}" }.join(' ')
    end

    #
    # Don't treat wukong scripts as templates
    #
    def script
      @source
    end

    #
    # Command line for hadoop mode. Raises when no input is given.
    #
    def cmd
      raise "No wukong input specified" if input.empty?
      Log.info("Launching Wukong script in hadoop mode")
      "ruby #{script} #{wukong_args(@options)} --run #{input.join(',')} #{output.join(',')}"
    end

    #
    # Command line for local mode. Directories among the inputs are
    # given a trailing '/*'. Raises when no input is given, same as
    # 'cmd': with an empty input list the output path would otherwise
    # end up in the input position of the command line.
    #
    def local_cmd
      raise "No wukong input specified" if input.empty?
      inputs = input.map{|path| File.directory?(path) ? "#{path}/*" : path }.join(',')
      Log.info("Launching Wukong script in local mode")
      "ruby #{script} #{wukong_args(@options)} --run=local #{inputs} #{output.join(',')}"
    end

  end
end
@@ -0,0 +1,45 @@
1
+ require 'erubis'
2
+ require 'tempfile'
3
+
4
+
5
+ # Template.new(script_path, attributes).substitute!
6
+
7
module Swineherd

  #
  # Renders an erubis template file into a tempfile.
  #
  #   Template.new(script_path, attributes).substitute!  # => path of rendered file
  #
  class Template
    attr_accessor :source_template, :attributes

    #
    # source_template - path of the template file
    # attributes      - context handed to erubis when rendering
    #
    def initialize source_template, attributes
      @source_template = source_template
      @attributes      = attributes
    end

    #
    # Renders the template into the destination tempfile, followed by a
    # newline, and returns the tempfile. The file is emptied first so
    # that compiling twice does not append a second copy, and flushed
    # afterwards so the content is on disk for whoever opens the path.
    #
    def compile!
      dest.rewind
      dest.truncate(0)
      dest << Erubis::Eruby.new(source).result(attributes)
      dest << "\n"
      dest.flush
      dest
    end

    #
    # Renders the template and returns the path of the rendered file.
    #
    def substitute!
      compile!
      dest.path
    end

    protected

    # Contents of the template file
    def source
      File.read(source_template)
    end

    # Destination tempfile, created on first use and named after the template
    def dest
      @dest ||= Tempfile.new(basename)
    end

    def basename
      File.basename(source_template)
    end

  end
end
@@ -0,0 +1,53 @@
1
module Swineherd
  class Workflow
    attr_accessor :workdir, :outputs, :output_counts

    #
    # Create a new workflow and new namespace for this workflow. The
    # block, when given, is evaluated against the workflow inside that
    # namespace.
    #
    def initialize flow_id, &blk
      @flow_id       = flow_id
      @output_counts = Hash.new{|h,k| h[k] = 0}
      @outputs       = Hash.new{|h,k| h[k] = []}
      namespace @flow_id do
        self.instance_eval(&blk) if blk
      end
    end

    #
    # Get next logical output of taskname by incrementing internal counter.
    # Raises when no working directory has been set.
    #
    def next_output taskname
      raise "No working directory specified." unless @workdir
      @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}"
      @output_counts[taskname] += 1
      latest_output(taskname)
    end

    #
    # Get latest output of taskname
    #
    def latest_output taskname
      @outputs[taskname].last
    end

    #
    # Runs workflow starting with taskname
    #
    def run taskname
      Log.info "Launching workflow task #{@flow_id}:#{taskname} ..."
      Rake::Task["#{@flow_id}:#{taskname}"].invoke
      Log.info "Workflow task #{@flow_id}:#{taskname} finished"
    end

    #
    # Describes the dependency tree of all tasks belonging to self, i.e.
    # the tasks whose name starts with "<flow_id>:". A plain prefix
    # comparison is used so that a flow id is never read as a regular
    # expression and never matches inside another namespace's name.
    #
    def describe
      prefix = "#{@flow_id}:"
      Rake::Task.tasks.each do |t|
        Log.info("Task: #{t.name} [#{t.inspect}]") if t.name.to_s.index(prefix) == 0
      end
    end

  end
end
@@ -0,0 +1,60 @@
1
module Swineherd

  #
  # A job is, underneath, a rake task
  #
  class Job

    #
    # Sets defaults, evaluates the block against the job to fill them
    # in, then registers the rake task and its dependencies.
    #
    def initialize job_id, &blk
      @job_id, @name, @dependencies, @script = job_id, '', [], ''
      self.instance_eval(&blk)
      raketask
      handle_dependencies
    end

    #
    # Name of the rake task. Sets it when an argument is given,
    # returns the current value either way.
    #
    def name name = nil
      @name = name if name
      @name
    end

    #
    # Script object the job runs. Sets it when an argument is given,
    # returns the current value either way.
    #
    def script script = nil
      @script = script if script
      @script
    end

    #
    # Array of job names this job depends on. Sets it when an argument
    # is given, returns the current value either way.
    #
    def dependencies dependencies = nil
      @dependencies = dependencies if dependencies
      @dependencies
    end

    #
    # Declares the dependencies on the rake task, if there are any
    #
    def handle_dependencies
      unless dependencies.empty?
        task name => dependencies
      end
    end

    #
    # Command of the underlying script
    #
    def cmd
      @script.cmd
    end

    #
    # Compiles the job into a rake task that runs the script
    #
    def raketask
      task name do
        @script.run
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ Logging:
2
+
3
+ 1. All output from the launched workflow should go to a workflow log file
4
+ 2. Hadoop output is special and should be pulled down from the jobtracker
5
+ - jobconf.xml
6
+ - job details page
7
+
8
+ Workflow should specify a logdir, defaults to workdir + '/logs'
9
+
10
+ Fetching hadoop job stats:
11
+
12
+ 1. Get job id
13
+ 2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/"
14
+ 3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....)
15
+ 4. Fetch the two urls we care about and dump into the workflow's log dir.
16
+ 5. Possibly parse the results into an ongoing workflow-statistics.tsv file
17
+
18
+ Other output:
19
+
20
+ Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well.
@@ -0,0 +1,97 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{swineherd}
8
+ s.version = "0.0.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Jacob Perkins"]
12
+ s.date = %q{2011-04-20}
13
+ s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
14
+ s.email = %q{jacob.a.perkins@gmail.com}
15
+ s.executables = ["hdp-tree", "hadoop-stream"]
16
+ s.extra_rdoc_files = [
17
+ "LICENSE",
18
+ "README.textile"
19
+ ]
20
+ s.files = [
21
+ "LICENSE",
22
+ "README.textile",
23
+ "Rakefile",
24
+ "VERSION",
25
+ "bin/hadoop-stream",
26
+ "bin/hdp-tree",
27
+ "examples/pagerank/data/seinfeld_network.tsv",
28
+ "examples/pagerank/pagerank.rb",
29
+ "examples/pagerank/scripts/cut_off_list.rb",
30
+ "examples/pagerank/scripts/histogram.R",
31
+ "examples/pagerank/scripts/pagerank.pig",
32
+ "examples/pagerank/scripts/pagerank_initialize.pig",
33
+ "lib/swineherd.rb",
34
+ "lib/swineherd/filesystem.rb",
35
+ "lib/swineherd/filesystem/README_filesystem.textile",
36
+ "lib/swineherd/filesystem/basefilesystem.rb",
37
+ "lib/swineherd/filesystem/filesystems.rb",
38
+ "lib/swineherd/filesystem/hadoopfilesystem.rb",
39
+ "lib/swineherd/filesystem/localfilesystem.rb",
40
+ "lib/swineherd/filesystem/localfs.rb",
41
+ "lib/swineherd/filesystem/s3filesystem.rb",
42
+ "lib/swineherd/script.rb",
43
+ "lib/swineherd/script/hadoop_script.rb",
44
+ "lib/swineherd/script/pig_script.rb",
45
+ "lib/swineherd/script/r_script.rb",
46
+ "lib/swineherd/script/wukong_script.rb",
47
+ "lib/swineherd/template.rb",
48
+ "lib/swineherd/workflow.rb",
49
+ "lib/swineherd/workflow/job.rb",
50
+ "notes.txt",
51
+ "swineherd.gemspec",
52
+ "tests/test_filesystem.rb",
53
+ "tests/test_s3_filesystem.rb",
54
+ "tests/testcfg.yaml"
55
+ ]
56
+ s.homepage = %q{http://github.com/Ganglion/swineherd}
57
+ s.licenses = ["MIT"]
58
+ s.require_paths = ["lib"]
59
+ s.rubygems_version = %q{1.3.7}
60
+ s.summary = %q{Flexible data workflow glue.}
61
+ s.test_files = [
62
+ "examples/pagerank/pagerank.rb",
63
+ "examples/pagerank/scripts/cut_off_list.rb"
64
+ ]
65
+
66
+ if s.respond_to? :specification_version then
67
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
68
+ s.specification_version = 3
69
+
70
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
71
+ s.add_development_dependency(%q<yard>, ["~> 0.6.0"])
72
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
73
+ s.add_development_dependency(%q<rcov>, [">= 0"])
74
+ s.add_runtime_dependency(%q<configliere>, [">= 0"])
75
+ s.add_runtime_dependency(%q<gorillib>, [">= 0"])
76
+ s.add_runtime_dependency(%q<erubis>, [">= 0"])
77
+ s.add_runtime_dependency(%q<right_aws>, [">= 0"])
78
+ else
79
+ s.add_dependency(%q<yard>, ["~> 0.6.0"])
80
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
81
+ s.add_dependency(%q<rcov>, [">= 0"])
82
+ s.add_dependency(%q<configliere>, [">= 0"])
83
+ s.add_dependency(%q<gorillib>, [">= 0"])
84
+ s.add_dependency(%q<erubis>, [">= 0"])
85
+ s.add_dependency(%q<right_aws>, [">= 0"])
86
+ end
87
+ else
88
+ s.add_dependency(%q<yard>, ["~> 0.6.0"])
89
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
90
+ s.add_dependency(%q<rcov>, [">= 0"])
91
+ s.add_dependency(%q<configliere>, [">= 0"])
92
+ s.add_dependency(%q<gorillib>, [">= 0"])
93
+ s.add_dependency(%q<erubis>, [">= 0"])
94
+ s.add_dependency(%q<right_aws>, [">= 0"])
95
+ end
96
+ end
97
+