swineherd 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,74 @@
1
module Swineherd
  module Script

    # Script types are loaded lazily, the first time their constant is referenced.
    autoload :WukongScript, 'swineherd/script/wukong_script'
    autoload :PigScript,    'swineherd/script/pig_script'
    autoload :RScript,      'swineherd/script/r_script'
    autoload :HadoopScript, 'swineherd/script/hadoop_script'

    #
    # Behaviour shared by every script type. Including classes must provide
    # +cmd+ and +local_cmd+, which return the command line handed to +sh+.
    #
    module Common
      attr_accessor :input, :output, :options, :attributes

      #
      # source     - path of the script (rendered as a template by +script+)
      # input      - list of input paths
      # output     - list of output paths
      # options    - hash of options turned into command line arguments
      # attributes - hash of values substituted into the template
      #
      def initialize(source, input = [], output = [], options = {}, attributes = {})
        @source     = source
        @input      = input
        @output     = output
        @options    = options
        @attributes = attributes
      end

      #
      # Allows for setting the environment the script will be run in
      #
      def env
        ENV
      end

      #
      # Path of the rendered script. The Template is kept in an instance
      # variable on purpose: it owns the Tempfile holding the rendered script,
      # and a Tempfile is deleted once its owner is garbage collected. Dropping
      # the Template (as a bare +Template.new(...).substitute!+ does) could
      # remove the file before the command gets to run.
      #
      def script
        @script ||= begin
          @template = Template.new(@source, @attributes)
          @template.substitute!
        end
      end

      #
      # So we can reuse ourselves. Forgets the rendered script (it is rendered
      # again on the next call to +script+) and clears inputs and outputs.
      #
      def refresh!
        @script   = nil
        @template = nil
        @output   = []
        @input    = []
      end

      #
      # This depends on the type of script
      #
      def cmd
        raise "Override this in subclass!"
      end

      #
      # Override this in subclass to decide how script runs in 'local' mode
      # Best practice is that it needs to be able to run on a laptop w/o
      # hadoop.
      #
      def local_cmd
        raise "Override this in subclass!"
      end

      #
      # Runs the script. Default is to run with hadoop; pass :local to run
      # +local_cmd+ instead. Raises ArgumentError for any other mode rather
      # than silently doing nothing, and RuntimeError when the command fails.
      #
      def run(mode = :hadoop)
        case mode
        when :local  then run_command(local_cmd, "Local")
        when :hadoop then run_command(cmd, "Hadoop")
        else
          raise ArgumentError, "Unknown run mode #{mode.inspect}, expected :local or :hadoop"
        end
      end

      private

      #
      # Hands +command+ to +sh+ and fails loudly on a non-zero status.
      # NOTE(review): +sh+ is not defined in this file; presumably it is Rake's
      # FileUtils#sh, which yields (ok, status) -- in that case the second
      # block argument is the Process::Status that gets logged and compared
      # with 0 below. Confirm against the caller's environment.
      #
      def run_command(command, label)
        sh command do |_first, status|
          Log.info("Exit status was #{status}")
          raise "#{label} mode script failed with exit status #{status}" if status != 0
        end
      end

    end
  end
end
@@ -0,0 +1,59 @@
1
module Swineherd::Script

  #
  # native Java map-reduce, launched through `hadoop jar`
  #
  class HadoopScript
    include Common
    attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars
    attr_writer :hadoop_home

    #
    # Takes the same arguments as Common#initialize. Options are stored in a
    # hash that auto-creates nested hashes (need to support nested options for
    # this); any options passed in are copied into it instead of being thrown
    # away.
    #
    def initialize *args
      super(*args)
      given    = @options
      @options = Hash.new{|h,k| h[k] = {}}
      @options.merge!(given) if given
    end

    #
    # Hadoop installation directory: the value assigned through
    # +hadoop_home=+, else HADOOP_HOME from +env+. Raises when neither is set
    # so the failure is explicit instead of producing a '/bin/hadoop' command.
    #
    def hadoop_home
      home = @hadoop_home || env['HADOOP_HOME']
      if home.to_s.empty?
        raise "No hadoop home specified, set hadoop_home or the HADOOP_HOME environment variable"
      end
      home
    end

    #
    # Converts an arbitrarily nested hash to flattened arguments
    # for passing to java program. For example:
    #
    # {:mapred => {:reduce => {:tasks => 0}}}
    #
    # will transform to:
    #
    # '-Dmapred.reduce.tasks=0'
    #
    def java_args args
      to_dotted_args(args).map{|arg| "-D#{arg}"}
    end

    #
    # Uses recursion to take an arbitrarily nested hash and
    # flatten it into dotted args ('a.b.c=value' strings). See
    # 'java_args'.
    #
    def to_dotted_args args
      args.map do |key, value|
        if value.is_a?(Hash)
          # prefix every flattened child with this level's key
          to_dotted_args(value).map{|child| [key, child].join(".")}
        else
          "#{key}=#{value}"
        end
      end.flatten
    end

    #
    # Builds the `hadoop jar` command line, one argument group per
    # continuation line. The classpath prefix and the -libjars option are
    # only emitted when they have a value; +libjars+ may be a comma separated
    # string or an array of jar paths.
    #
    def cmd
      jars = Array(libjars).join(',')
      [
        (hadoop_classpath.to_s.empty? ? nil : "HADOOP_CLASSPATH=#{hadoop_classpath}"),
        "#{hadoop_home}/bin/hadoop jar #{run_jar}",
        main_class,
        java_args(options),
        (jars.empty? ? nil : "-libjars #{jars}"),
        "#{input.join(',')}",
        "#{output.join(',')}"
      ].flatten.compact.join(" \t\\\n ")
    end

  end
end
@@ -0,0 +1,46 @@
1
module Swineherd::Script
  #
  # Runs a Pig script, either against hadoop or with `pig -x local`.
  #
  class PigScript
    include Common

    #
    # Avro type name => Pig type name. Not guaranteeing anything.
    #
    AVRO_PIG_MAPPING = {
      'string' => 'chararray',
      'int'    => 'int',
      'long'   => 'long',
      'float'  => 'float',
      'double' => 'double',
      'bytes'  => 'bytearray',
      'fixed'  => 'bytearray'
    }

    #
    # Looks up the Pig type for an avro type name; nil when there is no entry
    #
    def self.avro_to_pig(avro_type)
      AVRO_PIG_MAPPING[avro_type]
    end

    #
    # Turns a generic options hash such as {:foo => 'bar'} into pig
    # parameter flags, here '-p FOO=bar'. Flags are separated by one space.
    #
    def pig_args(options)
      flags = options.map do |key, value|
        "-p " + key.to_s.upcase + "=" + value.to_s
      end
      flags.join(' ')
    end

    #
    # Command line for running the rendered script with `pig -x local`
    #
    def local_cmd
      Log.info("Launching Pig script in local mode")
      format('pig -x local %s %s', pig_args(@options), script)
    end

    #
    # Command line for running the rendered script with plain `pig`
    #
    def cmd
      Log.info("Launching Pig script in hadoop mode")
      format('pig %s %s', pig_args(@options), script)
    end

  end
end
@@ -0,0 +1,14 @@
1
module Swineherd::Script
  #
  # Runs the rendered script with /usr/bin/Rscript.
  #
  class RScript
    include Common

    #
    # Command line that feeds the rendered script to Rscript
    #
    def local_cmd
      format('/usr/bin/Rscript --vanilla %s', script)
    end

    #
    # Same command as local mode
    #
    def cmd
      local_cmd
    end

  end
end
@@ -0,0 +1,31 @@
1
+ require 'pathname'
2
+
3
module Swineherd::Script
  #
  # Runs a wukong (ruby) script through the `ruby` executable.
  #
  class WukongScript
    include Common

    #
    # Turns an options hash into '--param=value' flags separated by spaces
    #
    def wukong_args(options)
      flags = options.map do |param, val|
        "--" + param.to_s + "=" + val.to_s
      end
      flags.join(' ')
    end

    #
    # Don't treat wukong scripts as templates: the source path is used as is
    #
    def script
      @source
    end

    #
    # Command line for hadoop mode; refuses to build one without any input
    #
    def cmd
      if input.empty?
        raise "No wukong input specified"
      end
      Log.info("Launching Wukong script in hadoop mode")
      format('ruby %s %s --run %s %s', script, wukong_args(@options), input.join(','), output.join(','))
    end

    #
    # Command line for local mode. Inputs that are directories get a '/*'
    # appended, other inputs are passed through untouched.
    #
    def local_cmd
      expanded = input.map do |path|
        suffix = File.directory?(path) ? "/*" : ""
        path + suffix
      end
      inputs = expanded.join(',')
      Log.info("Launching Wukong script in local mode")
      format('ruby %s %s --run=local %s %s', script, wukong_args(@options), inputs, output.join(','))
    end

  end
end
@@ -0,0 +1,45 @@
1
+ require 'erubis'
2
+ require 'tempfile'
3
+
4
+
5
+ # Template.new(script_path, attributes).substitute!
6
+
7
module Swineherd

  #
  # Renders an Erubis template into a temporary file.
  #
  #   Template.new(script_path, attributes).substitute!  # => path of rendered file
  #
  # The rendered file is a Tempfile owned by this object, so callers should
  # hold on to the Template for as long as they need the file: a Tempfile is
  # removed once it is garbage collected.
  #
  class Template
    attr_accessor :source_template, :attributes

    #
    # source_template - path of the template file
    # attributes      - values handed to Erubis when rendering
    #
    def initialize source_template, attributes
      @source_template = source_template
      @attributes      = attributes
    end

    #
    # Renders the template into the destination tempfile, followed by a
    # newline, and returns the tempfile. Any earlier rendering is replaced, so
    # calling this more than once does not pile up duplicate copies.
    #
    def compile!
      rendered = Erubis::Eruby.new(source).result(attributes)
      dest.rewind
      dest.truncate(0)
      dest << rendered
      dest << "\n"
      dest
    end

    #
    # Renders the template and returns the path of the rendered file. The
    # tempfile is flushed first so the content is on disk for whatever
    # program opens that path next.
    #
    def substitute!
      compile!
      dest.flush
      dest.path
    end

    protected

    # Contents of the template file (File.read closes the handle for us)
    def source
      File.read(source_template)
    end

    # Destination tempfile, created on first use and then reused
    def dest
      @dest ||= Tempfile.new(basename)
    end

    # File name of the template, used as the tempfile's name prefix
    def basename
      File.basename(source_template)
    end

  end
end
@@ -0,0 +1,53 @@
1
module Swineherd
  #
  # A named group of tasks living in their own namespace, with bookkeeping
  # for the numbered output paths each task produces.
  #
  class Workflow
    attr_accessor :workdir, :outputs, :output_counts

    #
    # Create a new workflow and new namespace for this workflow. The block is
    # evaluated against the workflow instance, inside that namespace.
    #
    def initialize flow_id, &blk
      @flow_id       = flow_id
      @output_counts = Hash.new{|h,k| h[k] = 0}
      @outputs       = Hash.new{|h,k| h[k] = []}
      namespace @flow_id do
        self.instance_eval(&blk)
      end
    end

    #
    # Get next logical output of taskname by incrementing internal counter.
    # Paths look like '<workdir>/<flow_id>/<taskname>-<n>'. Raises unless a
    # working directory has been set.
    #
    def next_output taskname
      raise "No working directory specified." unless @workdir
      @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}"
      @output_counts[taskname] += 1
      latest_output(taskname)
    end

    #
    # Get latest output of taskname (nil when it has produced none yet)
    #
    def latest_output taskname
      @outputs[taskname].last
    end

    #
    # Runs workflow starting with taskname
    #
    def run taskname
      Log.info "Launching workflow task #{@flow_id}:#{taskname} ..."
      Rake::Task["#{@flow_id}:#{taskname}"].invoke
      Log.info "Workflow task #{@flow_id}:#{taskname} finished"
    end

    #
    # Describes the dependency tree of all tasks belonging to self. A task
    # belongs to this workflow when the flow id is one whole segment of its
    # namespace path ('flow:task'). The id is escaped and anchored so that a
    # flow called 'rank' does not also pick up 'pagerank:*' tasks, and so that
    # regexp metacharacters in the id are taken literally.
    #
    def describe
      in_flow = /(\A|:)#{Regexp.escape(@flow_id.to_s)}:/
      Rake::Task.tasks.each do |t|
        Log.info("Task: #{t.name} [#{t.inspect}]") if t.name =~ in_flow
      end
    end

  end
end
@@ -0,0 +1,60 @@
1
module Swineherd

  #
  # A Job is, at heart, a rake task wrapped around a script object
  #
  class Job

    #
    # Sets defaults, lets the block configure the job (it is evaluated against
    # the job itself), then declares the task and its dependencies
    #
    def initialize(job_id, &blk)
      @job_id       = job_id
      @name         = ''
      @dependencies = []
      @script       = ''
      instance_eval(&blk)
      raketask
      handle_dependencies
    end

    #
    # Name of the rake task. With an argument it sets the name, without one
    # it returns the current name.
    #
    def name(name = nil)
      if name
        @name = name
      else
        @name
      end
    end

    #
    # Script object the task runs. Setter with an argument, getter without.
    #
    def script(script = nil)
      if script
        @script = script
      else
        @script
      end
    end

    #
    # Array of job names this job depends on. Setter with an argument,
    # getter without.
    #
    def dependencies(dependencies = nil)
      if dependencies
        @dependencies = dependencies
      else
        @dependencies
      end
    end

    #
    # Declares the task's prerequisites; nothing to do when there are none
    #
    def handle_dependencies
      task(name => dependencies) unless dependencies.empty?
    end

    #
    # Command line of the underlying script
    #
    def cmd
      @script.cmd
    end

    #
    # Every job is compiled into a rake task that runs the script
    #
    def raketask
      task(name) { @script.run }
    end
  end
end
@@ -0,0 +1,20 @@
1
+ Logging:
2
+
3
+ 1. All output from the launched workflow should go to a workflow log file
4
+ 2. Hadoop output is special and should be pulled down from the jobtracker
5
+ - jobconf.xml
6
+ - job details page
7
+
8
+ Workflow should specify a logdir, which defaults to workdir + '/logs'
9
+
10
+ Fetching hadoop job stats:
11
+
12
+ 1. Get job id
13
+ 2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/"
14
+ 3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....)
15
+ 4. Fetch the two urls we care about and dump into the workflow's log dir.
16
+ 5. Possibly parse the results into an ongoing workflow-statistics.tsv file
17
+
18
+ Other output:
19
+
20
+ Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well.
@@ -0,0 +1,97 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for swineherd 0.0.1. Dependencies are listed once below
# and registered through whichever API the running RubyGems offers.
Gem::Specification.new do |s|
  s.name = %q{swineherd}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jacob Perkins"]
  s.date = %q{2011-04-20}
  s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
  s.email = %q{jacob.a.perkins@gmail.com}
  s.executables = ["hdp-tree", "hadoop-stream"]
  s.extra_rdoc_files = [
    "LICENSE",
    "README.textile"
  ]
  s.files = [
    "LICENSE",
    "README.textile",
    "Rakefile",
    "VERSION",
    "bin/hadoop-stream",
    "bin/hdp-tree",
    "examples/pagerank/data/seinfeld_network.tsv",
    "examples/pagerank/pagerank.rb",
    "examples/pagerank/scripts/cut_off_list.rb",
    "examples/pagerank/scripts/histogram.R",
    "examples/pagerank/scripts/pagerank.pig",
    "examples/pagerank/scripts/pagerank_initialize.pig",
    "lib/swineherd.rb",
    "lib/swineherd/filesystem.rb",
    "lib/swineherd/filesystem/README_filesystem.textile",
    "lib/swineherd/filesystem/basefilesystem.rb",
    "lib/swineherd/filesystem/filesystems.rb",
    "lib/swineherd/filesystem/hadoopfilesystem.rb",
    "lib/swineherd/filesystem/localfilesystem.rb",
    "lib/swineherd/filesystem/localfs.rb",
    "lib/swineherd/filesystem/s3filesystem.rb",
    "lib/swineherd/script.rb",
    "lib/swineherd/script/hadoop_script.rb",
    "lib/swineherd/script/pig_script.rb",
    "lib/swineherd/script/r_script.rb",
    "lib/swineherd/script/wukong_script.rb",
    "lib/swineherd/template.rb",
    "lib/swineherd/workflow.rb",
    "lib/swineherd/workflow/job.rb",
    "notes.txt",
    "swineherd.gemspec",
    "tests/test_filesystem.rb",
    "tests/test_s3_filesystem.rb",
    "tests/testcfg.yaml"
  ]
  s.homepage = %q{http://github.com/Ganglion/swineherd}
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{Flexible data workflow glue.}
  s.test_files = [
    "examples/pagerank/pagerank.rb",
    "examples/pagerank/scripts/cut_off_list.rb"
  ]

  # [gem name, version requirements], in declaration order
  development_gems = [
    [%q<yard>, ["~> 0.6.0"]],
    [%q<jeweler>, ["~> 1.5.2"]],
    [%q<rcov>, [">= 0"]]
  ]
  runtime_gems = [
    [%q<configliere>, [">= 0"]],
    [%q<gorillib>, [">= 0"]],
    [%q<erubis>, [">= 0"]],
    [%q<right_aws>, [">= 0"]]
  ]

  versioned_spec = s.respond_to?(:specification_version)
  if versioned_spec
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3
  end

  # Typed dependencies are only used when the spec is versioned and RubyGems
  # is at least 1.2.0; everything else falls back to plain add_dependency.
  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    development_gems.each { |gem_name, requirements| s.add_development_dependency(gem_name, requirements) }
    runtime_gems.each { |gem_name, requirements| s.add_runtime_dependency(gem_name, requirements) }
  else
    (development_gems + runtime_gems).each { |gem_name, requirements| s.add_dependency(gem_name, requirements) }
  end
end
97
+