swineherd 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +201 -0
- data/README.textile +207 -0
- data/Rakefile +30 -0
- data/VERSION +1 -0
- data/bin/hadoop-stream +35 -0
- data/bin/hdp-tree +26 -0
- data/examples/pagerank/data/seinfeld_network.tsv +429 -0
- data/examples/pagerank/pagerank.rb +99 -0
- data/examples/pagerank/scripts/cut_off_list.rb +16 -0
- data/examples/pagerank/scripts/histogram.R +5 -0
- data/examples/pagerank/scripts/pagerank.pig +20 -0
- data/examples/pagerank/scripts/pagerank_initialize.pig +24 -0
- data/lib/swineherd.rb +11 -0
- data/lib/swineherd/filesystem.rb +26 -0
- data/lib/swineherd/filesystem/README_filesystem.textile +47 -0
- data/lib/swineherd/filesystem/basefilesystem.rb +125 -0
- data/lib/swineherd/filesystem/filesystems.rb +103 -0
- data/lib/swineherd/filesystem/hadoopfilesystem.rb +263 -0
- data/lib/swineherd/filesystem/localfilesystem.rb +83 -0
- data/lib/swineherd/filesystem/localfs.rb +11 -0
- data/lib/swineherd/filesystem/s3filesystem.rb +249 -0
- data/lib/swineherd/script.rb +74 -0
- data/lib/swineherd/script/hadoop_script.rb +59 -0
- data/lib/swineherd/script/pig_script.rb +46 -0
- data/lib/swineherd/script/r_script.rb +14 -0
- data/lib/swineherd/script/wukong_script.rb +31 -0
- data/lib/swineherd/template.rb +45 -0
- data/lib/swineherd/workflow.rb +53 -0
- data/lib/swineherd/workflow/job.rb +60 -0
- data/notes.txt +20 -0
- data/swineherd.gemspec +97 -0
- data/tests/test_filesystem.rb +105 -0
- data/tests/test_s3_filesystem.rb +132 -0
- data/tests/testcfg.yaml +7 -0
- metadata +204 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
module Swineherd
  module Script

    autoload :WukongScript, 'swineherd/script/wukong_script'
    autoload :PigScript,    'swineherd/script/pig_script'
    autoload :RScript,      'swineherd/script/r_script'

    #
    # Behaviour shared by every script type. Including classes are expected
    # to override +cmd+ and +local_cmd+, and must have a rake-style +sh+
    # method and a +Log+ object in scope when +run+ is called.
    #
    module Common
      attr_accessor :input, :output, :options, :attributes

      #
      # source::     path of the script (or script template) to launch
      # input::      list of input paths
      # output::     list of output paths
      # options::    hash of options handed to the script's command line
      # attributes:: hash of values handed to Template when rendering +source+
      #
      def initialize(source, input = [], output = [], options = {}, attributes = {})
        @source     = source
        @input      = input
        @output     = output
        @options    = options
        @attributes = attributes
      end

      #
      # Allows for setting the environment the script will be run in
      #
      def env
        ENV
      end

      #
      # Path of the rendered script. The template is rendered once and the
      # resulting path is memoized until +refresh!+ is called.
      #
      def script
        @script ||= Template.new(@source, @attributes).substitute!
      end

      #
      # So we can reuse ourselves: forgets the rendered script and empties
      # the input and output lists.
      #
      def refresh!
        @script = nil
        @output = []
        @input  = []
      end

      #
      # This depends on the type of script
      #
      def cmd
        raise "Override this in subclass!"
      end

      #
      # Override this in subclass to decide how script runs in 'local' mode
      # Best practice is that it needs to be able to run on a laptop w/o
      # hadoop.
      #
      def local_cmd
        raise "Override this in subclass!"
      end

      #
      # Launches the script. Default is to run with hadoop.
      #
      # mode:: :hadoop (default) or :local; the string forms are accepted too.
      #
      # Raises ArgumentError for any other mode (previously an unknown mode
      # was silently ignored, so a typo looked like a successful run), and
      # RuntimeError when the launched command reports a non-zero status.
      #
      def run(mode = :hadoop)
        case mode.to_s
        when 'local'  then launch_and_check(local_cmd, 'Local')
        when 'hadoop' then launch_and_check(cmd, 'Hadoop')
        else
          raise ArgumentError, "Unknown run mode #{mode.inspect}, expected :local or :hadoop"
        end
      end

      private

      #
      # Hands +command+ to +sh+ and raises unless the reported status is 0.
      # The block receives two values from +sh+; only the second one (the
      # status) is inspected, exactly as the original code did.
      #
      def launch_and_check(command, label)
        sh command do |_ok, status|
          Log.info("Exit status was #{status}")
          raise "#{label} mode script failed with exit status #{status}" if status != 0
        end
      end

    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Swineherd::Script
|
2
|
+
|
3
|
+
#
|
4
|
+
# native Java map-reduce
|
5
|
+
#
|
6
|
+
class HadoopScript
|
7
|
+
include Common
|
8
|
+
attr_accessor :main_class, :run_jar, :java_options, :hadoop_classpath, :libjars
|
9
|
+
|
10
|
+
#
# Takes the same arguments as Common#initialize, then makes +options+ an
# auto-vivifying hash so nested options can be assigned directly, e.g.
# options[:mapred][:reduce] = ... (need to support nested options for
# this). Options passed in by the caller are copied into that hash; the
# original code replaced them with an empty hash and lost them.
#
def initialize(*args)
  super(*args)
  supplied = @options || {}
  @options = Hash.new { |h, k| h[k] = {} }
  @options.merge!(supplied)
end
|
14
|
+
|
15
|
+
#
# Turns an arbitrarily nested hash into a list of '-D' arguments for a
# java program. For example:
#
#   {:mapred => {:reduce => {:tasks => 0}}}
#
# becomes:
#
#   ['-Dmapred.reduce.tasks=0']
#
def java_args(args)
  dotted = to_dotted_args(args)
  dotted.collect { |setting| "-D#{setting}" }
end
|
28
|
+
|
29
|
+
#
# Recursively flattens an arbitrarily nested hash into a flat list of
# 'dotted.key=value' strings. See 'java_args', which prefixes each entry
# with '-D'.
#
def to_dotted_args(args)
  args.inject([]) do |flat, (key, value)|
    if value.is_a?(Hash)
      # Prefix every entry produced by the nested hash with this key.
      flat.concat(to_dotted_args(value).map { |rest| [key, rest].join(".") })
    else
      flat << "#{key}=#{value}"
    end
  end
end
|
45
|
+
|
46
|
+
#
# Assembles the shell command that launches the jar with hadoop. Parts
# that are nil are dropped, and the remaining parts are joined with a
# backslash-newline so the command reads as one continued shell line.
#
# '-libjars' is only emitted when +libjars+ is set (a string or a list of
# jars); a bare '-libjars' would take the next argument as its value.
#
def cmd
  jars = Array(libjars).join(',')
  [
    "HADOOP_CLASSPATH=#{hadoop_classpath}",
    "#{hadoop_home}/bin/hadoop jar #{run_jar}",
    main_class,
    java_args(options),
    (jars.empty? ? nil : "-libjars #{jars}"),
    "#{input.join(',')}",
    "#{output.join(',')}"
  ].flatten.compact.join(" \t\\\n ")
end

#
# Root of the hadoop installation used by +cmd+. +cmd+ referenced this
# name although nothing in this class defined it. Resolution order: a
# value assigned through +hadoop_home=+, then a +hadoop_home+ defined
# further up the ancestor chain (if any), then the HADOOP_HOME environment
# variable. Raises RuntimeError when none of those is available.
#
def hadoop_home
  return @hadoop_home if @hadoop_home
  return super if defined?(super)
  ENV.fetch('HADOOP_HOME') do
    raise "hadoop_home is not set: assign hadoop_home= or export HADOOP_HOME"
  end
end

# Explicitly sets the hadoop installation root used by +cmd+.
def hadoop_home=(path)
  @hadoop_home = path
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Swineherd::Script
|
2
|
+
class PigScript
|
3
|
+
include Common
|
4
|
+
|
5
|
+
#
# Avro type name => pig type name. Not guaranteeing anything.
# Frozen so the shared table cannot be altered by accident.
#
AVRO_PIG_MAPPING = {
  'string' => 'chararray',
  'int'    => 'int',
  'long'   => 'long',
  'float'  => 'float',
  'double' => 'double',
  'bytes'  => 'bytearray',
  'fixed'  => 'bytearray'
}.freeze

#
# Simple utility function for mapping avro types to pig types.
# Accepts the avro type name as a string or a symbol; returns nil for
# type names that are not in AVRO_PIG_MAPPING.
#
def self.avro_to_pig(avro_type)
  AVRO_PIG_MAPPING[avro_type.to_s]
end
|
24
|
+
|
25
|
+
#
# Builds pig's parameter flags from a generic options hash: every
# key/value pair becomes '-p KEY=value' with the key upper-cased, so
# {:foo => 'bar'} yields '-p FOO=bar'. Pairs are separated by one space.
#
def pig_args(options)
  options.each_with_object([]) do |(name, value), params|
    params << "-p #{name.to_s.upcase}=#{value}"
  end.join(' ')
end
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
#
# Command line that runs the rendered pig script with pig's local
# execution mode ('-x local'). Logs the launch first.
#
def local_cmd
  Log.info("Launching Pig script in local mode")
  format('pig -x local %s %s', pig_args(@options), script)
end
|
39
|
+
|
40
|
+
#
# Command line that runs the rendered pig script without the local-mode
# flag. Logs the launch first.
#
def cmd
  Log.info("Launching Pig script in hadoop mode")
  format('pig %s %s', pig_args(@options), script)
end
|
44
|
+
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Swineherd::Script
|
4
|
+
class WukongScript
|
5
|
+
include Common
|
6
|
+
|
7
|
+
#
# Renders an options hash as wukong command line flags: each pair
# becomes '--param=value', pairs separated by a single space.
#
def wukong_args(options)
  flags = options.each_with_object([]) do |(param, val), acc|
    acc << "--#{param}=#{val}"
  end
  flags.join(' ')
end
|
10
|
+
|
11
|
+
#
# Wukong scripts are plain ruby files, not templates: hand back the
# source that was given at construction time without rendering it.
#
def script
  @source
end
|
17
|
+
|
18
|
+
#
# Command line that runs the wukong script with a bare '--run' flag,
# passing the comma-joined inputs and outputs as the last two arguments.
# Raises RuntimeError when no input has been given.
#
def cmd
  if input.empty?
    raise "No wukong input specified"
  end
  Log.info("Launching Wukong script in hadoop mode")
  format('ruby %s %s --run %s %s',
         script, wukong_args(@options), input.join(','), output.join(','))
end
|
23
|
+
|
24
|
+
#
# Command line that runs the wukong script with '--run=local'. Every
# input path that is a directory gets '/*' appended, so the directory's
# entries are named rather than the directory itself.
#
def local_cmd
  expanded = input.map do |path|
    suffix = File.directory?(path) ? "/*" : ""
    path + suffix
  end
  inputs = expanded.join(',')
  Log.info("Launching Wukong script in local mode")
  format('ruby %s %s --run=local %s %s',
         script, wukong_args(@options), inputs, output.join(','))
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'erubis'
|
2
|
+
require 'tempfile'
|
3
|
+
|
4
|
+
|
5
|
+
# Template.new(script_path, attributes).substitute!
|
6
|
+
|
7
|
+
module Swineherd

  #
  # Renders a template file with Erubis, using a hash of attributes, into
  # a temporary file and hands back that file's path:
  #
  #   Template.new(script_path, attributes).substitute!
  #
  class Template
    attr_accessor :source_template, :attributes

    #
    # source_template:: path of the template file to render
    # attributes::      values handed to the Erubis template as its context
    #
    def initialize(source_template, attributes)
      @source_template = source_template
      @attributes      = attributes
    end

    #
    # Renders the template into the destination tempfile and returns that
    # tempfile. The tempfile is reused between calls, so it is emptied
    # first; without that a second call appended a second copy of the
    # rendered script. The result is flushed so it is on disk for other
    # processes that open the path.
    #
    def compile!
      dest.rewind
      dest.truncate(0)
      dest << Erubis::Eruby.new(source).result(attributes)
      dest << "\n"
      dest.flush
      dest
    end

    #
    # Renders the template and returns the path of the rendered file.
    #
    def substitute!
      compile!
      dest.path
    end

    protected

    # Contents of the template file. File.read closes the handle, which
    # the previous File.open(...).read never did.
    def source
      File.read(source_template)
    end

    # Tempfile that receives the rendered output; created on first use and
    # kept for the lifetime of this object.
    def dest
      @dest ||= Tempfile.new(basename)
    end

    # File name of the template without its directory; used as the
    # tempfile's name prefix.
    def basename
      File.basename(source_template)
    end

  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Swineherd

  #
  # A named group of rake tasks plus bookkeeping for the output paths the
  # tasks produce. Relies on rake's +namespace+ and Rake::Task and on a
  # +Log+ object being available.
  #
  class Workflow
    attr_accessor :workdir, :outputs, :output_counts

    #
    # Create a new workflow and new namespace for this workflow. The block,
    # when given, is evaluated in the context of the workflow, inside a
    # rake namespace named after +flow_id+. A missing block is tolerated
    # (it used to raise from instance_eval).
    #
    def initialize(flow_id, &blk)
      @flow_id       = flow_id
      @output_counts = Hash.new { |h, k| h[k] = 0 }
      @outputs       = Hash.new { |h, k| h[k] = [] }
      namespace @flow_id do
        instance_eval(&blk) if blk
      end
    end

    #
    # Get next logical output of taskname by incrementing internal counter.
    # The path has the form "<workdir>/<flow_id>/<taskname>-<n>", with n
    # starting at 0. Raises RuntimeError when no workdir has been set.
    #
    def next_output(taskname)
      raise "No working directory specified." unless @workdir
      @outputs[taskname] << "#{@workdir}/#{@flow_id}/#{taskname}-#{@output_counts[taskname]}"
      @output_counts[taskname] += 1
      latest_output(taskname)
    end

    #
    # Get latest output of taskname (nil when it has produced none yet)
    #
    def latest_output(taskname)
      @outputs[taskname].last
    end

    #
    # Runs workflow starting with taskname
    #
    def run(taskname)
      Log.info "Launching workflow task #{@flow_id}:#{taskname} ..."
      Rake::Task["#{@flow_id}:#{taskname}"].invoke
      Log.info "Workflow task #{@flow_id}:#{taskname} finished"
    end

    #
    # Logs every rake task that lives in this workflow's namespace, i.e.
    # whose name starts with "<flow_id>:". The earlier regex match was
    # neither escaped nor anchored, so it also reported tasks of other
    # workflows whose names merely contained the flow id.
    #
    def describe
      prefix = "#{@flow_id}:"
      Rake::Task.tasks.each do |t|
        Log.info("Task: #{t.name} [#{t.inspect}]") if t.name.start_with?(prefix)
      end
    end

  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Swineherd

  #
  # A Job is a thin wrapper that turns a script object into a rake task.
  # It is configured through a block evaluated against the job, using
  # +name+, +script+ and +dependencies+ as combined getters/setters.
  #
  class Job

    #
    # Sets the defaults, evaluates the configuration block against the
    # job, then registers the rake task and its dependencies.
    #
    def initialize(job_id, &blk)
      @job_id       = job_id
      @name         = ''
      @dependencies = []
      @script       = ''
      instance_eval(&blk)
      raketask
      handle_dependencies
    end

    #
    # Name of the rake task. With an argument it sets the name; without
    # one it returns the current name.
    #
    def name(name = nil)
      name ? (@name = name) : @name
    end

    #
    # The script object this job runs. With an argument it sets the
    # script; without one it returns the current script.
    #
    def script(script = nil)
      script ? (@script = script) : @script
    end

    #
    # Job names this job depends on, as an array. With an argument it
    # sets the list; without one it returns the current list.
    #
    def dependencies(dependencies = nil)
      dependencies ? (@dependencies = dependencies) : @dependencies
    end

    #
    # Declares the task's prerequisites; nothing is declared when the
    # dependency list is empty.
    #
    def handle_dependencies
      task name => dependencies unless dependencies.empty?
    end

    #
    # The command of the underlying script.
    #
    def cmd
      @script.cmd
    end

    #
    # Every job is compiled into a rake task whose body runs the script.
    #
    def raketask
      task(name) { @script.run }
    end

  end
end
|
data/notes.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Logging:
|
2
|
+
|
3
|
+
1. All output from the launched workflow should go to a workflow log file
|
4
|
+
2. Hadoop output is special and should be pulled down from the jobtracker
|
5
|
+
- jobconf.xml
|
6
|
+
- job details page
|
7
|
+
|
8
|
+
Workflow should specify a logdir, which defaults to workdir + '/logs'
|
9
|
+
|
10
|
+
Fetching hadoop job stats:
|
11
|
+
|
12
|
+
1. Get job id
|
13
|
+
2. Use curl to fetch the latest logs listing: "http://jobtracker:50030/logs/history/"
|
14
|
+
3. Parse the logs listing and pull out the two urls we want (something-jobid.xml, something-jobid....)
|
15
|
+
4. Fetch the two urls we care about and dump into the workflow's log dir.
|
16
|
+
5. Possibly parse the results into an ongoing workflow-statistics.tsv file
|
17
|
+
|
18
|
+
Other output:
|
19
|
+
|
20
|
+
Output that would otherwise go to the terminal (nohup.out or some such) should be collected and dumped into the logdir as well.
|
data/swineherd.gemspec
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{swineherd}
|
8
|
+
s.version = "0.0.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Jacob Perkins"]
|
12
|
+
s.date = %q{2011-04-20}
|
13
|
+
s.description = %q{Swineherd is for running scripts and workflows on filesystems.}
|
14
|
+
s.email = %q{jacob.a.perkins@gmail.com}
|
15
|
+
s.executables = ["hdp-tree", "hadoop-stream"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE",
|
18
|
+
"README.textile"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
"LICENSE",
|
22
|
+
"README.textile",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"bin/hadoop-stream",
|
26
|
+
"bin/hdp-tree",
|
27
|
+
"examples/pagerank/data/seinfeld_network.tsv",
|
28
|
+
"examples/pagerank/pagerank.rb",
|
29
|
+
"examples/pagerank/scripts/cut_off_list.rb",
|
30
|
+
"examples/pagerank/scripts/histogram.R",
|
31
|
+
"examples/pagerank/scripts/pagerank.pig",
|
32
|
+
"examples/pagerank/scripts/pagerank_initialize.pig",
|
33
|
+
"lib/swineherd.rb",
|
34
|
+
"lib/swineherd/filesystem.rb",
|
35
|
+
"lib/swineherd/filesystem/README_filesystem.textile",
|
36
|
+
"lib/swineherd/filesystem/basefilesystem.rb",
|
37
|
+
"lib/swineherd/filesystem/filesystems.rb",
|
38
|
+
"lib/swineherd/filesystem/hadoopfilesystem.rb",
|
39
|
+
"lib/swineherd/filesystem/localfilesystem.rb",
|
40
|
+
"lib/swineherd/filesystem/localfs.rb",
|
41
|
+
"lib/swineherd/filesystem/s3filesystem.rb",
|
42
|
+
"lib/swineherd/script.rb",
|
43
|
+
"lib/swineherd/script/hadoop_script.rb",
|
44
|
+
"lib/swineherd/script/pig_script.rb",
|
45
|
+
"lib/swineherd/script/r_script.rb",
|
46
|
+
"lib/swineherd/script/wukong_script.rb",
|
47
|
+
"lib/swineherd/template.rb",
|
48
|
+
"lib/swineherd/workflow.rb",
|
49
|
+
"lib/swineherd/workflow/job.rb",
|
50
|
+
"notes.txt",
|
51
|
+
"swineherd.gemspec",
|
52
|
+
"tests/test_filesystem.rb",
|
53
|
+
"tests/test_s3_filesystem.rb",
|
54
|
+
"tests/testcfg.yaml"
|
55
|
+
]
|
56
|
+
s.homepage = %q{http://github.com/Ganglion/swineherd}
|
57
|
+
s.licenses = ["MIT"]
|
58
|
+
s.require_paths = ["lib"]
|
59
|
+
s.rubygems_version = %q{1.3.7}
|
60
|
+
s.summary = %q{Flexible data workflow glue.}
|
61
|
+
s.test_files = [
|
62
|
+
"examples/pagerank/pagerank.rb",
|
63
|
+
"examples/pagerank/scripts/cut_off_list.rb"
|
64
|
+
]
|
65
|
+
|
66
|
+
if s.respond_to? :specification_version then
|
67
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
68
|
+
s.specification_version = 3
|
69
|
+
|
70
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
71
|
+
s.add_development_dependency(%q<yard>, ["~> 0.6.0"])
|
72
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
73
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
74
|
+
s.add_runtime_dependency(%q<configliere>, [">= 0"])
|
75
|
+
s.add_runtime_dependency(%q<gorillib>, [">= 0"])
|
76
|
+
s.add_runtime_dependency(%q<erubis>, [">= 0"])
|
77
|
+
s.add_runtime_dependency(%q<right_aws>, [">= 0"])
|
78
|
+
else
|
79
|
+
s.add_dependency(%q<yard>, ["~> 0.6.0"])
|
80
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
81
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
82
|
+
s.add_dependency(%q<configliere>, [">= 0"])
|
83
|
+
s.add_dependency(%q<gorillib>, [">= 0"])
|
84
|
+
s.add_dependency(%q<erubis>, [">= 0"])
|
85
|
+
s.add_dependency(%q<right_aws>, [">= 0"])
|
86
|
+
end
|
87
|
+
else
|
88
|
+
s.add_dependency(%q<yard>, ["~> 0.6.0"])
|
89
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
90
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
91
|
+
s.add_dependency(%q<configliere>, [">= 0"])
|
92
|
+
s.add_dependency(%q<gorillib>, [">= 0"])
|
93
|
+
s.add_dependency(%q<erubis>, [">= 0"])
|
94
|
+
s.add_dependency(%q<right_aws>, [">= 0"])
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|