swineherd 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +201 -0
- data/README.textile +207 -0
- data/Rakefile +30 -0
- data/VERSION +1 -0
- data/bin/hadoop-stream +35 -0
- data/bin/hdp-tree +26 -0
- data/examples/pagerank/data/seinfeld_network.tsv +429 -0
- data/examples/pagerank/pagerank.rb +99 -0
- data/examples/pagerank/scripts/cut_off_list.rb +16 -0
- data/examples/pagerank/scripts/histogram.R +5 -0
- data/examples/pagerank/scripts/pagerank.pig +20 -0
- data/examples/pagerank/scripts/pagerank_initialize.pig +24 -0
- data/lib/swineherd.rb +11 -0
- data/lib/swineherd/filesystem.rb +26 -0
- data/lib/swineherd/filesystem/README_filesystem.textile +47 -0
- data/lib/swineherd/filesystem/basefilesystem.rb +125 -0
- data/lib/swineherd/filesystem/filesystems.rb +103 -0
- data/lib/swineherd/filesystem/hadoopfilesystem.rb +263 -0
- data/lib/swineherd/filesystem/localfilesystem.rb +83 -0
- data/lib/swineherd/filesystem/localfs.rb +11 -0
- data/lib/swineherd/filesystem/s3filesystem.rb +249 -0
- data/lib/swineherd/script.rb +74 -0
- data/lib/swineherd/script/hadoop_script.rb +59 -0
- data/lib/swineherd/script/pig_script.rb +46 -0
- data/lib/swineherd/script/r_script.rb +14 -0
- data/lib/swineherd/script/wukong_script.rb +31 -0
- data/lib/swineherd/template.rb +45 -0
- data/lib/swineherd/workflow.rb +53 -0
- data/lib/swineherd/workflow/job.rb +60 -0
- data/notes.txt +20 -0
- data/swineherd.gemspec +97 -0
- data/tests/test_filesystem.rb +105 -0
- data/tests/test_s3_filesystem.rb +132 -0
- data/tests/testcfg.yaml +7 -0
- metadata +204 -0
@@ -0,0 +1,99 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH << '../../lib'
|
4
|
+
require 'swineherd' ; include Swineherd
|
5
|
+
require 'swineherd/script' ; include Swineherd::Script
|
6
|
+
require 'swineherd/filesystem'
|
7
|
+
|
8
|
+
Settings.define :flow_id, :required => true, :description => "Flow id required to make run of workflow unique"
|
9
|
+
Settings.define :iterations, :type => Integer, :default => 10, :description => "Number of pagerank iterations to run"
|
10
|
+
Settings.define :hadoop_home, :default => '/usr/local/share/hadoop', :description => "Path to hadoop config"
|
11
|
+
Settings.resolve!
|
12
|
+
|
13
|
+
flow = Workflow.new(Settings.flow_id) do
|
14
|
+
|
15
|
+
# The filesystems we're going to be working with
|
16
|
+
hdfs = Swineherd::FileSystem.get(:hdfs)
|
17
|
+
localfs = Swineherd::FileSystem.get(:file)
|
18
|
+
|
19
|
+
# The scripts we're going to use
|
20
|
+
initializer = PigScript.new('scripts/pagerank_initialize.pig')
|
21
|
+
iterator = PigScript.new('scripts/pagerank.pig')
|
22
|
+
finisher = WukongScript.new('scripts/cut_off_list.rb')
|
23
|
+
plotter = RScript.new('scripts/histogram.R')
|
24
|
+
|
25
|
+
#
|
26
|
+
# Runs simple pig script to initialize pagerank. We must specify the input
|
27
|
+
# here as this is the first step in the workflow. The output attribute is to
|
28
|
+
# ensure idempotency and the options attribute is the hash that will be
|
29
|
+
# converted into command-line args for the pig interpreter.
|
30
|
+
#
|
31
|
+
task :pagerank_initialize do
|
32
|
+
initializer.options = {:adjlist => "/tmp/pagerank_example/seinfeld_network.tsv", :initgrph => next_output(:pagerank_initialize)}
|
33
|
+
initializer.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_initialize)
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# Runs multiple iterations of pagerank with another pig script and manages all
|
38
|
+
# the intermediate outputs.
|
39
|
+
#
|
40
|
+
task :pagerank_iterate => [:pagerank_initialize] do
|
41
|
+
iterator.options[:damp] = '0.85f'
|
42
|
+
iterator.options[:curr_iter_file] = latest_output(:pagerank_initialize)
|
43
|
+
Settings.iterations.times do
|
44
|
+
iterator.options[:next_iter_file] = next_output(:pagerank_iterate)
|
45
|
+
iterator.run(:hadoop) unless hdfs.exists? latest_output(:pagerank_iterate)
|
46
|
+
iterator.refresh!
|
47
|
+
iterator.options[:curr_iter_file] = latest_output(:pagerank_iterate)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
#
|
52
|
+
# Here we use a wukong script to cut off the last field (a big pig bag of
|
53
|
+
# links). Notice how every wukong script MUST have an input but pig scripts do
|
54
|
+
# not.
|
55
|
+
#
|
56
|
+
task :cut_off_adjacency_list => [:pagerank_iterate] do
|
57
|
+
finisher.input << latest_output(:pagerank_iterate)
|
58
|
+
finisher.output << next_output(:cut_off_adjacency_list)
|
59
|
+
finisher.run :hadoop unless hdfs.exists? latest_output(:cut_off_adjacency_list)
|
60
|
+
end
|
61
|
+
|
62
|
+
#
|
63
|
+
# We want to pull down one result file, merge the part-000.. files into one file
|
64
|
+
#
|
65
|
+
task :merge_results => [:cut_off_adjacency_list] do
|
66
|
+
merged_results = next_output(:merge_results)
|
67
|
+
hdfs.merge(latest_output(:cut_off_adjacency_list), merged_results) unless hdfs.exists? merged_results
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# Cat results into a local directory with the same structure
|
72
|
+
# eg. #{work_dir}/#{flow_id}/pull_down_results-0.
|
73
|
+
#
|
74
|
+
# FIXME: Bridging filesystems is kludgy.
|
75
|
+
#
|
76
|
+
task :pull_down_results => [:merge_results] do
|
77
|
+
local_results = next_output(:pull_down_results)
|
78
|
+
hdfs.copy_to_local(latest_output(:merge_results), local_results) unless localfs.exists? local_results
|
79
|
+
end
|
80
|
+
|
81
|
+
#
|
82
|
+
# Plot 2nd column of the result as a histogram (requires R and
|
83
|
+
# ggplot2). Note that the output here is a png file but doesn't have that
|
84
|
+
# extension. Ensmarten me as to the right way to handle that?
|
85
|
+
#
|
86
|
+
task :plot_results => [:pull_down_results] do
|
87
|
+
plotter.attributes = {
|
88
|
+
:pagerank_data => latest_output(:pull_down_results),
|
89
|
+
:plot_file => next_output(:plot_results), # <-- this will be a png...
|
90
|
+
:raw_rank => "aes(x=d$V2)"
|
91
|
+
}
|
92
|
+
plotter.run(:hadoop) unless localfs.exists? latest_output(:plot_results)
|
93
|
+
end
|
94
|
+
|
95
|
+
end
|
96
|
+
|
97
|
+
flow.workdir = "/tmp/pagerank_example"
|
98
|
+
flow.describe
|
99
|
+
flow.run(:plot_results)
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong'
|
5
|
+
|
6
|
+
#
|
7
|
+
# Does the very simple job of cutting off the giant adjacency list
|
8
|
+
#
|
9
|
+
class CutMapper < Wukong::Streamer::RecordStreamer
  #
  # Emits only the first two fields of the incoming record, as a two-element
  # array. Any further fields are ignored; if fewer than two fields are
  # passed, the missing ones are emitted as nil (destructuring assignment).
  #
  def process *args
    node_a, node_b = args
    yield [node_a, node_b]
  end
end
|
15
|
+
|
16
|
+
Wukong::Script.new(CutMapper, nil).run
|
@@ -0,0 +1,20 @@
|
|
1
|
+
--
|
2
|
+
-- Runs exactly one pagerank iteration
|
3
|
+
--
|
4
|
+
network = LOAD '$CURR_ITER_FILE' AS (node_a:chararray, rank:float, out_links:bag { link:tuple (node_b:chararray) });
|
5
|
+
sent_shares = FOREACH network GENERATE FLATTEN(out_links) AS node_b, (float)(rank / (float)SIZE(out_links)) AS share:float;
|
6
|
+
sent_links = FOREACH network GENERATE node_a, out_links;
|
7
|
+
rcvd_shares = COGROUP sent_links BY node_a INNER, sent_shares BY node_b;
|
8
|
+
next_iter = FOREACH rcvd_shares
|
9
|
+
{
|
10
|
+
raw_rank = (float)SUM(sent_shares.share);
|
11
|
+
-- treat the case that a node has no in links
|
12
|
+
damped_rank = ((raw_rank IS NOT NULL AND raw_rank > 1.0e-12f) ? raw_rank*$DAMP + 1.0f - $DAMP : 0.0f);
|
13
|
+
GENERATE
|
14
|
+
group AS node_a,
|
15
|
+
damped_rank AS rank,
|
16
|
+
FLATTEN(sent_links.out_links) -- hack, should only be one bag, unbag it
|
17
|
+
;
|
18
|
+
};
|
19
|
+
|
20
|
+
STORE next_iter INTO '$NEXT_ITER_FILE';
|
@@ -0,0 +1,24 @@
|
|
1
|
+
--
|
2
|
+
-- Create initial graph on which to iterate the pagerank algorithm.
|
3
|
+
--
|
4
|
+
|
5
|
+
--
|
6
|
+
-- Generate a unique list of nodes with in links to cogroup on. This allows
|
7
|
+
-- us to treat the case where nodes have in links but no out links.
|
8
|
+
--
|
9
|
+
network = LOAD '$ADJLIST' AS (node_a:chararray, node_b:chararray);
|
10
|
+
cut_rhs = FOREACH network GENERATE node_b;
|
11
|
+
uniq_rhs = DISTINCT cut_rhs;
|
12
|
+
list_links = COGROUP network BY node_a, uniq_rhs BY node_b;
|
13
|
+
count_links = FOREACH list_links
|
14
|
+
{
|
15
|
+
-- if network.node_b is empty there are no out links, set to dummy value
|
16
|
+
out_links = (IsEmpty(network.node_b) ? {('dummy')} : network.node_b);
|
17
|
+
GENERATE
|
18
|
+
group AS node_a,
|
19
|
+
1.0f AS rank,
|
20
|
+
out_links AS out_links
|
21
|
+
;
|
22
|
+
};
|
23
|
+
|
24
|
+
STORE count_links INTO '$INITGRPH';
|
data/lib/swineherd.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
|
3
|
+
require 'rake'
|
4
|
+
require 'gorillib/logger/log'
|
5
|
+
|
6
|
+
module Swineherd
|
7
|
+
autoload :Template, 'swineherd/template'
|
8
|
+
autoload :FileSystem, 'swineherd/filesystem'
|
9
|
+
autoload :Script, 'swineherd/script'
|
10
|
+
autoload :Workflow, 'swineherd/workflow'
|
11
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Swineherd
  autoload :BaseFileSystem,   'swineherd/filesystem/basefilesystem'
  autoload :LocalFileSystem,  'swineherd/filesystem/localfilesystem'
  autoload :HadoopFileSystem, 'swineherd/filesystem/hadoopfilesystem'
  autoload :S3FileSystem,     'swineherd/filesystem/s3filesystem'

  #
  # Factory for the filesystem abstractions registered in FILESYSTEMS.
  #
  class FileSystem

    # Maps a scheme name (as a string) to the class implementing it.
    FILESYSTEMS = {
      'file' => Swineherd::LocalFileSystem,
      'hdfs' => Swineherd::HadoopFileSystem,
      's3'   => Swineherd::S3FileSystem
    }

    #
    # A factory function that returns an instance of the requested class.
    #
    # scheme:: scheme name; converted with +to_s+, so a symbol or a string
    #          both work.
    # args::   forwarded unchanged to the filesystem class' constructor.
    #
    # Raises RuntimeError when no filesystem is registered for +scheme+.
    # The lookup is checked explicitly (rather than rescuing NoMethodError)
    # so that a NoMethodError raised inside a constructor is not misreported
    # as an unknown scheme.
    #
    def self.get scheme, *args
      filesystem_class = FILESYSTEMS[scheme.to_s]
      raise "Filesystem with scheme #{scheme} does not exist." unless filesystem_class
      filesystem_class.new(*args)
    end

  end

end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
h1. File System Abstraction
|
2
|
+
|
3
|
+
Hackboxen need to access files and directories in order to do their
|
4
|
+
stuff. We currently expect them to use at least the following types
|
5
|
+
of filesystems:
|
6
|
+
|
7
|
+
* Local File System
|
8
|
+
* Ephemeral Hadoop cluster HDFS
|
9
|
+
* s3/HDFS
|
10
|
+
|
11
|
+
Each of these filesystem types has different methods to accomplish the same operations. In order to make this diversity more easily used by hackboxen, an abstraction layer has been created.
|
12
|
+
|
13
|
+
h2. Interface
|
14
|
+
|
15
|
+
A new @FileSystem@ class has a single class method @get@ taking two arguments:
|
16
|
+
|
17
|
+
* @scheme@: A token which specifies the filesystem scheme. The factory currently registers @:file@, @:hdfs@ and @:s3@.
|
18
|
+
* @*args@: Optional arguments (e.g. credentials)
|
19
|
+
|
20
|
+
The returned (abstracted) filesystem instance has the following methods:
|
21
|
+
|
22
|
+
* @open(path,mode,blk)@: Return a @File@ like file handle object. @mode@ and @blk@ arguments are optional and work like the standard ruby @File.open@ arguments.
|
23
|
+
* @rm(path)@: Works like UNIX @rm -r@.
|
24
|
+
* @exists?(path)@: Returns @true@ if the file/directory exists
|
25
|
+
* @mv(srcpath,dstpath)@: Renames/moves the file/directory.
|
26
|
+
* @cp(srcpath,dstpath)@: Works like UNIX @cp -r@.
|
27
|
+
* @mkpath(dirpath)@: Creates a directory and all required parent directories.
|
28
|
+
* @type(path)@: Returns one of "directory", "file", or "symlink".
|
29
|
+
* @entries(dirpath)@: Returns the files/subdirectories in this directory.
|
30
|
+
|
31
|
+
The @File@ object returned by the @open@ methods has the following methods:
|
32
|
+
|
33
|
+
* @read@: Return the contents of the entire file as a string.
|
34
|
+
* @readline@: Return the next line in the file, or nil if there are no more lines.
|
35
|
+
* @write(string)@: Write @string@ to the file.
|
36
|
+
* @close@: Close the file
|
37
|
+
|
38
|
+
h2. Creating an abstraction
|
39
|
+
|
40
|
+
Each abstraction is not expected to catch and rethrow exceptions of the abstracted subsystems. Rather, exceptions should pass through. However, each method should try to be built to behave similarly to the corresponding native ruby @File@ and @FileUtils@ methods.
|
41
|
+
|
42
|
+
h2. Current State
|
43
|
+
|
44
|
+
The factory currently registers three filesystem abstractions: @:file@ (local file system), @:hdfs@ and @:s3@.
|
45
|
+
|
46
|
+
|
47
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
module Swineherd

  #
  # All methods a filesystem should have. Every method except check_paths is
  # an empty stub (returns nil) that an including class is expected to
  # override.
  #
  module BaseFileSystem

    #
    # Return a new instance of 'this' filesystem. Classes that include this
    # module are expected to know how to pull their particular set of arguments
    # from *args and initialize themselves by opening any required connections, &c.
    #
    def initialize *args
    end

    #
    # Open a file in this filesystem. Should return a usable file handle for
    # the given mode (read 'r' or write 'w'). File classes should, at minimum,
    # have the methods defined in BaseFile
    #
    def open path, mode="r", &blk
    end

    #
    # Recursively delete the path and all paths below it.
    #
    def rm path
    end

    #
    # Returns true if the file or path exists and false otherwise.
    #
    def exists? path
    end

    #
    # Moves the source path to the destination path
    #
    def mv srcpath, dstpath
    end

    #
    # Recursively copies all files and directories under srcpath to dstpath
    #
    def cp srcpath, dstpath
    end

    #
    # Make directory path if it does not (partly) exist
    #
    def mkpath path
    end

    #
    # Return file type ("directory" or "file" or "symlink")
    #
    def type path
    end

    #
    # Give contained files/dirs
    #
    def entries dirpath
    end

    #
    # For running tasks idempotently. Returns true if no paths exist, false if
    # all paths exist, and raises a RuntimeError ("Indeterminate output state")
    # when only some of them exist. An empty list of paths returns true.
    #
    # Existence is decided by this filesystem's own exists? method.
    #
    def check_paths paths
      exist_count = paths.count { |path| exists?(path) }
      raise "Indeterminate output state" if exist_count > 0 && exist_count < paths.size
      exist_count == 0
    end

    #
    # Needs to close the filesystem by cleaning up any open connections, &c.
    #
    def close *args
    end

    #
    # Interface of the file objects handed out by a filesystem's open method.
    # All methods are empty stubs.
    #
    class BaseFile
      attr_accessor :path, :scheme, :mode

      def initialize *args, &blk
      end

      #
      # A new file in the filesystem needs to be instantiated with a
      # path, a mode (read 'r' or write 'w').
      #
      def open path, mode="r", &blk
      end

      #
      # Return whole file as a string
      #
      def read
      end

      #
      # Return a line from stream
      #
      def readline
      end

      #
      # Writes a string to the file
      #
      def write string
      end

      #
      # Close the file
      #
      def close *args
      end

    end

  end

end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
#
# Minimal filesystem factory together with the local-disk implementation it
# hands out.
#
class FileSystem

  # A factory function that returns an instance of the requested class.
  # Only the :file scheme is handled here; any other scheme returns nil.
  # Extra arguments are accepted and ignored.
  def self.get(scheme, *args)
    return LocalFileSystem.new if scheme == :file
    nil
  end

  # Local-disk filesystem built on File, Dir and FileUtils.
  class LocalFileSystem

    # Open a file in this filesystem. Returns a LocalFile; see
    # LocalFile#initialize for how an optional block is handled.
    def open(path, mode = "r", &blk)
      LocalFile.new(path, mode, &blk)
    end

    # Works like rm -r
    def rm(path)
      FileUtils.rm_r(path)
    end

    # Does this exist?
    # Uses File.exist?: the File.exists? alias was removed in Ruby 3.2.
    def exists?(path)
      File.exist?(path)
    end

    # Works like UNIX mv
    def mv(srcpath, dstpath)
      FileUtils.mv(srcpath, dstpath)
    end

    # Works like UNIX cp -r
    def cp(srcpath, dstpath)
      FileUtils.cp_r(srcpath, dstpath)
    end

    # Make directory path if it does not (partly) exist
    def mkpath(path)
      FileUtils.mkpath(path)
    end

    # Return file type: "symlink", "directory" or "file"; "unknown" when the
    # path is none of these (e.g. it does not exist). The symlink test runs
    # first, so a symlink is always reported as "symlink".
    def type(path)
      return "symlink"   if File.symlink?(path)
      return "directory" if File.directory?(path)
      return "file"      if File.file?(path)
      "unknown"
    end

    # Give contained files/dirs (as returned by Dir.entries, so "." and ".."
    # are included). Returns nil when dirpath is not a directory.
    def entries(dirpath)
      return nil unless type(dirpath) == "directory"
      Dir.entries(dirpath)
    end

    # Thin wrapper around a File handle opened on the local disk.
    class LocalFile
      attr_accessor :path, :scheme, :mode

      # Opens path with the given mode and keeps the File handle.
      #
      # When a block is given it is called with the raw File handle and the
      # handle is closed afterwards, as with File.open. The wrapper keeps the
      # (closed) handle rather than the block's return value.
      def initialize(path, mode = "r", &blk)
        @path   = path
        @mode   = mode
        @handle = File.open(path, mode)
        if blk
          begin
            blk.call(@handle)
          ensure
            @handle.close unless @handle.closed?
          end
        end
        @handle
      end

      # Re-point this object at another path. Any handle that is still open
      # is closed first so it is not leaked. Returns the new File handle.
      def open(path, mode = "r")
        # Only "r" and "w" modes are supported.
        @handle.close if @handle.respond_to?(:closed?) && !@handle.closed?
        initialize(path, mode)
      end

      # Return whole file as a string
      def read
        @handle.read
      end

      # Return a line from stream, or nil at end of file
      def readline
        @handle.gets
      end

      # Writes to the file
      def write(string)
        @handle.write(string)
      end

      # Close file
      def close
        @handle.close
      end
    end
  end
end
|