swineherd 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,99 @@
1
#!/usr/bin/env ruby

#
# Example workflow: initialize a pagerank graph with pig, iterate pagerank a
# configurable number of times, trim the adjacency list with wukong, merge and
# pull the result down to the local filesystem, then plot it with R.
#
# Paths ('../../lib', 'scripts/...') are relative, so the script is expected
# to be launched from its own directory.
#

$LOAD_PATH << '../../lib'
require 'swineherd'
include Swineherd
require 'swineherd/script'
include Swineherd::Script
require 'swineherd/filesystem'

Settings.define(:flow_id,
                :required    => true,
                :description => "Flow id required to make run of workflow unique")
Settings.define(:iterations,
                :type        => Integer,
                :default     => 10,
                :description => "Number of pagerank iterations to run")
Settings.define(:hadoop_home,
                :default     => '/usr/local/share/hadoop',
                :description => "Path to hadoop config")
Settings.resolve!

flow = Workflow.new(Settings.flow_id) do

  # Filesystem handles used by the tasks below
  remote_fs = Swineherd::FileSystem.get(:hdfs)
  local_fs  = Swineherd::FileSystem.get(:file)

  # One script object per step of the flow
  init_script  = PigScript.new('scripts/pagerank_initialize.pig')
  iter_script  = PigScript.new('scripts/pagerank.pig')
  cut_script   = WukongScript.new('scripts/cut_off_list.rb')
  plot_script  = RScript.new('scripts/histogram.R')

  #
  # First step: a small pig script builds the initial pagerank graph. Being
  # the first step, its input path is given explicitly. The options hash is
  # what gets turned into command-line args for the pig interpreter, and the
  # existence check on the output keeps the task idempotent.
  #
  task :pagerank_initialize do
    init_script.options = {
      :adjlist  => "/tmp/pagerank_example/seinfeld_network.tsv",
      :initgrph => next_output(:pagerank_initialize)
    }
    unless remote_fs.exists?(latest_output(:pagerank_initialize))
      init_script.run(:hadoop)
    end
  end

  #
  # Runs the pagerank pig script Settings.iterations times, feeding each
  # iteration's output in as the next iteration's input.
  #
  task :pagerank_iterate => [:pagerank_initialize] do
    iter_script.options[:damp]           = '0.85f'
    iter_script.options[:curr_iter_file] = latest_output(:pagerank_initialize)
    Settings.iterations.times do
      iter_script.options[:next_iter_file] = next_output(:pagerank_iterate)
      unless remote_fs.exists?(latest_output(:pagerank_iterate))
        iter_script.run(:hadoop)
      end
      iter_script.refresh!
      iter_script.options[:curr_iter_file] = latest_output(:pagerank_iterate)
    end
  end

  #
  # A wukong script drops the last field (a big pig bag of links). Unlike the
  # pig scripts, every wukong script MUST be given an input.
  #
  task :cut_off_adjacency_list => [:pagerank_iterate] do
    cut_script.input  << latest_output(:pagerank_iterate)
    cut_script.output << next_output(:cut_off_adjacency_list)
    unless remote_fs.exists?(latest_output(:cut_off_adjacency_list))
      cut_script.run(:hadoop)
    end
  end

  #
  # Merge the part-000.. files of the previous step into a single result file
  #
  task :merge_results => [:cut_off_adjacency_list] do
    merged_path = next_output(:merge_results)
    unless remote_fs.exists?(merged_path)
      remote_fs.merge(latest_output(:cut_off_adjacency_list), merged_path)
    end
  end

  #
  # Copy the merged result into a local directory with the same structure,
  # eg. #{work_dir}/#{flow_id}/pull_down_results-0.
  #
  # FIXME: Bridging filesystems is kludgy.
  #
  task :pull_down_results => [:merge_results] do
    local_path = next_output(:pull_down_results)
    unless local_fs.exists?(local_path)
      remote_fs.copy_to_local(latest_output(:merge_results), local_path)
    end
  end

  #
  # Plot the 2nd column of the result as a histogram (requires R and ggplot2).
  # The output is a png file even though its name carries no such extension.
  #
  task :plot_results => [:pull_down_results] do
    plot_script.attributes = {
      :pagerank_data => latest_output(:pull_down_results),
      :plot_file     => next_output(:plot_results), # <-- this will be a png...
      :raw_rank      => "aes(x=d$V2)"
    }
    unless local_fs.exists?(latest_output(:plot_results))
      plot_script.run(:hadoop)
    end
  end

end

flow.workdir = "/tmp/pagerank_example"
flow.describe
flow.run(:plot_results)
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'wukong'
5
+
6
#
# Does the very simple job of cutting off the giant adjacency list: only the
# first two fields of every record are passed through, everything after them
# is dropped.
#
class CutMapper < Wukong::Streamer::RecordStreamer
  #
  # args:: the fields of one input record. Presumably (node, rank, out_links)
  #        as written by the pagerank iteration script -- confirm against the
  #        upstream job if the input changes.
  #
  # Yields a two element array holding the first two fields; a missing field
  # comes through as nil.
  #
  def process *args
    node, rank = args
    yield [node, rank]
  end
end
15
+
16
+ Wukong::Script.new(CutMapper, nil).run
@@ -0,0 +1,5 @@
1
# ERB template for an R script: plot_file, pagerank_data and raw_rank are
# substituted before the script is handed to R.
library(ggplot2);
# Send plot output to a png file at the templated path.
png('<%= plot_file %>', width=900, res=132);
# Read the tab separated data; there is no header row, so columns are V1, V2, ...
d <- read.table('<%= pagerank_data %>', header=FALSE, sep='\t');
# raw_rank supplies the aesthetic mapping, i.e. which column gets histogrammed.
p <- ggplot(d, <%= raw_rank %>) + geom_histogram() + xlab("") + ylab("");
p;
@@ -0,0 +1,20 @@
1
--
-- Runs exactly one pagerank iteration
--
-- Parameters:
--   $CURR_ITER_FILE  input graph, rows of (node_a, rank, out_links)
--   $DAMP            damping factor
--   $NEXT_ITER_FILE  where the updated graph is stored
--

-- Current state of the graph: one row per node with its rank and its bag of out links
network = LOAD '$CURR_ITER_FILE' AS (node_a:chararray, rank:float, out_links:bag { link:tuple (node_b:chararray) });

-- Every node splits its rank evenly over its out links: one (target, share) row per link
sent_shares = FOREACH network GENERATE FLATTEN(out_links) AS node_b, (float)(rank / (float)SIZE(out_links)) AS share:float;

-- Keep the adjacency lists so they can be carried into the next iteration
sent_links = FOREACH network GENERATE node_a, out_links;

-- Bring each node's adjacency list together with the shares sent to it. INNER
-- on sent_links should keep only keys that appear as node_a in network, so
-- targets that are not themselves nodes are dropped.
rcvd_shares = COGROUP sent_links BY node_a INNER, sent_shares BY node_b;
next_iter = FOREACH rcvd_shares
  {
    -- total rank received over all in links
    raw_rank = (float)SUM(sent_shares.share);
    -- treat the case that a node has no in links
    -- (such a node, or one whose received rank is <= 1.0e-12, gets rank 0.0)
    damped_rank = ((raw_rank IS NOT NULL AND raw_rank > 1.0e-12f) ? raw_rank*$DAMP + 1.0f - $DAMP : 0.0f);
    GENERATE
      group AS node_a,
      damped_rank AS rank,
      FLATTEN(sent_links.out_links) -- hack, should only be one bag, unbag it
    ;
  };

-- Same (node_a, rank, out_links) layout as the input, ready for another iteration
STORE next_iter INTO '$NEXT_ITER_FILE';
@@ -0,0 +1,24 @@
1
--
-- Create initial graph on which to iterate the pagerank algorithm.
--
-- Parameters:
--   $ADJLIST   input edge list, rows of (node_a, node_b)
--   $INITGRPH  where the initial (node_a, rank, out_links) graph is stored
--

--
-- Generate a unique list of nodes with in links to cogroup on. This allows
-- us to treat the case where nodes have in links but no out links.
--
network = LOAD '$ADJLIST' AS (node_a:chararray, node_b:chararray);
-- every link target, de-duplicated
cut_rhs = FOREACH network GENERATE node_b;
uniq_rhs = DISTINCT cut_rhs;
-- one group per node that appears as a source, as a target, or both
list_links = COGROUP network BY node_a, uniq_rhs BY node_b;
count_links = FOREACH list_links
  {
    -- if network.node_b is empty there are no out links, set to dummy value
    out_links = (IsEmpty(network.node_b) ? {('dummy')} : network.node_b);
    GENERATE
      group AS node_a,
      1.0f AS rank,       -- every node starts with the same rank
      out_links AS out_links
    ;
  };

STORE count_links INTO '$INITGRPH';
@@ -0,0 +1,11 @@
1
+ require 'rubygems'
2
+ require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
3
+ require 'rake'
4
+ require 'gorillib/logger/log'
5
+
6
#
# Top-level namespace. Each component is registered with autoload, so its
# file is only required the first time the constant is referenced.
#
module Swineherd
  [
    [:Template,   'swineherd/template'],
    [:FileSystem, 'swineherd/filesystem'],
    [:Script,     'swineherd/script'],
    [:Workflow,   'swineherd/workflow']
  ].each do |const_name, feature|
    autoload const_name, feature
  end
end
@@ -0,0 +1,26 @@
1
module Swineherd
  autoload :BaseFileSystem,   'swineherd/filesystem/basefilesystem'
  autoload :LocalFileSystem,  'swineherd/filesystem/localfilesystem'
  autoload :HadoopFileSystem, 'swineherd/filesystem/hadoopfilesystem'
  autoload :S3FileSystem,     'swineherd/filesystem/s3filesystem'

  #
  # Factory for the filesystem abstractions, keyed by scheme.
  #
  class FileSystem

    # Maps a scheme name to the class implementing that filesystem.
    FILESYSTEMS = {
      'file' => Swineherd::LocalFileSystem,
      'hdfs' => Swineherd::HadoopFileSystem,
      's3'   => Swineherd::S3FileSystem
    }

    #
    # A factory function that returns an instance of the requested class.
    #
    # scheme:: name of the filesystem, as a Symbol or String (eg. :file, 'hdfs')
    # args::   passed through untouched to the filesystem's constructor
    #
    # Raises a RuntimeError when no filesystem is registered for the scheme.
    # The lookup is checked explicitly instead of rescuing NoMethodError, so
    # that a NoMethodError raised from inside a filesystem's own constructor
    # is no longer misreported as an unknown scheme.
    #
    def self.get scheme, *args
      filesystem_class = FILESYSTEMS[scheme.to_s]
      if filesystem_class.nil?
        raise "Filesystem with scheme #{scheme} does not exist.\n Known schemes: #{FILESYSTEMS.keys.join(', ')}"
      end
      filesystem_class.new(*args)
    end

  end

end
@@ -0,0 +1,47 @@
1
+ h1. File System Abstraction
2
+
3
+ Hackboxen need to access files and directories in order to do their
4
+ stuff. We currently expect them to use at least the following types
5
+ of filesystems:
6
+
7
+ * Local File System
8
+ * Ephemeral Hadoop cluster HDFS
9
+ * s3/HDFS
10
+
11
+ Each of these filesystem types has different methods to accomplish the same operations. In order to make this diversity more easily used by hackboxen, an abstraction layer has been created.
12
+
13
+ h2. Interface
14
+
15
+ A new @FileSystem@ class has a single class method @get@ taking two arguments:
16
+
17
+ * @scheme@: A token which specifies the filesystem scheme. Currently, only @:file@ is supported.
18
+ * @*args@: Optional arguments (e.g. credentials)
19
+
20
+ The returned (abstracted) filesystem instance has the following methods:
21
+
22
+ * @open(path,mode,blk)@: Return a @File@ like file handle object. @mode@ and @blk@ arguments are optional and work like the standard ruby @File.open@ arguments.
23
+ * @rm(path)@: Works like UNIX @rm -r@.
24
+ * @exists?(path)@: Returns @true@ if the file/directory exists
25
+ * @mv(srcpath,dstpath)@: Renames/moves the file/directory.
26
+ * @cp(srcpath,dstpath)@: Works like UNIX @cp -r@.
27
+ * @mkpath(dirpath)@: Creates a directory and all required parent directories.
28
+ * @type(path)@: Returns one of "directory", "file", or "symlink".
29
+ * @entries(dirpath)@: Returns the files/subdirectories in this directory.
30
+
31
+ The @File@ object returned by the @open@ methods has the following methods:
32
+
33
+ * @read@: Return the contents of the entire file as a string.
34
+ * @readline@: Return the next line in the file, or nil if there are no more lines.
35
+ * @write(string)@: Write @string@ to the file.
36
+ * @close@: Close the file
37
+
38
+ h2. Creating an abstraction
39
+
40
+ Each abstraction is not expected to catch and rethrow exceptions of the abstracted subsystems. Rather, exceptions should pass through. However, each method should try to be built to behave similarly to the corresponding native ruby @File@ and @FileUtils@ methods.
41
+
42
+ h2. Current State
43
+
44
+ The only currently implemented filesystem abstraction is @:file@ (local file system).
45
+
46
+
47
+
@@ -0,0 +1,125 @@
1
module Swineherd

  #
  # All methods a filesystem should have. Apart from check_paths every method
  # here is an empty placeholder that returns nil; classes that include this
  # module supply the real implementations.
  #
  module BaseFileSystem

    #
    # Return a new instance of 'this' filesystem. Classes that include this
    # module are expected to know how to pull their particular set of arguments
    # from *args and initialize themselves by opening any required connections, &c.
    #
    def initialize *args
    end

    #
    # Open a file in this filesystem. Should return a usable file handle for
    # the mode (read 'r' or write 'w') given. File classes should, at minimum,
    # have the methods defined in BaseFile
    #
    def open path, mode="r", &blk
    end

    #
    # Recursively delete the path and all paths below it.
    #
    def rm path
    end

    #
    # Returns true if the file or path exists and false otherwise.
    #
    def exists? path
    end

    #
    # Moves the source path to the destination path
    #
    def mv srcpath, dstpath
    end

    #
    # Recursively copies all files and directories under srcpath to dstpath
    #
    def cp srcpath, dstpath
    end

    #
    # Make directory path if it does not (partly) exist
    #
    def mkpath path
    end

    #
    # Return file type ("directory" or "file" or "symlink")
    #
    def type path
    end

    #
    # Give contained files/dirs
    #
    def entries dirpath
    end

    #
    # For running tasks idempotently.
    #
    # paths:: collection of paths, each checked with exists?
    #
    # Returns true if none of the paths exist (this includes an empty
    # collection) and false if all of them exist. Raises a RuntimeError when
    # only some of the paths exist.
    #
    def check_paths paths
      exist_count = paths.count { |path| exists?(path) }
      raise "Indeterminate output state" if (exist_count > 0) && (exist_count < paths.size)
      exist_count == 0
    end

    #
    # Needs to close the filesystem by cleaning up any open connections, &c.
    #
    def close *args
    end

    #
    # All methods a file handle should have. Every method is an empty
    # placeholder that returns nil.
    #
    class BaseFile
      attr_accessor :path, :scheme, :mode

      def initialize *args, &blk
      end

      #
      # A new file in the filesystem needs to be instantiated with a
      # path, a mode (read 'r' or write 'w').
      #
      def open path, mode="r", &blk
      end

      #
      # Return the whole file as a string
      #
      def read
      end

      #
      # Return a line from stream
      #
      def readline
      end

      #
      # Writes a string to the file
      #
      def write string
      end

      #
      # Close the file
      #
      def close *args
      end

    end

  end

end
@@ -0,0 +1,103 @@
1
+ require 'fileutils'
2
+
3
#
# Filesystem abstraction with a single implementation: the local filesystem.
#
class FileSystem

  #
  # A factory function that returns an instance of the requested class.
  #
  # scheme:: :file (or the string 'file') for the local filesystem
  # args::   accepted but not used
  #
  # Returns a LocalFileSystem, or nil for any other scheme.
  #
  def self.get(scheme, *args)
    return LocalFileSystem.new if scheme.to_s == 'file'
    nil
  end

  #
  # Local filesystem, implemented with File, Dir and FileUtils.
  #
  class LocalFileSystem

    # Open a file in this filesystem; returns a LocalFile
    def open(path, mode="r", &blk)
      LocalFile.new(path, mode, &blk)
    end

    # Works like rm -r
    def rm(path)
      FileUtils.rm_r(path)
    end

    # Does this exist? (File.exist? rather than File.exists?, which newer
    # rubies no longer provide)
    def exists?(path)
      File.exist?(path)
    end

    # Works like UNIX mv
    def mv(srcpath, dstpath)
      FileUtils.mv(srcpath, dstpath)
    end

    # Works like UNIX cp -r
    def cp(srcpath, dstpath)
      FileUtils.cp_r(srcpath, dstpath)
    end

    # Make directory path if it does not (partly) exist
    def mkpath(path)
      FileUtils.mkpath(path)
    end

    # Return file type: "symlink", "directory" or "file", or "unknown" when
    # the path is none of these. The symlink check comes first, so a link is
    # always reported as "symlink", whatever it points at.
    def type(path)
      return "symlink"   if File.symlink?(path)
      return "directory" if File.directory?(path)
      return "file"      if File.file?(path)
      "unknown"
    end

    # Give contained files/dirs, or nil when dirpath is not a directory
    def entries(dirpath)
      return nil unless type(dirpath) == "directory"
      Dir.entries(dirpath)
    end

    #
    # Thin wrapper around a File handle
    #
    class LocalFile
      attr_accessor :path, :scheme, :mode

      # Opens path in the given mode.
      #
      # NOTE(review): when a block is passed it goes straight to File.open,
      # which yields the raw File, closes it afterwards and returns the
      # block's value -- so @handle is then not a usable file handle.
      def initialize(path, mode="r", &blk)
        @path   = path
        @mode   = mode
        @handle = File.open(path, mode, &blk)
      end

      # Re-point this object at another path.
      # Only "r" and "w" modes are supported.
      def open(path, mode="r")
        # release the handle we are about to replace instead of leaking it
        @handle.close if @handle.respond_to?(:closed?) && !@handle.closed?
        initialize(path, mode)
      end

      # Return the whole file as a string
      def read
        @handle.read
      end

      # Return a line from stream, or nil at end of file
      def readline
        @handle.gets
      end

      # Writes to the file
      def write(string)
        @handle.write(string)
      end

      # Close file
      def close
        @handle.close
      end
    end
  end
end