wukong 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'wukong/extensions/class'
|
2
|
+
module Wukong

  #
  # Mixin giving a plain class a declared list of named members plus
  # hash-style element access.
  #
  # Including it extends the class with ClassMethods, includes HashLike,
  # declares a class_inheritable_accessor named +members+ and wraps +to_hash+
  # so the result also carries a 'type' key.
  #
  module HashlikeClass
    #
    # Class-level declarations, added to the including class by .included
    #
    module ClassMethods
      #
      # Declare one or more members. The new names (stringified) are placed
      # in front of any already-declared ones, and an attr_accessor is
      # defined for every member in the resulting list.
      #
      def has_members(*members)
        self.members = members.map { |name| name.to_s } + (self.members || [])
        self.members.each { |name| attr_accessor name.to_sym }
      end
      alias_method :has_member, :has_members

      # Hash-flavored name for the member list
      def keys
        members
      end
    end

    # Read a member by name: calls the method named +key+
    def [](key)
      send(key)
    end

    # Write a member by name: calls the "<key>=" method
    def []=(key, val)
      send("#{key}=", val)
    end

    # Member values, in +members+ order.
    # NOTE(review): values_of is not defined here -- presumably supplied by HashLike; confirm.
    def to_a
      values_of(*members)
    end

    # Calls to_flat on each member value and flattens the results into one array
    def to_flat
      to_a.map { |value| value.to_flat }.flatten
    end

    #
    # Hook run when the module is included: wires up the including class
    #
    def self.included(base)
      base.extend ClassMethods
      base.class_eval do
        include HashLike
        class_inheritable_accessor :members

        # The inherited to_hash, with the class name merged in under 'type'
        def to_hash(*args)
          super(*args).merge('type' => self.class.to_s)
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
class Pathname
  # Append a path segment and expand to an absolute path
  #
  #   file = Pathname(Dir.pwd) / "subdir1" / :subdir2 / "filename.ext"
  #
  # @param [Pathname, String, #to_s] path path segment to concatenate with receiver
  #
  # @return [Pathname]
  #   receiver with _path_ appended and expanded to an absolute path
  #
  # @api public
  def /(path)
    # Pathname#+ only accepts Strings and Pathnames; stringify first so that
    # symbols (as in the example above) and other #to_s objects work.
    (self + path.to_s).expand_path
  end

  # Build a Pathname by joining the given segments with File.join
  #
  #   Pathname['/usr', 'local', 'bin']   # => #<Pathname:/usr/local/bin>
  #
  # @return [Pathname] the joined path (not expanded)
  def self.[](*vals)
    new(File.join(vals))
  end
end
|
21
|
+
|
22
|
+
#
# Pathname anchored at the directory that contains a given file
#
#   Subdir['/a/b/file.rb', 'c', 'd']   # => #<Subdir:/a/b/c/d>
#
class Subdir < Pathname
  # The first argument is a file path: its dirname becomes the base, and the
  # remaining arguments are joined onto that base.
  def self.[](*vals)
    anchor, *segments = vals
    new(File.join(File.dirname(anchor), *segments))
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#
|
2
|
+
# String Monkeypatched for processing with wukong: see wukong/extensions/string
|
3
|
+
#
|
4
|
+
String.class_eval do
  # By default, +camelize+ converts strings to UpperCamelCase. If the argument to +camelize+
  # is set to <tt>:lower</tt> (or to a false value) then +camelize+ produces lowerCamelCase.
  #
  # +camelize+ will also convert '/' to '::' which is useful for converting paths to namespaces.
  #
  # Examples:
  #   "active_record".camelize                # => "ActiveRecord"
  #   "active_record".camelize(:lower)        # => "activeRecord"
  #   "active_record/errors".camelize         # => "ActiveRecord::Errors"
  #   "active_record/errors".camelize(:lower) # => "activeRecord::Errors"
  def camelize(first_letter_in_uppercase = true)
    camelized = to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
    # :lower is a truthy symbol, so it has to be checked for explicitly
    return camelized if first_letter_in_uppercase && first_letter_in_uppercase != :lower
    # Keep the receiver's own first character and take the rest from the
    # camelized form; the to_s calls keep the empty string from raising.
    self[0, 1].to_s + camelized[1..-1].to_s
  end

  #
  # The reverse of +camelize+. Makes an underscored, lowercase form from the expression in the string.
  #
  # Changes '::' to '/' to convert namespaces to paths.
  #
  # Examples:
  #   "ActiveRecord".underscore         # => "active_record"
  #   "ActiveRecord::Errors".underscore # => active_record/errors
  #
  # Stolen from active_support
  #
  def underscore
    gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
      gsub(/([a-z\d])([A-Z])/, '\1_\2').
      tr("-", "_").
      downcase
  end

  # Tries to find a constant with the name specified in the argument string:
  #
  #   "Module".constantize     # => Module
  #   "Test::Unit".constantize # => Test::Unit
  #
  # The name is assumed to be the one of a top-level constant, no matter whether
  # it starts with "::" or not. No lexical context is taken into account:
  #
  #   C = 'outside'
  #   module M
  #     C = 'inside'
  #     C               # => 'inside'
  #     "C".constantize # => 'outside', same as ::C
  #   end
  #
  # NameError is raised when the name is not in CamelCase or the constant is
  # unknown.
  def constantize
    # Only a strictly validated constant path ever reaches module_eval below
    unless /\A(?:::)?([A-Z]\w*(?:::[A-Z]\w*)*)\z/ =~ self
      raise NameError, "#{self.inspect} is not a valid constant name!"
    end
    Object.module_eval("::#{$1}", __FILE__, __LINE__)
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'wukong/extensions/hash'
|
2
|
+
require 'wukong/extensions/hash_like'
|
3
|
+
require 'wukong/extensions/symbol'
|
4
|
+
|
5
|
+
#
|
6
|
+
# extensions/struct
|
7
|
+
#
|
8
|
+
# Add several methods to make a struct duck-type much more like a Hash
|
9
|
+
#
|
10
|
+
# Reopen Struct so that every struct picks up the Wukong::HashLike mixin and
# its class answers +keys+ with the member list, as a hash-like class would.
class Struct
  include Wukong::HashLike

  class << self
    # Hash-flavored name for the struct's member names
    def keys
      members
    end
  end
end
|
16
|
+
|
17
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
#
|
2
|
+
# h2. extensions/symbol.rb -- extensions to symbol class
|
3
|
+
#
|
4
|
+
class Symbol
  #
  # Turn the symbol into a simple proc (stolen from
  # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
  #
  #   %w[a b].map(&:upcase)   # => ["A", "B"]
  #
  # Defined only when the running Ruby has no Symbol#to_proc of its own, so a
  # native implementation is never replaced by this one.
  #
  unless method_defined?(:to_proc)
    def to_proc
      Proc.new { |*args| args.shift.__send__(self, *args) }
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Wukong
  # Common logger
  #
  # Set your own at any time with
  #   Wukong.logger = YourAwesomeLogger.new(...)
  # If you have log4r installed you can use
  #   Wukong.logger = Wukong.default_log4r_logger
  #
  # If Wukong.logger is too much typing for you,
  # use the Log constant
  #
  # Default format:
  #     I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
  #
  # @return the logger set with Wukong.logger=, or a lazily-created
  #   default_ruby_logger if none has been set
  def self.logger
    @logger ||= default_ruby_logger
  end

  #
  # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
  # friendly) output lines
  #
  # @param logger_handle [String] name given to the Log4r::Logger
  # @return [Log4r::Logger] logger writing to stderr
  #
  def self.default_log4r_logger(logger_handle = 'wukong')
    require 'log4r'
    require 'time' # Time#iso8601 comes from the 'time' standard library
    lgr       = Log4r::Logger.new logger_handle
    outputter = Log4r::Outputter.stderr
    # Define timestamp formatter method. getutc returns a new Time in UTC;
    # plain #utc would convert the receiver itself in place.
    ::Time.class_eval do def utc_iso8601() getutc.iso8601 ; end ; end
    # 2009-07-25T00:12:05Z INFO PID\t
    outputter.formatter = Log4r::PatternFormatter.new(
      :pattern     => "%d %.4l #{Process.pid}\t%.2000m",
      :date_method => :utc_iso8601
      )
    lgr.outputters = outputter
    lgr
  end

  # Plain standard-library Logger writing to STDERR
  def self.default_ruby_logger
    require 'logger'
    Logger.new STDERR
  end

  # Replace the common logger
  def self.logger= logger
    @logger = logger
  end
end

#
# A convenient logger.
#
# Define NO_WUKONG_LOG (or define Log yourself) to prevent its creation
#
Log = Wukong.logger unless (defined?(Log) || defined?(NO_WUKONG_LOG))
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
module Wukong
  module Models
    #
    # Edge with two Integer-typed fields, :src and :dest
    #
    class Edge < TypedStruct.new(
        *[:src, :dest].map { |field| [field, Integer] }
      )
    end

    #
    # Edge with :src and :dest plus, for each of follows / replies / atsigns /
    # retweets / favorites, one field per direction (a_<rel>_b then
    # b_<rel>_a). Every field is Integer-typed.
    #
    class MultiEdge < TypedStruct.new(
        *([:src, :dest] +
          %w[follows replies atsigns retweets favorites].map { |rel|
            [:"a_#{rel}_b", :"b_#{rel}_a"]
          }.flatten
        ).map { |field| [field, Integer] }
      )
    end

  end
end
|
data/lib/wukong/rdf.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
module Wukong
  #
  # Dump wukong object as RDF triples:
  #
  #   <key>   <attr>   <val>   # <extra>
  #
  # Each element of the triple is XML encoded such that it contains no tab,
  # newline or carriage returns, and the three are tab-separated.  Any extra
  # fields -- reification info, for instance -- are appended as a comment.
  #
  # This makes the result not only a valid RDF triple file but perfectly
  # palatable to Wukong for further processing.
  #
  # NOTE(review): nothing in this module performs the XML encoding described
  # above (:str values are only wrapped in double quotes) -- confirm that
  # callers encode values beforehand.
  #
  module Rdf

    #
    # RDF-formatted date
    #
    # NOTE(review): DateTime.parse_safely is not defined in this file --
    # presumably a project extension; confirm.
    #
    def self.encode_datetime dt
      DateTime.parse_safely(dt).xmlschema
    end

    #
    # Emit a component (subject or object) with the right semantic encoding
    #
    # Use :boolskip if a false property should just be left out.
    #
    # @param val  the raw value
    # @param type [Symbol] one of :tweet, :user, :bool, :boolskip, :int, :date, :str
    # @return [String, nil] the encoded component; nil only for a false :boolskip
    # @raise [RuntimeError] for any other type
    #
    def rdf_component val, type
      case type
      when :tweet    then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
      when :user     then %Q{<http://twitter.com/users/show/#{val}.xml>}
      when :bool     then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
      when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
      when :int      then %Q{"#{val.to_i}"^^<xsd:integer>}
      # encode_datetime is defined on this module (there is no TwitterRdf here)
      when :date     then %Q{"#{Rdf.encode_datetime(val)}"^^<xsd:dateTime>}
      when :str      then %Q{"#{val}"}
      else raise "Don't know how to encode #{type}"
      end
    end

    #
    # Express relationship (predicate) in RDF
    #
    def rdf_pred pred
      case pred
      when :created_at then %Q{<http://twitter.com/##{pred}>}
      else                  %Q{<http://twitter.com/##{pred}>}
      end
    end

    #
    # RDF Triple string for the given (subject, predicate, object)
    # http://www.w3.org/TR/rdf-testcases/#ntriples
    #
    # A non-blank +comment+ is appended after the closing '.' as "\t# comment".
    #
    def self.rdf_triple subj, pred, obj, comment=nil
      comment = "\t# " + comment.to_s unless comment.blank?
      %Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
    end

    # Whether +attr+ can change between scrapes; always false here
    def mutable?(attr)
      false
    end

    #
    # Extract [subject, predicate, object, (extra)] tuples.
    #
    # (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
    #
    # Blank attributes, and components that encode to nil (a false
    # :boolskip), are left out.
    #
    def to_rdf3_tuples
      members_with_types.map do |attr, type|
        next if self[attr].blank?
        subj = rdf_resource
        pred = rdf_pred(attr)
        obj  = rdf_component(self[attr], type)
        next unless obj
        comment = scraped_at if mutable?(attr)
        [subj, pred, obj, comment]
      end.compact
    end

    #
    # Convert an object to rdf triples, one per line.
    #
    # Appends scraped at to #mutable? attributes
    #
    def to_rdf3
      # rdf_triple is a module function of Rdf, so including Rdf does not give
      # the class one; a class-level rdf_triple is used only if one exists.
      formatter = self.class.respond_to?(:rdf_triple) ? self.class : Rdf
      to_rdf3_tuples.map do |tuple|
        formatter.rdf_triple(*tuple)
      end.join("\n")
    end

    # Placeholder: intentionally empty, returns nil
    def to_rdf
    end

  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Wukong
  #
  # Export model's structure for other data frameworks:
  # SQL and Pig
  #
  # The methods call +members+, +mtypes+ and +to_s+ on the receiver.
  # NOTE(review): looks intended to be extended onto a model class; confirm.
  #
  module Schema
    # Placeholder: intentionally empty, returns nil
    def to_sql
    end

    # Export schema as Pig: "member: type" pairs joined by ", ", pairing each
    # entry of +members+ with the +to_pig+ of the matching entry of +mtypes+.
    def to_pig
      pairs  = members.zip(mtypes)
      fields = pairs.map { |(member, type)| member.to_s + ': ' + type.to_pig }
      fields.join(', ')
    end

    # Receiver's name with any leading namespace ("Foo::Bar::") removed
    def pig_klass
      qualified_name = self.to_s
      qualified_name.gsub(/.*::/, '')
    end

    # Pig LOAD statement for this schema. Without a +filename+ the source is
    # the underscored, pluralized pig_klass.
    def pig_load(filename = nil)
      source = filename || pig_klass.underscore.pluralize
      [
        "%-23s" % pig_klass,
        "= LOAD", source,
        "AS ( rsrc:chararray,", to_pig, ')'
      ].join(" ")
    end
  end
end
|
31
|
+
|
32
|
+
# Pig type name for each Ruby class usable as a member type.
# Each +to_pig+ is a class-level method returning the Pig type as a String.

require 'date' # Date must be loaded before its singleton class is opened below

class << Integer ; def to_pig() 'int'       end ; end
# Bignum is a distinct class only on Ruby < 2.4. Later it is an alias of
# Integer (patching it would turn Integer.to_pig into 'long') and from 3.2 on
# it no longer exists, so patch it only where it is a class of its own.
if Object.const_defined?(:Bignum) && !Object.const_get(:Bignum).equal?(Integer)
  class << Bignum ; def to_pig() 'long'     end ; end
end
class << Float   ; def to_pig() 'float'     end ; end
class << String  ; def to_pig() 'chararray' end ; end
# Returns a String like the others: Schema#to_pig concatenates the result, so
# returning the Symbol class itself raised a TypeError there.
class << Symbol  ; def to_pig() 'chararray' end ; end
class << Date    ; def to_pig() 'long'      end ; end
|
@@ -0,0 +1,265 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'wukong/script/hadoop_command'
|
3
|
+
require 'wukong/script/local_command'
|
4
|
+
require 'rbconfig'
|
5
|
+
module Wukong
|
6
|
+
|
7
|
+
# == How to run a Wukong script
|
8
|
+
#
|
9
|
+
# your/script.rb --run path/to/input_files path/to/output_dir
|
10
|
+
#
|
11
|
+
# All of the file paths are HDFS paths ; your script path, of course, is on the local filesystem.
|
12
|
+
#
|
13
|
+
# == Command-line options
|
14
|
+
#
|
15
|
+
# If you'd like to listen for any command-line options, specify them at the
|
16
|
+
# command line:
|
17
|
+
#
|
18
|
+
# your/script.rb --my_bool_opt --my_val_taking_opt=val \
|
19
|
+
# --run path/to/input_files path/to/output_dir
|
20
|
+
#
|
21
|
+
# In this case the options hash for both Mapper and Reducer will contain
|
22
|
+
#
|
23
|
+
# :my_bool_opt => true,
|
24
|
+
# :my_val_taking_opt => 'val'
|
25
|
+
#
|
26
|
+
# == Complicated input paths
|
27
|
+
#
|
28
|
+
# To use more than one file as input, you can use normal * ? [] wildcards or
|
29
|
+
# give a comma-separated list -- see the hadoop documentation for syntax.
|
30
|
+
#
|
31
|
+
# == Run locally (--run=local)
|
32
|
+
#
|
33
|
+
# To run your script locally, use --run=local
|
34
|
+
#
|
35
|
+
# your/script.rb --run=local path/to/input_files path/to/output_dir
|
36
|
+
#
|
37
|
+
# This will pipe the contents of path/to/input_files through first your
|
38
|
+
# mapper, then sort, then the reducer, storing the results in the given output
|
39
|
+
# directory.
|
40
|
+
#
|
41
|
+
# All paths refer to the /local/ filesystem -- hadoop is never involved and in
|
42
|
+
# fact doesn't even have to be installed.
|
43
|
+
#
|
44
|
+
# == How to test your scripts
|
45
|
+
#
|
46
|
+
# You can supply the --map argument in place of --run to run the mapper on its
|
47
|
+
# own (and similarly, --reduce to run the reducer standalone):
|
48
|
+
#
|
49
|
+
# cat ./local/test/input.tsv | ./examples/word_count.rb --map | more
|
50
|
+
#
|
51
|
+
# or, if your test data lies on the HDFS,
|
52
|
+
#
|
53
|
+
# hdp-cat test/input.tsv | ./examples/word_count.rb --map | more
|
54
|
+
#
|
55
|
+
#
|
56
|
+
class Script
|
57
|
+
include Wukong::HadoopCommand
|
58
|
+
include Wukong::LocalCommand
|
59
|
+
attr_accessor :mapper_klass, :reducer_klass, :options
|
60
|
+
|
61
|
+
#
|
62
|
+
# Instantiate the Script with the Mapper and the Reducer class (each a
|
63
|
+
# Wukong::Streamer) it should call back.
|
64
|
+
#
|
65
|
+
#
|
66
|
+
# == Identity or External program as map or reduce
|
67
|
+
#
|
68
|
+
# To use the identity reducer ('cat'), instantiate your Script class with
|
69
|
+
# +nil+ as the reducer class. (And similarly to use an identity mapper,
|
70
|
+
# supply +nil+ for the mapper class.)
|
71
|
+
#
|
72
|
+
# To use an external program as your reducer (mapper), subclass the
|
73
|
+
# reduce_command (map_command) method to return the full command line
|
74
|
+
# expression to call.
|
75
|
+
#
|
76
|
+
# class MyMapper < Wukong::Streamer::Base
|
77
|
+
# # ... awesome stuff ...
|
78
|
+
# end
|
79
|
+
#
|
80
|
+
# class MyScript < Wukong::Script
|
81
|
+
# # prefix each unique line with the count of its occurrences.
|
82
|
+
# def reduce_command
|
83
|
+
# '/usr/bin/uniq -c'
|
84
|
+
# end
|
85
|
+
# end
|
86
|
+
# MyScript.new(MyMapper, nil).run
|
87
|
+
#
|
88
|
+
# Build the script: merge +extra_options+ over default_options, parse ARGV
# into the options hash, then record the mapper and reducer classes.
def initialize(mapper_klass, reducer_klass, extra_options = {})
  self.options = default_options.merge(extra_options)
  process_argv!
  self.mapper_klass  = mapper_klass
  self.reducer_klass = reducer_klass
  # With no reducer class, no :reduce_command and no explicit :reduce_tasks,
  # skip the reduce phase altogether.
  reduce_requested = reducer_klass || options[:reduce_command] || options[:reduce_tasks]
  options[:reduce_tasks] = 0 unless reduce_requested
end
|
96
|
+
|
97
|
+
#
|
98
|
+
# Gives default options. Command line parameters take precedence
|
99
|
+
#
|
100
|
+
# MAKE SURE YOU CALL SUPER: write your script according to the pattern
|
101
|
+
#
|
102
|
+
# super.merge :my_option => :val
|
103
|
+
#
|
104
|
+
# The :runner_defaults entry of Wukong::CONFIG, or an empty hash when that
# entry is nil/false.
def default_options
  configured = Wukong::CONFIG[:runner_defaults]
  configured || {}
end
|
107
|
+
|
108
|
+
# Options that don't need to go in the :all_args hash
|
109
|
+
# Option names kept out of options[:all_args]: the run-phase flags plus every
# key of HADOOP_OPTIONS_MAP. Computed once and memoized.
def std_options
  @std_options ||= [:run, :map, :reduce].concat(HADOOP_OPTIONS_MAP.keys)
end
|
112
|
+
|
113
|
+
#
|
114
|
+
# Parse the command-line args into the options hash.
|
115
|
+
#
|
116
|
+
# I should not reinvent the wheel.
|
117
|
+
# Yet: here we are.
|
118
|
+
#
|
119
|
+
# '--foo=foo_val' produces :foo => 'foo_val' in the options hash.
|
120
|
+
# '--' After seeing a non-'--' flag, or a '--' on its own, no further flags are parsed
|
121
|
+
#
|
122
|
+
# options[:all_args] contains all arguments that are not in std_options
|
123
|
+
# options[:rest] contains all arguments following the first non-flag (or the '--')
|
124
|
+
#
|
125
|
+
# Parse a copy of ARGV into the options hash (ARGV itself is left untouched).
#
# * '--foo=val' sets options[:foo] = 'val'; a bare '--foo' sets it to true
# * a '--' on its own, or the first argument that is not a '--flag', ends
#   flag parsing
#
# Afterwards options[:all_args] is the space-joined list of flags that are
# not in std_options, and options[:rest] is the array of arguments left after
# flag parsing stopped ([] when every argument was a flag).
#
# @return [Array<String>] the same array stored in options[:rest]
def process_argv!
  passthrough = []
  args = ARGV.dup
  # Stop once the arguments run out: shifting an empty array yields nil,
  # which would otherwise be pushed back and show up in options[:rest].
  until args.empty?
    arg = args.shift
    case arg
    when '--'
      break
    when /\A--(\w+)(?:=(.+))?\z/
      opt = $1.to_sym
      options[opt] = $2 || true
      passthrough << arg unless std_options.include?(opt)
    else
      # first non-flag: put it back and leave the remainder for :rest
      args.unshift(arg)
      break
    end
  end
  options[:all_args] = passthrough.join(" ")
  options[:rest]     = args
end
|
146
|
+
|
147
|
+
# Real (symlink-resolved, absolute) path of the currently running script
def this_script_filename
  script = Pathname.new($PROGRAM_NAME)
  script.realpath
end
|
150
|
+
|
151
|
+
# Real path of the ruby interpreter running this script, assembled from
# RbConfig's "bindir", "RUBY_INSTALL_NAME" and "EXEEXT" entries.
#
# Reads RbConfig::CONFIG: the bare Config constant is the legacy alias and is
# not available on current rubies.
#
# @return [Pathname]
def ruby_interpreter_path
  config     = RbConfig::CONFIG
  executable = config["RUBY_INSTALL_NAME"] + config["EXEEXT"].to_s
  Pathname.new(File.join(config["bindir"], executable)).realpath
end
|
158
|
+
|
159
|
+
#
|
160
|
+
# by default, call this script in --map mode
|
161
|
+
#
|
162
|
+
# Shell command for the map phase.
#
# With a mapper class: this same script re-invoked with --map followed by the
# pass-through args. Without one: options[:map_command], falling back to
# Wukong::CONFIG[:default_mapper].
def map_command
  if mapper_klass
    return "#{ruby_interpreter_path} #{this_script_filename} --map " + options[:all_args]
  end
  options[:map_command] || Wukong::CONFIG[:default_mapper]
end
|
168
|
+
|
169
|
+
#
|
170
|
+
# Shell command for reduce phase
|
171
|
+
# by default, call this script in --reduce mode
|
172
|
+
#
|
173
|
+
# Shell command for the reduce phase.
#
# With a reducer class: this same script re-invoked with --reduce followed by
# the pass-through args. Without one: options[:reduce_command], falling back
# to Wukong::CONFIG[:default_reducer].
def reduce_command
  if reducer_klass
    return "#{ruby_interpreter_path} #{this_script_filename} --reduce " + options[:all_args]
  end
  options[:reduce_command] || Wukong::CONFIG[:default_reducer]
end
|
179
|
+
|
180
|
+
#
|
181
|
+
# Shell command to re-run in mapreduce mode using --map and --reduce
|
182
|
+
#
|
183
|
+
# Shell command that runs the whole job, chosen by run_mode: local_command
# for 'local', hadoop_command for 'hadoop' or 'mapred'. Announces the chosen
# mode on stderr.
#
# @raise [RuntimeError] for any other run mode
def runner_command(input_path, output_path)
  mode = run_mode
  if mode == 'local'
    $stderr.puts " Reading STDIN / Writing STDOUT"
    local_command input_path, output_path
  elsif %w[hadoop mapred].include?(mode)
    $stderr.puts " Launching hadoop as"
    hadoop_command input_path, output_path
  else
    raise "Need to use --run=local or --run=hadoop; or to use the :default_run_mode in config.yaml just say --run "
  end
end
|
196
|
+
|
197
|
+
# Run mode as a String. A bare --run (stored as true) is first replaced, in
# the options hash itself, by Wukong::CONFIG[:default_run_mode].
def run_mode
  requested = options[:run]
  options[:run] = Wukong::CONFIG[:default_run_mode] if requested == true
  options[:run].to_s
end
|
202
|
+
|
203
|
+
# Input and output paths: the first two entries of options[:rest].
#
# @return [Array] [input_path, output_path]
# @raise [RuntimeError] when either path is blank, unless options[:fake] is set
def input_output_paths
  input_path, output_path = options[:rest][0..1]
  unless options[:fake]
    if input_path.blank? || output_path.blank?
      raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
    end
  end
  [input_path, output_path]
end
|
209
|
+
|
210
|
+
# When --overwrite or --rm was given and the run mode is not 'local', announce
# on stderr and remove +output_path+ by shelling out to hdp-rm -r.
# Does nothing (returns nil) otherwise.
def maybe_overwrite_output_paths!(output_path)
  return unless options[:overwrite] || options[:rm]
  return if run_mode == 'local'
  $stderr.puts "Removing output file #{output_path}"
  `hdp-rm -r '#{output_path}'`
end
|
216
|
+
|
217
|
+
#
|
218
|
+
# Execute the runner phase
|
219
|
+
#
|
220
|
+
# Execute the runner phase: resolve the input/output paths, optionally clear
# the output path, build the runner command and echo it to stderr. Unless
# options[:fake] is set, the command is then run through the shell and its
# captured output is written to stdout.
def exec_hadoop_streaming
  $stderr.puts "Streaming on self"
  input_path, output_path = input_output_paths
  maybe_overwrite_output_paths! output_path
  command = runner_command(input_path, output_path)
  $stderr.puts command
  $stdout.puts `#{command}` unless options[:fake]
end
|
230
|
+
|
231
|
+
#
|
232
|
+
# If --map or --reduce, dispatch to the mapper or reducer.
|
233
|
+
# Otherwise,
|
234
|
+
#
|
235
|
+
# Entry point. The first of these options that is set decides what happens:
#
# * :map    -- instantiate the mapper class with the options and stream
# * :reduce -- instantiate the reducer class with the options and stream
# * :run    -- launch the full job (exec_hadoop_streaming)
#
# With none of them set, print the usage message.
def run
  if options[:map]
    mapper_klass.new(options).stream
  elsif options[:reduce]
    reducer_klass.new(options).stream
  elsif options[:run]
    exec_hadoop_streaming
  else
    help # Norman Vincent Peale is proud of you
  end
end
|
247
|
+
|
248
|
+
#
|
249
|
+
# Command line usage
|
250
|
+
#
|
251
|
+
# Command line usage: writes the script's class name followed by a usage
# summary of the --run / --map / --reduce invocations to stderr.
def help
  $stderr.puts "#{self.class} script"
  # $0 is interpolated so each usage line shows the script as it was invoked
  $stderr.puts %Q{
#{$0} --run=hadoop input_hdfs_path output_hdfs_dir # run the script with hadoop streaming
#{$0} --run=local input_hdfs_path output_hdfs_dir # run the script on local filesystem using unix pipes
#{$0} --run input_hdfs_path output_hdfs_dir # run the script with the mode given in config/wukong*.yaml
#{$0} --map
#{$0} --reduce # dispatch to the mapper or reducer

You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
}
end
|
263
|
+
end
|
264
|
+
|
265
|
+
end
|