wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
require 'wukong/extensions/class'
|
|
2
|
+
module Wukong
|
|
3
|
+
|
|
4
|
+
module HashlikeClass
|
|
5
|
+
module ClassMethods
|
|
6
|
+
def has_members *members
|
|
7
|
+
self.members ||= []
|
|
8
|
+
self.members = members.map(&:to_s) + self.members
|
|
9
|
+
self.members.each do |member|
|
|
10
|
+
attr_accessor member.to_sym
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
alias_method :has_member, :has_members
|
|
14
|
+
def keys
|
|
15
|
+
members
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def [](key)
|
|
20
|
+
self.send(key)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def []=(key, val)
|
|
24
|
+
self.send("#{key}=", val)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def to_a
|
|
28
|
+
values_of(*members)
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def to_flat
|
|
32
|
+
to_a.map(&:to_flat).flatten
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def self.included base
|
|
36
|
+
base.class_eval do
|
|
37
|
+
extend ClassMethods
|
|
38
|
+
include HashLike
|
|
39
|
+
class_inheritable_accessor :members
|
|
40
|
+
|
|
41
|
+
def to_hash *args
|
|
42
|
+
super(*args).merge 'type' => self.class.to_s
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require 'pathname'
|
|
2
|
+
class Pathname
  # Append a path segment and expand the result to an absolute path.
  #
  #   file = Pathname(Dir.pwd) / "subdir1" / :subdir2 / "filename.ext"
  #
  # @param [Pathname, String, #to_s] path path segment to concatenate with receiver
  #
  # @return [Pathname]
  #   receiver with _path_ appended and expanded to an absolute path
  #
  # @api public
  def /(path)
    # Pathname#+ rejects operands such as Symbol, so coerce to a String first;
    # this is what makes the :subdir2 example above work.
    (self + path.to_s).expand_path
  end

  # Build a Pathname from path segments joined with File.join.
  #
  #   Pathname["usr", "local"]   # => #<Pathname:usr/local>
  #
  # @param [Array<String>] vals path segments
  #
  # @return [Pathname] the joined (not expanded) path
  def self.[](*vals)
    new(File.join(vals))
  end
end
|
|
21
|
+
|
|
22
|
+
class Subdir < Pathname
  # Build a path relative to the directory that contains the first argument.
  #
  #   Subdir["/a/b/file.rb", "c"]   # => #<Subdir:/a/b/c>
  #
  # @param [Array<String>] vals a file path followed by segments to append
  #   to that file's directory
  #
  # @return [Subdir]
  def self.[](*vals)
    anchor, *segments = vals
    new(File.join(File.dirname(anchor), *segments))
  end
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#
|
|
2
|
+
# String Monkeypatched for processing with wukong: see wukong/extensions/string
|
|
3
|
+
#
|
|
4
|
+
String.class_eval do
|
|
5
|
+
# By default, +camelize+ converts strings to UpperCamelCase. If the argument to +camelize+
|
|
6
|
+
# is set to <tt>:lower</tt> then +camelize+ produces lowerCamelCase.
|
|
7
|
+
#
|
|
8
|
+
# +camelize+ will also convert '/' to '::' which is useful for converting paths to namespaces.
|
|
9
|
+
#
|
|
10
|
+
# Examples:
|
|
11
|
+
# "active_record".camelize # => "ActiveRecord"
|
|
12
|
+
# "active_record".camelize(:lower) # => "activeRecord"
|
|
13
|
+
# "active_record/errors".camelize # => "ActiveRecord::Errors"
|
|
14
|
+
# "active_record/errors".camelize(:lower) # => "activeRecord::Errors"
|
|
15
|
+
# Convert an underscored string to CamelCase ('/' becomes '::').
#
# @param first_letter_in_uppercase [true, false, nil, :lower]
#   any truthy value other than :lower gives UpperCamelCase;
#   :lower, false or nil keep the receiver's first character as-is.
#
# @return [String]
def camelize(first_letter_in_uppercase = true)
  upper_camel = to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
  # :lower is a truthy Symbol, so it has to be tested for explicitly.
  return upper_camel if first_letter_in_uppercase && first_letter_in_uppercase != :lower
  # Keep the original first character and take the rest from the
  # camelized form. The trailing to_s covers the empty string, where the
  # slice is nil.
  to_s[0, 1] + upper_camel[1..-1].to_s
end
|
|
22
|
+
|
|
23
|
+
#
|
|
24
|
+
# The reverse of +camelize+. Makes an underscored, lowercase form from the expression in the string.
|
|
25
|
+
#
|
|
26
|
+
# Changes '::' to '/' to convert namespaces to paths.
|
|
27
|
+
#
|
|
28
|
+
# Examples:
|
|
29
|
+
# "ActiveRecord".underscore # => "active_record"
|
|
30
|
+
# "ActiveRecord::Errors".underscore # => active_record/errors
|
|
31
|
+
#
|
|
32
|
+
# Stolen from active_support
|
|
33
|
+
#
|
|
34
|
+
# Produce the underscored, lowercase form of a CamelCase string;
# '::' becomes '/' and '-' becomes '_'.
#
# @return [String]
def underscore
  as_path = gsub(/::/, '/')
  # Split an acronym run from the capitalized word that follows it.
  acronyms_split = as_path.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
  # Split at each lower-case/digit to upper-case boundary.
  words_split = acronyms_split.gsub(/([a-z\d])([A-Z])/, '\1_\2')
  words_split.tr("-", "_").downcase
end
|
|
41
|
+
|
|
42
|
+
# Tries to find a constant with the name specified in the argument string:
|
|
43
|
+
#
|
|
44
|
+
# "Module".constantize # => Module
|
|
45
|
+
# "Test::Unit".constantize # => Test::Unit
|
|
46
|
+
#
|
|
47
|
+
# The name is assumed to be the one of a top-level constant, no matter whether
|
|
48
|
+
# it starts with "::" or not. No lexical context is taken into account:
|
|
49
|
+
#
|
|
50
|
+
# C = 'outside'
|
|
51
|
+
# module M
|
|
52
|
+
# C = 'inside'
|
|
53
|
+
# C # => 'inside'
|
|
54
|
+
# "C".constantize # => 'outside', same as ::C
|
|
55
|
+
# end
|
|
56
|
+
#
|
|
57
|
+
# NameError is raised when the name is not in CamelCase or the constant is
|
|
58
|
+
# unknown.
|
|
59
|
+
# Look up the top-level constant named by the receiver.
#
# @return [Object] the constant's value
# @raise [NameError] if the string is not a CamelCase constant path, or the
#   constant does not exist
def constantize
  matched = /\A(?:::)?([A-Z]\w*(?:::[A-Z]\w*)*)\z/.match(self)
  raise NameError, "#{self.inspect} is not a valid constant name!" if matched.nil?
  # The leading '::' forces resolution from the top level, ignoring any
  # lexical scope.
  Object.module_eval("::#{matched[1]}", __FILE__, __LINE__)
end
|
|
65
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'wukong/extensions/hash'
|
|
2
|
+
require 'wukong/extensions/hash_like'
|
|
3
|
+
require 'wukong/extensions/symbol'
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# extensions/struct
|
|
7
|
+
#
|
|
8
|
+
# Add several methods to make a struct duck-type much more like a Hash
|
|
9
|
+
#
|
|
10
|
+
Struct.class_eval do
|
|
11
|
+
include Wukong::HashLike
|
|
12
|
+
def self.keys
|
|
13
|
+
members
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. extensions/symbol.rb -- extensions to symbol class
|
|
3
|
+
#
|
|
4
|
+
class Symbol
  #
  # Turn the symbol into a simple proc (stolen from
  # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
  #
  # Installed only when the interpreter does not already provide
  # Symbol#to_proc, so a native implementation is never replaced by this
  # shim.
  unless method_defined?(:to_proc)
    def to_proc
      Proc.new { |*args| args.shift.__send__(self, *args) }
    end
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
module Wukong
  # Common logger
  #
  # Set your own at any time with
  #   Wukong.logger = YourAwesomeLogger.new(...)
  # If you have log4r installed you can use
  #   Wukong.logger = Wukong.default_log4r_logger
  #
  # If Wukong.logger is too much typing for you,
  # use the Log constant
  #
  # Default format:
  #   I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
  #
  # @return [Object] the current logger; created with default_ruby_logger on
  #   first use
  def self.logger
    @logger ||= default_ruby_logger
  end

  #
  # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
  # friendly) output lines
  #
  # @param logger_handle [String] name handed to Log4r::Logger.new
  # @return [Log4r::Logger] logger writing to stderr
  #
  def self.default_log4r_logger logger_handle='wukong'
    require 'log4r'
    require 'time' # provides Time#iso8601, used by the formatter method below
    lgr = Log4r::Logger.new logger_handle
    outputter = Log4r::Outputter.stderr
    # Define timestamp formatter method. getutc returns a new Time, whereas
    # utc would convert the receiver in place.
    ::Time.class_eval do def utc_iso8601() getutc.iso8601 ; end ; end
    # 2009-07-25T00:12:05Z INFO PID\t
    outputter.formatter = Log4r::PatternFormatter.new(
      :pattern => "%d %.4l #{Process.pid}\t%.2000m",
      :date_method => :utc_iso8601
      )
    lgr.outputters = outputter
    lgr
  end

  # Plain stdlib Logger writing to STDERR.
  #
  # @return [Logger]
  def self.default_ruby_logger
    require 'logger'
    Logger.new STDERR
  end

  # Replace the logger returned by Wukong.logger.
  #
  # @param logger [Object] the logger to use from now on
  def self.logger= logger
    @logger = logger
  end
end
|
|
47
|
+
|
|
48
|
+
#
|
|
49
|
+
# A convenient logger.
|
|
50
|
+
#
|
|
51
|
+
# Define NO_WUKONG_LOG (or define Log yourself) to prevent its creation
|
|
52
|
+
#
|
|
53
|
+
Log = Wukong.logger unless (defined?(Log) || defined?(NO_WUKONG_LOG))
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
module Wukong
|
|
3
|
+
module Models
|
|
4
|
+
class Edge < TypedStruct.new(
|
|
5
|
+
[:src, Integer],
|
|
6
|
+
[:dest, Integer]
|
|
7
|
+
)
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
class MultiEdge < TypedStruct.new(
|
|
11
|
+
[:src, Integer],
|
|
12
|
+
[:dest, Integer],
|
|
13
|
+
[:a_follows_b, Integer],
|
|
14
|
+
[:b_follows_a, Integer],
|
|
15
|
+
[:a_replies_b, Integer],
|
|
16
|
+
[:b_replies_a, Integer],
|
|
17
|
+
[:a_atsigns_b, Integer],
|
|
18
|
+
[:b_atsigns_a, Integer],
|
|
19
|
+
[:a_retweets_b, Integer],
|
|
20
|
+
[:b_retweets_a, Integer],
|
|
21
|
+
[:a_favorites_b, Integer],
|
|
22
|
+
[:b_favorites_a, Integer]
|
|
23
|
+
)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
end
|
|
27
|
+
end
|
data/lib/wukong/rdf.rb
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
module Wukong
  #
  # Dump wukong object as RDF triples:
  #
  #     <key>  <attr>  <val> # <extra>
  #
  # Each element of the triple is XML encoded such that it contains no tab,
  # newline or carriage returns, and the three are tab-separated.  Any extra
  # fields -- reification info, for instance -- are appended as a comment.
  #
  # This makes the result not only a valid RDF triple file but perfectly
  # palatable to Wukong for further processing.
  #
  # The including object is expected to supply +members_with_types+,
  # +rdf_resource+, +[]+ and (for mutable attributes) +scraped_at+; none of
  # them is defined in this module.
  #
  module Rdf

    #
    # RDF-formatted date
    #
    # @param dt value handed to DateTime.parse_safely
    # @return the +xmlschema+ rendering of the parsed date
    #
    def self.encode_datetime(dt)
      DateTime.parse_safely(dt).xmlschema
    end

    #
    # Emit a component (subject or object) with the right semantic encoding
    #
    # Use :boolskip if a false property should just be left out.
    #
    # @param val the raw value
    # @param type [Symbol] one of :tweet, :user, :bool, :boolskip, :int,
    #   :date, :str
    # @return [String, nil] the encoded component; nil only for a false
    #   :boolskip value
    # @raise [RuntimeError] for an unknown type
    #
    def rdf_component(val, type)
      case type
      when :tweet      then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
      when :user       then %Q{<http://twitter.com/users/show/#{val}.xml>}
      when :bool       then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
      when :boolskip   then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
      when :int        then %Q{"#{val.to_i}"^^<xsd:integer>}
      # encode_datetime is defined on this module
      when :date       then %Q{"#{Rdf.encode_datetime(val)}"^^<xsd:dateTime>}
      when :str        then %Q{"#{val}"}
      else raise "Don't know how to encode #{type}"
      end
    end

    #
    # Express relationship (predicate) in RDF
    #
    # @param pred [Symbol, #to_s] attribute name
    # @return [String]
    #
    def rdf_pred(pred)
      %Q{<http://twitter.com/##{pred}>}
    end

    #
    # RDF Triple string for the given (subject, object, predicate)
    #   http://www.w3.org/TR/rdf-testcases/#ntriples
    #
    # @param comment optional extra text, appended as a tab-separated
    #   "# ..." comment after the terminating dot
    # @return [String] one tab-separated, column-padded line
    #
    def self.rdf_triple(subj, pred, obj, comment=nil)
      comment = "\t# " + comment.to_s unless comment.blank?
      %Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
    end

    # Whether an attribute's value can change over time; always false here.
    def mutable?(attr)
      false
    end

    #
    # Extract [subject, predicate, object, (extra)] tuples.
    #
    # (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
    #
    # Attributes whose value is blank, or whose component encodes to nil,
    # are left out.
    #
    def to_rdf3_tuples
      members_with_types.map do |attr, type|
        next if self[attr].blank?
        subj = rdf_resource
        pred = rdf_pred(attr)
        obj  = rdf_component(self[attr], type) or next
        comment = scraped_at if mutable?(attr)
        [subj, pred, obj, comment]
      end.compact
    end

    #
    # Convert an object to an rdf triple.
    #
    # Appends scraped at to #mutable? attributes
    #
    # @return [String] one line per tuple, newline-separated
    #
    def to_rdf3
      to_rdf3_tuples.map do |tuple|
        # rdf_triple is a module function of Rdf and takes the tuple's
        # elements as separate arguments.
        Rdf.rdf_triple(*tuple)
      end.join("\n")
    end

    # Placeholder: has no body and returns nil.
    def to_rdf
    end

  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
module Wukong
|
|
2
|
+
#
|
|
3
|
+
# Export model's structure for other data frameworks:
|
|
4
|
+
# SQL and Pig
|
|
5
|
+
#
|
|
6
|
+
module Schema
|
|
7
|
+
def to_sql
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Export schema as Pig
|
|
12
|
+
def to_pig
|
|
13
|
+
members.zip(mtypes).map do |member, type|
|
|
14
|
+
member.to_s + ': ' + type.to_pig
|
|
15
|
+
end.join(', ')
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def pig_klass
|
|
19
|
+
self.to_s.gsub(/.*::/, '')
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def pig_load filename=nil
|
|
23
|
+
cmd = [
|
|
24
|
+
"%-23s" % pig_klass,
|
|
25
|
+
"= LOAD", filename || pig_klass.underscore.pluralize,
|
|
26
|
+
"AS ( rsrc:chararray,", self.to_pig, ')',
|
|
27
|
+
].join(" ")
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
require 'date' # Date is referenced below and is not loaded by default

# Pig type name for each Ruby class used as a field type: every class
# answers to_pig at the class level.
class << Integer ; def to_pig() 'int'       end ; end
# Bignum no longer exists on current Rubies, and where it is merely an alias
# of Integer, reopening it would overwrite Integer.to_pig with 'long'. Only
# define it when Bignum is a distinct class.
if Object.const_defined?(:Bignum) && !Object.const_get(:Bignum).equal?(Integer)
  class << Object.const_get(:Bignum) ; def to_pig() 'long' end ; end
end
class << Float   ; def to_pig() 'float'     end ; end
class << String  ; def to_pig() 'chararray' end ; end
class << Symbol  ; def to_pig() self        end ; end
class << Date    ; def to_pig() 'long'      end ; end
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
require 'pathname'
|
|
2
|
+
require 'wukong/script/hadoop_command'
|
|
3
|
+
require 'wukong/script/local_command'
|
|
4
|
+
require 'rbconfig'
|
|
5
|
+
module Wukong
|
|
6
|
+
|
|
7
|
+
# == How to run a Wukong script
|
|
8
|
+
#
|
|
9
|
+
# your/script.rb --run path/to/input_files path/to/output_dir
|
|
10
|
+
#
|
|
11
|
+
# All of the file paths are HDFS paths ; your script path, of course, is on the local filesystem.
|
|
12
|
+
#
|
|
13
|
+
# == Command-line options
|
|
14
|
+
#
|
|
15
|
+
# If you'd like to listen for any command-line options, specify them at the
|
|
16
|
+
# command line:
|
|
17
|
+
#
|
|
18
|
+
# your/script.rb --my_bool_opt --my_val_taking_opt=val \
|
|
19
|
+
# --run path/to/input_files path/to/output_dir
|
|
20
|
+
#
|
|
21
|
+
# In this case the options hash for both Mapper and Reducer will contain
|
|
22
|
+
#
|
|
23
|
+
# :my_bool_opt => true,
|
|
24
|
+
# :my_val_taking_opt => 'val'
|
|
25
|
+
#
|
|
26
|
+
# == Complicated input paths
|
|
27
|
+
#
|
|
28
|
+
# To use more than one file as input, you can use normal * ? [] wildcards or
|
|
29
|
+
# give a comma-separated list -- see the hadoop documentation for syntax.
|
|
30
|
+
#
|
|
31
|
+
# == Run locally (--run=local)
|
|
32
|
+
#
|
|
33
|
+
# To run your script locally, use --run=local
|
|
34
|
+
#
|
|
35
|
+
# your/script.rb --run=local path/to/input_files path/to/output_dir
|
|
36
|
+
#
|
|
37
|
+
# This will pipe the contents of path/to/input_files through first your
|
|
38
|
+
# mapper, then sort, then the reducer, storing the results in the given output
|
|
39
|
+
# directory.
|
|
40
|
+
#
|
|
41
|
+
# All paths refer to the /local/ filesystem -- hadoop is never involved and in
|
|
42
|
+
# fact doesn't even have to be installed.
|
|
43
|
+
#
|
|
44
|
+
# == How to test your scripts
|
|
45
|
+
#
|
|
46
|
+
# You can supply the --map argument in place of --run to run the mapper on its
|
|
47
|
+
# own (and similarly, --reduce to run the reducer standalone):
|
|
48
|
+
#
|
|
49
|
+
# cat ./local/test/input.tsv | ./examples/word_count.rb --map | more
|
|
50
|
+
#
|
|
51
|
+
# or, if your test data lies on the HDFS,
|
|
52
|
+
#
|
|
53
|
+
# hdp-cat test/input.tsv | ./examples/word_count.rb --map | more
|
|
54
|
+
#
|
|
55
|
+
#
|
|
56
|
+
class Script
|
|
57
|
+
include Wukong::HadoopCommand
|
|
58
|
+
include Wukong::LocalCommand
|
|
59
|
+
attr_accessor :mapper_klass, :reducer_klass, :options
|
|
60
|
+
|
|
61
|
+
#
|
|
62
|
+
# Instantiate the Script with the Mapper and the Reducer class (each a
|
|
63
|
+
# Wukong::Streamer) it should call back.
|
|
64
|
+
#
|
|
65
|
+
#
|
|
66
|
+
# == Identity or External program as map or reduce
|
|
67
|
+
#
|
|
68
|
+
# To use the identity reducer ('cat'), instantiate your Script class with
|
|
69
|
+
# +nil+ as the reducer class. (And similarly to use an identity mapper,
|
|
70
|
+
# supply +nil+ for the mapper class.)
|
|
71
|
+
#
|
|
72
|
+
# To use an external program as your reducer (mapper), subclass the
|
|
73
|
+
# reduce_command (map_command) method to return the full command line
|
|
74
|
+
# expression to call.
|
|
75
|
+
#
|
|
76
|
+
# class MyMapper < Wukong::Streamer::Base
|
|
77
|
+
# # ... awesome stuff ...
|
|
78
|
+
# end
|
|
79
|
+
#
|
|
80
|
+
# class MyScript < Wukong::Script
|
|
81
|
+
# # prefix each unique line with the count of its occurrences.
|
|
82
|
+
# def reduce_command
|
|
83
|
+
# '/usr/bin/uniq -c'
|
|
84
|
+
# end
|
|
85
|
+
# end
|
|
86
|
+
# MyScript.new(MyMapper, nil).run
|
|
87
|
+
#
|
|
88
|
+
def initialize mapper_klass, reducer_klass, extra_options={}
|
|
89
|
+
self.options = default_options.merge(extra_options)
|
|
90
|
+
process_argv!
|
|
91
|
+
self.mapper_klass = mapper_klass
|
|
92
|
+
self.reducer_klass = reducer_klass
|
|
93
|
+
# If no reducer_klass and no reduce_command, then skip the reduce phase
|
|
94
|
+
options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
#
|
|
98
|
+
# Gives default options. Command line parameters take precedence
|
|
99
|
+
#
|
|
100
|
+
# MAKE SURE YOU CALL SUPER: write your script according to the pattern
|
|
101
|
+
#
|
|
102
|
+
# super.merge :my_option => :val
|
|
103
|
+
#
|
|
104
|
+
def default_options
|
|
105
|
+
Wukong::CONFIG[:runner_defaults] || {}
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Options that don't need to go in the :all_args hash
|
|
109
|
+
def std_options
|
|
110
|
+
@std_options ||= [:run, :map, :reduce, ] + HADOOP_OPTIONS_MAP.keys
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
#
|
|
114
|
+
# Parse the command-line args into the options hash.
|
|
115
|
+
#
|
|
116
|
+
# I should not reinvent the wheel.
|
|
117
|
+
# Yet: here we are.
|
|
118
|
+
#
|
|
119
|
+
# '--foo=foo_val' produces :foo => 'foo_val' in the options hash.
|
|
120
|
+
# '--' After seeing a non-'--' flag, or a '--' on its own, no further flags are parsed
|
|
121
|
+
#
|
|
122
|
+
# options[:all_args] contains all arguments that are not in std_options
|
|
123
|
+
# options[:rest] contains all arguments following the first non-flag (or the '--')
|
|
124
|
+
#
|
|
125
|
+
# Parse ARGV (a copy; ARGV itself is untouched) into the options hash.
#
# * '--foo=val' sets options[:foo] = 'val'; a bare '--foo' sets it to true.
# * Parsing stops at a lone '--' (which is dropped) or at the first
#   argument that is not a '--flag' (which is kept).
# * options[:all_args] becomes a space-joined String of the flags that are
#   not in std_options.
# * options[:rest] becomes the Array of arguments left after parsing stops;
#   it is empty when every argument was a flag.
def process_argv!
  options[:all_args] = []
  args = ARGV.dup
  # Loop on emptiness: an Array is always truthy, so looping on the Array
  # itself would shift a nil off the empty list and leave it in :rest.
  until args.empty?
    arg = args.shift
    case
    when arg == '--'
      break
    when arg =~ /\A--(\w+)(?:=(.+))?\z/
      opt = $1.to_sym
      val = $2 || true
      self.options[opt] = val
      options[:all_args] << arg unless std_options.include?(opt)
    else
      # First non-flag: put it back so it is part of :rest.
      args.unshift(arg)
      break
    end
  end
  options[:all_args] = options[:all_args].join(" ")
  options[:rest] = args
end
|
|
146
|
+
|
|
147
|
+
def this_script_filename
|
|
148
|
+
Pathname.new($0).realpath
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Path of the interpreter named in rbconfig: bindir joined with
# RUBY_INSTALL_NAME + EXEEXT.
#
# @return [Pathname] the realpath of that executable
def ruby_interpreter_path
  # RbConfig is the supported constant; the old top-level Config alias
  # no longer exists.
  Pathname.new(
    File.join(RbConfig::CONFIG["bindir"],
      RbConfig::CONFIG["RUBY_INSTALL_NAME"] +
      RbConfig::CONFIG["EXEEXT"])
    ).realpath
end
|
|
158
|
+
|
|
159
|
+
#
|
|
160
|
+
# by default, call this script in --map mode
|
|
161
|
+
#
|
|
162
|
+
# Shell command for the map phase.
#
# With a mapper class, re-invoke this script in --map mode with the
# pass-through flags appended; otherwise use options[:map_command], falling
# back to the configured default mapper.
#
# @return [String, nil]
def map_command
  if mapper_klass
    "#{ruby_interpreter_path} #{this_script_filename} --map " + options[:all_args]
  else
    options[:map_command] || Wukong::CONFIG[:default_mapper]
  end
end
|
|
168
|
+
|
|
169
|
+
#
|
|
170
|
+
# Shell command for reduce phase
|
|
171
|
+
# by default, call this script in --reduce mode
|
|
172
|
+
#
|
|
173
|
+
# Shell command for the reduce phase.
#
# With a reducer class, re-invoke this script in --reduce mode with the
# pass-through flags appended; otherwise use options[:reduce_command],
# falling back to the configured default reducer.
#
# @return [String, nil]
def reduce_command
  if reducer_klass
    "#{ruby_interpreter_path} #{this_script_filename} --reduce " + options[:all_args]
  else
    options[:reduce_command] || Wukong::CONFIG[:default_reducer]
  end
end
|
|
179
|
+
|
|
180
|
+
#
|
|
181
|
+
# Shell command to re-run in mapreduce mode using --map and --reduce
|
|
182
|
+
#
|
|
183
|
+
def runner_command input_path, output_path
|
|
184
|
+
# run as either local or hadoop
|
|
185
|
+
case run_mode
|
|
186
|
+
when 'local'
|
|
187
|
+
$stderr.puts " Reading STDIN / Writing STDOUT"
|
|
188
|
+
command = local_command input_path, output_path
|
|
189
|
+
when 'hadoop', 'mapred'
|
|
190
|
+
$stderr.puts " Launching hadoop as"
|
|
191
|
+
command = hadoop_command input_path, output_path
|
|
192
|
+
else
|
|
193
|
+
raise "Need to use --run=local or --run=hadoop; or to use the :default_run_mode in config.yaml just say --run "
|
|
194
|
+
end
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
# The run mode as a String.
#
# A bare --run (stored as true) is replaced, in the options hash itself, by
# the configured :default_run_mode before being returned.
#
# @return [String] '' when no --run option is set
def run_mode
  bare_run_flag = (options[:run] == true)
  options[:run] = Wukong::CONFIG[:default_run_mode] if bare_run_flag
  options[:run].to_s
end
|
|
202
|
+
|
|
203
|
+
# The input and output paths: the first two entries of options[:rest].
#
# @return [Array] [input_path, output_path]
# @raise [RuntimeError] when either path is blank, unless options[:fake]
#   is set
def input_output_paths
  input_path, output_path = options[:rest].first(2)
  unless options[:fake]
    if input_path.blank? || output_path.blank?
      raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}"
    end
  end
  [input_path, output_path]
end
|
|
209
|
+
|
|
210
|
+
# Remove the output path with hdp-rm before a run, when --overwrite or --rm
# was given and the run mode is not 'local'. Does nothing otherwise.
#
# @param output_path [String, #to_s] path handed to `hdp-rm -r`
# @return [String, nil] whatever hdp-rm printed on stdout, or nil when
#   nothing was removed
def maybe_overwrite_output_paths! output_path
  return unless (options[:overwrite] || options[:rm]) && (run_mode != 'local')
  $stderr.puts "Removing output file #{output_path}"
  # Argument-vector form: no shell is involved, so quotes, spaces or other
  # metacharacters in the path cannot break or alter the command.
  IO.popen(['hdp-rm', '-r', output_path.to_s]) { |io| io.read }
end
|
|
216
|
+
|
|
217
|
+
#
|
|
218
|
+
# Execute the runner phase
|
|
219
|
+
#
|
|
220
|
+
def exec_hadoop_streaming
|
|
221
|
+
$stderr.puts "Streaming on self"
|
|
222
|
+
input_path, output_path = input_output_paths
|
|
223
|
+
maybe_overwrite_output_paths! output_path
|
|
224
|
+
command = runner_command(input_path, output_path)
|
|
225
|
+
$stderr.puts command
|
|
226
|
+
if ! options[:fake]
|
|
227
|
+
$stdout.puts `#{command}`
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
#
|
|
232
|
+
# If --map or --reduce, dispatch to the mapper or reducer.
|
|
233
|
+
# Otherwise, with --run, launch the runner phase; with none of these flags, print the usage help.
|
|
234
|
+
#
|
|
235
|
+
# Entry point: act on the mode flag found in options.
#
# * :map    -- instantiate the mapper class with the options and stream
# * :reduce -- instantiate the reducer class with the options and stream
# * :run    -- execute the runner phase
# * none    -- print the usage help
#
# The flags are checked in that order; the first one set wins.
def run
  if options[:map]
    mapper_klass.new(self.options).stream
  elsif options[:reduce]
    reducer_klass.new(self.options).stream
  elsif options[:run]
    exec_hadoop_streaming
  else
    self.help # Norman Vincent Peale is proud of you
  end
end
|
|
247
|
+
|
|
248
|
+
#
|
|
249
|
+
# Command line usage
|
|
250
|
+
#
|
|
251
|
+
def help
|
|
252
|
+
$stderr.puts "#{self.class} script"
|
|
253
|
+
$stderr.puts %Q{
|
|
254
|
+
#{$0} --run=hadoop input_hdfs_path output_hdfs_dir # run the script with hadoop streaming
|
|
255
|
+
#{$0} --run=local input_hdfs_path output_hdfs_dir # run the script on local filesystem using unix pipes
|
|
256
|
+
#{$0} --run input_hdfs_path output_hdfs_dir # run the script with the mode given in config/wukong*.yaml
|
|
257
|
+
#{$0} --map
|
|
258
|
+
#{$0} --reduce # dispatch to the mapper or reducer
|
|
259
|
+
|
|
260
|
+
You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
|
|
261
|
+
}
|
|
262
|
+
end
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
end
|