wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# == Load/StoreFunctions ==
|
|
2
|
+
# BinaryDeserializer
|
|
3
|
+
# BinarySerializer
|
|
4
|
+
# BinStorage
|
|
5
|
+
# PigStorage
|
|
6
|
+
# PigDump
|
|
7
|
+
# TextLoader
|
|
8
|
+
|
|
9
|
+
module Wukong
  module AndPig
    class PigVar
      #===========================================================================
      #
      # The "LOAD" pig expression:
      #   MyRelation = LOAD 'my_relation.tsv' AS (attr_a: int, attr_b: chararray) ;
      #
      # The AS type spec comes from klass.typify.
      #
      # rel     -- name under which the loaded relation is registered (via set)
      # klass   -- class describing the record; must respond to typify
      # options -- :filename  file to load (defaults to default_filename(rel))
      #            :has_rsrc  passed through to klass.typify; when truthy the
      #                       freshly registered relation also receives
      #                       generate(itself, *its fields)
      #
      # Returns rel.
      #
      def self.pig_load(rel, klass, options = {})
        source   = options[:filename] || default_filename(rel)
        load_cmd = "LOAD '#{source}' AS #{klass.typify(options[:has_rsrc])}"
        set(rel, new(klass, rel, load_cmd))
        if options[:has_rsrc]
          loaded = self[rel]
          loaded.generate(loaded, *loaded.fields)
        end
        rel
      end

      #===========================================================================
      #
      # The "STORE" pig imperative:
      #   STORE Relation INTO 'filename'
      # When no filename is given, default_filename is used.
      #
      # Returns self.
      #
      def store(filename = nil)
        target = filename || default_filename
        self.class.emit(format("STORE %-19s INTO '%s'", relation, target))
        self
      end

      # Store the relation after calling rmf! on the target and mkdir on its
      # parent directory.
      def store!(filename = nil)
        target = filename || default_filename
        rmf!(target)
        mkdir(File.dirname(target))
        store(target)
      end

      # Force a store to disk, then load (so all calculations proceed from there).
      # options[:filename] defaults to default_filename; the merged options are
      # handed on to pig_load.
      def checkpoint!(options = {})
        merged = options.reverse_merge(:filename => default_filename)
        store!(merged[:filename])
        self.class.pig_load(name, klass, merged)
      end

      # Default file for the relation called +name+: working_dir/name.
      def self.default_filename(name)
        File.join(working_dir, name.to_s)
      end

      # Default file for this relation (see PigVar.default_filename).
      def default_filename
        self.class.default_filename(name)
      end
    end
  end
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# == DiagnosticOperators
|
|
2
|
+
# describe
|
|
3
|
+
# dump
|
|
4
|
+
# explain
|
|
5
|
+
# illustrate
|
|
6
|
+
# == UDFStatements
|
|
7
|
+
# define
|
|
8
|
+
# register
|
|
9
|
+
|
|
10
|
+
module Wukong
  module AndPig
    class PigVar
      # DESCRIBE pig imperative (class form).
      #
      # Emits a comment line carrying the type string predicted by
      # rel.klass.typify, then a DESCRIBE declaration for rel.relationize.
      # Returns rel.
      def self.describe(rel)
        emit(" -- PREDICTED: #{rel.klass.typify} ")
        simple_declaration(:describe, rel.relationize)
        rel
      end

      # DESCRIBE pig imperative for this relation.
      def describe
        self.class.describe(self)
      end

      # DUMP, EXPLAIN and ILLUSTRATE pig imperatives: each is a zero-argument
      # instance method that hands its own name to simple_operation.
      [:dump, :explain, :illustrate].each do |diagnostic|
        define_method(diagnostic) { simple_operation(diagnostic) }
      end

      # DEFINE statement. Extra arguments are forwarded to emit_imperative as a
      # single Array (not splatted).
      def self.define(pig_alias, *args)
        emit_imperative(:DEFINE, pig_alias, args)
      end

      # REGISTER statement for the jar at path_to_jar.
      def self.register(path_to_jar)
        emit_imperative(:REGISTER, path_to_jar)
      end
    end
  end
end
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# == RelationalOperators
|
|
3
|
+
#
|
|
4
|
+
# GROUP, COGROUP, JOIN see groupies.rb
|
|
5
|
+
# CROSS see
|
|
6
|
+
|
|
7
|
+
# distinct
|
|
8
|
+
# filter
|
|
9
|
+
# limit
|
|
10
|
+
# order
|
|
11
|
+
# split
|
|
12
|
+
# union
|
|
13
|
+
|
|
14
|
+
#
|
|
15
|
+
# stream
|
|
16
|
+
# load
|
|
17
|
+
# store
|
|
18
|
+
#
|
|
19
|
+
module Wukong
  module AndPig
    class PigVar

      # ===========================================================================
      #
      # Options
      #
      # Appends " PARALLEL n" to +str+ IN PLACE when options[:parallel] is set.
      # Returns the mutated string, or nil when no :parallel option was given.
      #
      def self.parallelize!(str, options)
        str << " PARALLEL #{options[:parallel]}" if options[:parallel]
      end

      # ===========================================================================
      #
      # DISTINCT
      #
      # lval    -- name for the resulting relation
      # options -- :parallel => n adds a PARALLEL clause
      #
      def distinct(lval, options = {})
        self.class.distinct(lval, self, options)
      end

      def self.distinct(lval, rel, options = {})
        # Work on a private copy: parallelize! appends in place and must not
        # alter (or choke on a frozen) string owned by the relation.
        cmd_str = rel.relationize.to_s.dup
        parallelize!(cmd_str, options)
        simple_operation(lval, rel, :distinct, cmd_str)
      end

      # ===========================================================================
      #
      # FILTER
      #
      # The instance form has no lval parameter, so the result is registered
      # under a generated name (see #anon).
      #
      def filter(by_str)
        new_in_chain(anon, klass, "FILTER #{relation} BY #{by_str}")
      end

      def self.filter(lval, rel, by_str)
        simple_operation(lval, rel, "FILTER", "#{rel.relation} BY #{by_str}")
      end

      # ===========================================================================
      #
      # LIMIT
      #
      # Result is registered under a generated name (see #anon).
      #
      def limit(n)
        new_in_chain(anon, klass, "LIMIT #{relation} #{n}")
      end

      # ===========================================================================
      #
      # ORDER
      #
      #   alias = ORDER alias BY { * [ASC|DESC] |
      #             field_alias [ASC|DESC] [, field_alias [ASC|DESC] ...]
      #           } [PARALLEL n];
      #
      # The PARALLEL clause is attached before the relation is registered so
      # that it is part of the command handed to new_in_chain.
      #
      def order(cmd_str, options = {})
        cmd = "ORDER #{relation} BY #{cmd_str}"
        self.class.parallelize!(cmd, options)
        new_in_chain(anon, klass, cmd)
      end

      # ===========================================================================
      #
      # SPLIT
      #
      #   SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression ...];
      #
      # relation_tests maps each output relation name to its test expression.
      # SPLIT has no left-hand alias, so the statement is emitted on its own
      # (like STORE) rather than assigned to a new relation. Returns self.
      #
      # NOTE(review): the output relations are not registered as PigVars here;
      # confirm whether callers expect that.
      #
      def split(relation_tests = {})
        split_str = relation_tests.map do |out_rel, test|
          "#{out_rel} IF #{test}"
        end.join(", ")
        self.class.emit("SPLIT #{relation} INTO #{split_str}")
        self
      end

      # ===========================================================================
      #
      # CROSS
      #
      # Crosses this relation with one or more others. A trailing Hash is taken
      # as options (:parallel => n). The result takes the klass of the first
      # relation given and is registered under a generated name (see #anon).
      #
      # Raises CrossArgumentError when no other relation is given.
      #
      def cross(*relations)
        options = relations.extract_options!
        raise CrossArgumentError unless relations.length >= 1
        relations_str = [self, *relations].map(&:relation).join(", ")
        cmd = "CROSS #{relations_str}"
        self.class.parallelize!(cmd, options)
        new_in_chain(anon, relations.first.klass, cmd)
      end

      # ===========================================================================
      #
      # UNION
      #
      # UNION as method: unions this relation with +relations+ into lval.
      #
      def union(lval, *relations)
        # Splat so the class method sees each relation as its own argument;
        # passing one Array would always trip the ">= 2" guard.
        self.class.union(lval, self, *relations)
      end

      # Raises UnionArgumentError when fewer than two relations are given.
      def self.union(lval, *relations)
        raise UnionArgumentError unless relations.length >= 2
        relations_str = relations.map(&:relation).join(", ")
        simple_operation(lval, relations.first, :union, relations_str)
      end

    end
    CrossArgumentError = ArgumentError.new("CROSS requires at least two relations. Heh heh: relations.")
    UnionArgumentError = ArgumentError.new("UNION requires at least two relations. Heh heh: relations.")
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
module Wukong
  module PigStructMethods
    # Including this module extends the including class with ClassMethods.
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      #
      # Pig type string --
      #   "(attr: type, attr: type, ...)", pairing each entry of +members+ with
      #   the +typify+ of the matching entry of +mtypes+.
      #
      # When has_rsrc is truthy, "rsrc: chararray" is listed first.
      #
      def typify(has_rsrc = nil)
        field_specs = []
        field_specs << "rsrc: chararray" if has_rsrc
        members.zip(mtypes).each do |attr, mtype|
          field_specs << ("%s: %s" % [attr, mtype.typify])
        end
        "(#{field_specs.join(', ')})"
      end

      #
      # Shortcut for PigVar.pig_load with this class as the klass argument;
      # any further arguments are passed along unchanged.
      #
      def pig_load(rel, *args)
        Wukong::AndPig::PigVar.pig_load(rel, self, *args)
      end

      #
      # Returns type for a fieldspec
      #
      # Only Symbol fieldspecs are handled (looked up in members_types);
      # anything else yields nil.
      #
      def field_type(field)
        return nil unless Symbol === field
        members_types[field]
        # when Array
        #   if field.length > 1 then members_types[field.first].field_type(field[1..-1])
        #   else field_type field.first
        #   end
      end

    end
  end
end
|
|
42
|
+
|
|
43
|
+
# Give Struct (and so every Struct-generated class) the pig struct methods.
class Struct
  include Wukong::PigStructMethods

  class << self
    # The mtypes of a struct class are simply its members.
    def mtypes
      members
    end
  end
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
module Wukong
  module AndPig

    #
    # Make a PigVar understand the struct it describes
    #
    # A PigVar carries the klass describing its records, the name it is
    # registered under, and the pig command (cmd) that produces it.
    #
    class PigVar
      attr_accessor :klass, :name, :cmd
      cattr_accessor :working_dir
      self.working_dir = '.'

      def initialize(klass, name, cmd)
        self.klass, self.name, self.cmd = klass, name, cmd
      end

      # Sugar for PigVar.set
      def self.[]=(name, *args)
        set(name, *args)
      end

      # Look up the PigVar registered under +name+ in PIG_SYMBOLS
      def self.[](name)
        PIG_SYMBOLS[name]
      end

      # extract a field from an alias
      #
      # The extracted expression is named "<name>_<field>".
      def _(field)
        as_name     = [name, field].join("_").to_sym
        source_expr = "#{relationize}.(#{field})"
        bag         = Bag.new([field, field_type(field)])
        AS[source_expr, as_name, bag, nil, :skip_type]
      end

      # Register rval under +name+ in PIG_SYMBOLS, rename it to match, and
      # emit its setter. Returns whatever emit_setter returns.
      def self.set(name, rval)
        PIG_SYMBOLS[name] = rval
        rval.name = name
        emit_setter(rval.relation, rval)
      end

      # The relation name for this var: name.relationize.
      # Also available as #relationize.
      def relation
        name.relationize
      end
      alias_method :relationize, :relation

      #
      # Create a name for a new anonymous relation
      #
      # Bumps Wukong::AndPig.anon_var_idx and returns :anon_<slug>_<idx>_
      #
      def self.anon(slug)
        idx = (Wukong::AndPig.anon_var_idx += 1)
        :"anon_#{slug}_#{idx}_"
      end

      # Create a name building off this one: any existing "anon_" prefix and
      # "_<digits>_" suffix are stripped from this var's name first.
      def anon
        base = name.to_s
        base = base.gsub(/^anon_/,'')
        base = base.gsub(/_\d+_$/,'')
        self.class.anon(base)
      end

      #
      # Build a new PigVar named lval with the given klass and command, and
      # register it via PigVar.set (whose result is returned).
      #
      def new_in_chain(lval, l_klass, l_cmd)
        self.class.set(lval, self.class.new(l_klass, lval, l_cmd))
      end

      # Delegate to klass
      def field_type(*args)
        klass.field_type(*args)
      end

      # Fields in this relation
      def fields
        klass.members.map { |member| member.to_sym }
      end

      #
      # Side-effect free operation
      #
      # Emits "<OP> <relation>" and returns self.
      #
      def simple_operation(op)
        self.class.emit("#{op.to_s.upcase} #{relation}")
        self
      end

      # Build "<OP> <r_str>" as the command of a new PigVar that shares rel's
      # klass, and register it under lval via set.
      def self.simple_operation(lval, rel, op, r_str)
        set(lval, new(rel.klass, lval, format("%-8s %s", op.to_s.upcase, r_str)))
      end

      # Emit "<OP> <r_str>" as a bare statement.
      def self.simple_declaration(op, r_str)
        emit(format("%-8s %s", op.to_s.upcase, r_str))
      end

    end
  end
end
|
|
94
|
+
|
|
95
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module Wukong
  module AndPig
    # Registry of pig relations, keyed by relation name.
    PIG_SYMBOLS = Hash.new

    # Running counter, starting at zero.
    mattr_accessor :anon_var_idx
    self.anon_var_idx = 0
  end
end
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
Symbol.class_eval do
  # :lhs << rhs -- hand rhs to PigVar.new_relation under this symbol's name.
  #
  # rhs may be a PigVar, or a Symbol already registered in PIG_SYMBOLS.
  # Anything else (including an unregistered Symbol) raises a RuntimeError.
  #
  # NOTE(review): PigVar.new_relation is not defined in the PigVar code visible
  # alongside this file -- confirm it exists (it may be a former name of
  # PigVar.set).
  def <<(relation)
    pig_var =
      case relation
      when Wukong::AndPig::PigVar then relation
      when Symbol                 then Wukong::AndPig::PIG_SYMBOLS[relation]
      end
    raise "Don't know how to pigify RHS #{relation.inspect}" unless pig_var
    Wukong::AndPig::PigVar.new_relation(self, pig_var)
  end

  # Forward unknown methods to the PigVar registered under this symbol, if
  # there is one and it responds to the method. The caller's block is passed
  # along too. Otherwise fall through to the normal NoMethodError.
  def method_missing(method, *args, &block)
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    if pig_var && pig_var.respond_to?(method)
      pig_var.send(method, *args, &block)
    else
      super
    end
  end

  # Keep respond_to? consistent with method_missing above.
  def respond_to_missing?(method, include_private = false)
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    (pig_var && pig_var.respond_to?(method)) || super
  end
end
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Easily serialize bad records in-band, for later analysis or to discard if
|
|
3
|
+
# negligible.
|
|
4
|
+
#
|
|
5
|
+
# You can instantiate this as
|
|
6
|
+
# success = do_stuff_to record
|
|
7
|
+
# if ! success
|
|
8
|
+
# return BadRecord.new("do_stuff_to-failed", record)
|
|
9
|
+
# end
|
|
10
|
+
#
|
|
11
|
+
# A record that could not be processed, kept together with a description of
# what went wrong.
#
#   errors -- description of the failure (defaults to '')
#   record -- Array holding every remaining constructor argument
class BadRecord < Struct.new(:errors, :record)
  def initialize(errors = '', *record_fields)
    super(errors, record_fields)
  end
end
|
data/lib/wukong/boot.rb
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module Wukong

  # ---------------------------------------------------------------------------
  #
  # Default options for Wukong
  #   http://github.com/infochimps/wukong
  #
  # If you set an environment variable WUKONG_CONFIG, *or* if the file
  # $HOME/.wukong.rb exists, that file will be +require+'d as well.
  #
  # Important values to set:
  #
  # * Wukong::CONFIG[:hadoop_home] --
  #     Path to root of hadoop install. If your hadoop runner is
  #       /usr/local/share/hadoop/bin/hadoop
  #     then your hadoop_home is
  #       /usr/local/share/hadoop.
  #     You can also set the HADOOP_HOME environment variable: it is used when
  #     :hadoop_home is unset, and '/usr/lib/hadoop' is the last resort.
  #
  # * Wukong::CONFIG[:default_run_mode] -- Whether to run using hadoop (and
  #   thus, requiring a working hadoop install), or to run in local mode
  #   (script --map | sort | script --reduce)
  #
  CONFIG = {
    # Run as local or as hadoop?
    :default_run_mode => 'hadoop',

    # The command to run when a nil mapper or reducer is given.
    :default_mapper   => '/bin/cat',
    :default_reducer  => '/bin/cat',

    # Anything in HADOOP_OPTIONS_MAP (see lib/wukong/script/hadoop_command.rb)
    :runner_defaults  => {
    },
  }

  #
  # Load the site-specific config file (if one exists) and make sure
  # CONFIG[:hadoop_home] has a value.
  #
  # The site file is $WUKONG_CONFIG when that variable is set, otherwise
  # $HOME/.wukong.rb. Returns the resulting CONFIG[:hadoop_home].
  #
  def self.config_options
    # override with site-specific options
    site_config_filename = ENV['WUKONG_CONFIG'] || File.join(ENV['HOME'].to_s, '.wukong.rb')
    if File.exist?(site_config_filename)
      # Expand to an absolute path: a relative filename passes the exist?
      # check against the current directory, but +require+ would search the
      # load path for it instead.
      require File.expand_path(site_config_filename).sub(/\.rb\z/, '')
    end

    # try to guess a hadoop_home if none given
    Wukong::CONFIG[:hadoop_home] ||= ENV['HADOOP_HOME'] || '/usr/lib/hadoop'
  end
  self.config_options
end
|
|
47
|
+
|