wukong 0.1.4 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL.textile +89 -0
- data/README.textile +41 -74
- data/docpages/INSTALL.textile +94 -0
- data/{doc → docpages}/LICENSE.textile +0 -0
- data/{doc → docpages}/README-wulign.textile +6 -0
- data/docpages/UsingWukong-part1-get_ready.textile +17 -0
- data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
- data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
- data/docpages/_config.yml +39 -0
- data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
- data/{doc → docpages}/code/api_response_example.txt +0 -0
- data/{doc → docpages}/code/parser_skeleton.rb +0 -0
- data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +16 -0
- data/docpages/hadoop-tips.textile +83 -0
- data/docpages/index.textile +90 -0
- data/docpages/intro.textile +8 -0
- data/docpages/moreinfo.textile +174 -0
- data/docpages/news.html +24 -0
- data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
- data/docpages/tutorial.textile +283 -0
- data/docpages/usage.textile +195 -0
- data/docpages/wutils.textile +263 -0
- data/wukong.gemspec +80 -50
- metadata +87 -54
- data/doc/INSTALL.textile +0 -41
- data/doc/README-tutorial.textile +0 -163
- data/doc/README-wutils.textile +0 -128
- data/doc/TODO.textile +0 -61
- data/doc/UsingWukong-part1-setup.textile +0 -2
- data/doc/UsingWukong-part2-scraping.textile +0 -2
- data/doc/hadoop-nfs.textile +0 -51
- data/doc/hadoop-setup.textile +0 -29
- data/doc/index.textile +0 -124
- data/doc/links.textile +0 -42
- data/doc/usage.textile +0 -102
- data/doc/utils.textile +0 -48
- data/examples/and_pig/sample_queries.rb +0 -128
- data/lib/wukong/and_pig.rb +0 -62
- data/lib/wukong/and_pig/README.textile +0 -12
- data/lib/wukong/and_pig/as.rb +0 -37
- data/lib/wukong/and_pig/data_types.rb +0 -30
- data/lib/wukong/and_pig/functions.rb +0 -50
- data/lib/wukong/and_pig/generate.rb +0 -85
- data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
- data/lib/wukong/and_pig/junk.rb +0 -51
- data/lib/wukong/and_pig/operators.rb +0 -8
- data/lib/wukong/and_pig/operators/compound.rb +0 -29
- data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
- data/lib/wukong/and_pig/operators/execution.rb +0 -15
- data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
- data/lib/wukong/and_pig/operators/foreach.rb +0 -98
- data/lib/wukong/and_pig/operators/groupies.rb +0 -212
- data/lib/wukong/and_pig/operators/load_store.rb +0 -65
- data/lib/wukong/and_pig/operators/meta.rb +0 -42
- data/lib/wukong/and_pig/operators/relational.rb +0 -129
- data/lib/wukong/and_pig/pig_struct.rb +0 -48
- data/lib/wukong/and_pig/pig_var.rb +0 -95
- data/lib/wukong/and_pig/symbol.rb +0 -29
- data/lib/wukong/and_pig/utils.rb +0 -0
@@ -1,29 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module AndPig
|
3
|
-
class PigVar
|
4
|
-
# ===========================================================================
|
5
|
-
#
|
6
|
-
# Pig expressions
|
7
|
-
#
|
8
|
-
|
9
|
-
#
|
10
|
-
# Emit one filesystem command line through the class-level +emit+.
#
# The command name is left-justified in a 23-character column, followed by
# a tab, a space and the filename. The path is written bare (no quotes).
def dfs(cmd, filename)
  template = "%-23s\t %s"
  self.class.emit(template % [cmd, filename])
end
|
14
|
-
#
|
15
|
-
# remove the stored file
|
16
|
-
#
|
17
|
-
# Remove the stored file: issues the :rmf filesystem command for +filename+.
def rmf!(filename)
  dfs(:rmf, filename)
end
|
20
|
-
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
# Issues the :mkdir filesystem command for +filename+.
def mkdir(filename)
  dfs(:mkdir, filename)
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
@@ -1,98 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# The FOREACH relational operator
|
3
|
-
#
|
4
|
-
module Wukong
|
5
|
-
module AndPig
|
6
|
-
class PigVar
|
7
|
-
|
8
|
-
# ===========================================================================
|
9
|
-
#
|
10
|
-
# FOREACH
|
11
|
-
#
|
12
|
-
# FOREACH ... GENERATE: derive a new relation from this one.
#
# Every entry of +field_specs+ is expanded with parse_gen_clause and the
# results are flattened into a single list of clauses. The clauses supply
# both the text of the GENERATE list (joined with ",\n ") and, via their
# +name_type+, the members of the TypedStruct describing the result.
# The struct and command are handed to new_in_chain under +lval+, and its
# return value is returned.
def generate(lval, *field_specs)
  clauses      = field_specs.map { |spec| parse_gen_clause(spec) }.flatten
  member_types = clauses.map(&:name_type)
  result_klass = TypedStruct.new(*member_types)
  foreach_cmd  = "FOREACH #{self.relation} GENERATE\n #{clauses.join(",\n ")}"
  new_in_chain(lval, result_klass, foreach_cmd)
end
|
18
|
-
|
19
|
-
#
|
20
|
-
# for a list of GENERATE args, we need
|
21
|
-
#
|
22
|
-
# * gen_clauses, the clause to stuff into the GENERATE line
|
23
|
-
# gen_expr AS gen_field_name: gen_field_type
|
24
|
-
#
|
25
|
-
# * new_types, the resulting types for each
|
26
|
-
#
|
27
|
-
# gen_expr common cases include
|
28
|
-
#
|
29
|
-
# field
|
30
|
-
# Rel::field
|
31
|
-
# Rel.(field)
|
32
|
-
# "ComplicatedExpression"
|
33
|
-
#
|
34
|
-
#
|
35
|
-
# field_attrs
|
36
|
-
#
|
37
|
-
#
|
38
|
-
# Turn one GENERATE field spec into an AS clause (or a list of them).
#
# Accepted forms of +field_spec+:
#
# * an AS        -- returned unchanged
# * a Symbol     -- the field keeps its name; its type comes from field_type
# * an Array     -- [alias_in, field_in, name, type]. +name+ defaults to
#                   +field_in+ and +type+ defaults to
#                   alias_in.field_type(field_in); the clause also carries
#                   alias_in.relationize
# * a Hash       -- { field_in => field_out, ... }; returns an Array with
#                   one AS per pair (the caller flattens it)
#
# Anything else raises a RuntimeError naming the offending spec.
def parse_gen_clause(field_spec)
  case field_spec
  when AS
    field_spec
  when Symbol
    AS[field_spec, field_spec, field_type(field_spec)]
  when Array
    alias_in, field_in, name, type = field_spec
    name ||= field_in
    # Honor an explicitly supplied type; only look it up when none was given
    # (previously the supplied value was always overwritten).
    type ||= alias_in.field_type(field_in)
    AS[field_in, name, type, alias_in.relationize]
  when Hash
    field_spec.map do |field_in, field_out|
      AS[field_in, field_out, field_type(field_in)]
    end
  else
    # Was `field_specs` (undefined here), which raised NameError instead of
    # this message.
    raise "Don't know how to specify type for #{field_spec.inspect}"
  end
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
# # when Array
|
67
|
-
# # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
|
68
|
-
# # field_expr, field_attr, field_type = field_spec
|
69
|
-
# # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
|
70
|
-
# # gen_clauses << "#{field_expr} AS #{field_as}"
|
71
|
-
# # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
|
72
|
-
|
73
|
-
# def prelimify *field_specs
|
74
|
-
# gen_clauses = []
|
75
|
-
# field_attrs = []
|
76
|
-
# field_specs.map do |field_spec|
|
77
|
-
# unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
|
78
|
-
# field_expr, field_attr = field_spec
|
79
|
-
# gen_clauses << "#{field_expr}"
|
80
|
-
# field_attrs += [field_attr].flatten
|
81
|
-
# end
|
82
|
-
# [ gen_clauses, field_attrs ]
|
83
|
-
# end
|
84
|
-
#
|
85
|
-
# # def generate *args
|
86
|
-
# # gen_clauses, field_attrs = self.class.fieldify *args
|
87
|
-
# # l_klass = TypedStruct.new(*field_attrs)
|
88
|
-
# # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
|
89
|
-
# # end
|
90
|
-
#
|
91
|
-
# def foreach *args
|
92
|
-
# generate_clause = args.pop
|
93
|
-
# prelim_exprs, prelim_attrs = prelimify *args
|
94
|
-
# prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
|
95
|
-
# gen_clauses, field_attrs = fieldify *generate_clause
|
96
|
-
# l_klass = TypedStruct.new(*field_attrs)
|
97
|
-
# new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
|
98
|
-
# end
|
@@ -1,212 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
#
|
3
|
-
# The GROUP, COGROUP and JOIN relational operators
|
4
|
-
#
|
5
|
-
module Wukong
|
6
|
-
module AndPig
|
7
|
-
class PigVar
|
8
|
-
|
9
|
-
#===========================================================================
|
10
|
-
#
|
11
|
-
# GROUP and COGROUP
|
12
|
-
#
|
13
|
-
|
14
|
-
#
|
15
|
-
# COGROUP - Groups the data in two or more relations.
|
16
|
-
#
|
17
|
-
# == Syntax
|
18
|
-
#
|
19
|
-
# alias = COGROUP alias1 BY field_alias [INNER | OUTER],
|
20
|
-
# aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
|
21
|
-
#
|
22
|
-
# == Structure
|
23
|
-
#
|
24
|
-
# { group, <structure of alias1>, <structure of alias2>, ... }
|
25
|
-
#
|
26
|
-
# == Terms
|
27
|
-
#
|
28
|
-
# * alias The name of a relation.
|
29
|
-
#
|
30
|
-
# * field_alias The name of one or more fields in a relation. If multiple
|
31
|
-
# fields are specified, separate with commas and enclose
|
32
|
-
# in parentheses. For example, X = COGROUP A BY (f1, f2);
|
33
|
-
#
|
34
|
-
# The number of fields specified in each BY clause must
|
35
|
-
# match. For example, X = COGROUP A BY (a1,a2,a3), B BY
|
36
|
-
# (b1,b2,b3);
|
37
|
-
#
|
38
|
-
# * BY Keyword.
|
39
|
-
#
|
40
|
-
# * INNER Eliminate NULLs on that grouping
|
41
|
-
# * OUTER Do not eliminate NULLs on that grouping (default)
|
42
|
-
#
|
43
|
-
# * PARALLEL n -- Increase the parallelism of a job by specifying the
|
44
|
-
# number of reduce tasks, n. The optimal number of
|
45
|
-
# parallel tasks depends on the amount of memory on each
|
46
|
-
# node and the memory required by each of the tasks. To
|
47
|
-
# determine n, use the following as a general guideline:
|
48
|
-
#
|
49
|
-
# n = (nr_nodes - 1) * 0.45 * nr_GB
|
50
|
-
#
|
51
|
-
# where nr_nodes is the number of nodes used and nr_GB is
|
52
|
-
# the amount of physical memory on each node.
|
53
|
-
#
|
54
|
-
# Note the following:
|
55
|
-
# - Parallel only affects the number of reduce tasks. Map
|
56
|
-
# parallelism is determined by the input file, one map
|
57
|
-
# for each HDFS block.
|
58
|
-
# - If you don’t specify parallel, you still get the same
|
59
|
-
# map parallelism but only one reduce task.
|
60
|
-
#
|
61
|
-
# == Usage
|
62
|
-
#
|
63
|
-
# The COGROUP operator groups the data in two or more relations based on
|
64
|
-
# the common field values.
|
65
|
-
#
|
66
|
-
# Note: The COGROUP and JOIN operators perform similar functions. COGROUP
|
67
|
-
# creates a nested set of output tuples while JOIN creates a flat set of
|
68
|
-
# output tuples with NULLs eliminated.
|
69
|
-
#
|
70
|
-
# == Examples
|
71
|
-
#
|
72
|
-
# Suppose we have two relations, A and B.
|
73
|
-
#
|
74
|
-
# A: (owner:chararray, pet:chararray)
|
75
|
-
# ---------------
|
76
|
-
# (Alice, cat)
|
77
|
-
# (Alice, goldfish)
|
78
|
-
# (Alice, turtle)
|
79
|
-
# (Bob, cat)
|
80
|
-
# (Bob, dog)
|
81
|
-
#
|
82
|
-
# B: (friend1:chararray, friend2:chararray)
|
83
|
-
# ---------------------
|
84
|
-
# (Cindy, Alice)
|
85
|
-
# (Mark, Alice)
|
86
|
-
# (Paul, Bob)
|
87
|
-
# (Paul, Jane)
|
88
|
-
#
|
89
|
-
# In this example tuples are co-grouped using field “owner” from relation
|
90
|
-
# A and field “friend2” from relation B as the key fields. The DESCRIBE
|
91
|
-
# operator shows the schema for relation X, which has three fields, "group",
|
92
|
-
# "A" and "B" (for an explanation, see GROUP).
|
93
|
-
#
|
94
|
-
# X = COGROUP A BY owner, B BY friend2;
|
95
|
-
# DESCRIBE X;
|
96
|
-
#
|
97
|
-
# X: {group: chararray,
|
98
|
-
# A: {owner: chararray,pet: chararray},
|
99
|
-
# B: {friend1: chararray,friend2: chararray}}
|
100
|
-
#
|
101
|
-
# Relation X looks like this. A tuple is created for each unique key
|
102
|
-
# field. The tuple includes the key field and two bags. The first bag is
|
103
|
-
# the tuples from the first relation with the matching key field. The
|
104
|
-
# second bag is the tuples from the second relation with the matching key
|
105
|
-
# field. If no tuples match the key field, the bag is empty.
|
106
|
-
#
|
107
|
-
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
108
|
-
# {(Cindy, Alice), (Mark, Alice)})
|
109
|
-
# (Bob, {(Bob, dog), (Bob, cat)},
|
110
|
-
# {(Paul, Bob)})
|
111
|
-
# (Jane, {},
|
112
|
-
# {(Paul, Jane)})
|
113
|
-
#
|
114
|
-
# In this example tuples are co-grouped and the INNER keyword is used to
|
115
|
-
# ensure that only bags with at least one tuple are returned.
|
116
|
-
#
|
117
|
-
# X = COGROUP A BY owner INNER, B BY friend2 INNER;
|
118
|
-
#
|
119
|
-
# Relation X looks like this.
|
120
|
-
#
|
121
|
-
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
122
|
-
# {(Cindy, Alice), (Mark, Alice)})
|
123
|
-
# (Bob, {(Bob, dog), (Bob, cat)},
|
124
|
-
# {(Paul, Bob)})
|
125
|
-
#
|
126
|
-
# In this example tuples are co-grouped and the INNER keyword is used
|
127
|
-
# asymmetrically on only one of the relations.
|
128
|
-
#
|
129
|
-
# X = COGROUP A BY owner, B BY friend2 INNER;
|
130
|
-
#
|
131
|
-
# Relation X looks like this.
|
132
|
-
#
|
133
|
-
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
134
|
-
# {(Cindy, Alice), (Mark, Alice)})
|
135
|
-
# (Bob, {(Bob, dog), (Bob, cat)},
|
136
|
-
# {(Paul, Bob)})
|
137
|
-
# (Jane, {},
|
138
|
-
# {(Paul, Jane)})
|
139
|
-
#
|
140
|
-
#
|
141
|
-
# GROUP this relation by +group_by+.
#
# Builds the result struct with l_klass_for_group and the BY/ALL clause with
# the class-level make_by_clause, then chains a new anonymous relation
# (+anon+) whose command is "GROUP <relation> <by clause>".
def group(group_by)
  grouped_klass = l_klass_for_group(group_by)
  by_part       = self.class.make_by_clause(group_by)
  new_in_chain(anon, grouped_klass, "GROUP #{relation} #{by_part}")
end
|
146
|
-
|
147
|
-
# Build the grouping clause of a GROUP / COGROUP statement.
#
# +by_spec+ may be:
#
# * an Array   -- field names. A single field gives "BY f1"; several fields
#                 are comma-separated and enclosed in parentheses,
#                 "BY (f1, f2)", as Pig requires for multi-field keys
# * :all       -- "ALL"
# * a Symbol   -- "BY <symbol>"
# * a String   -- used verbatim (it must already contain its own BY/ALL)
# * a Hash     -- its :by entry is converted by the rules above
#
# Anything else raises a RuntimeError.
def self.make_by_clause(by_spec)
  case by_spec
  when Array
    fields = by_spec.join(", ")
    # Without the parentheses Pig would read "A BY f1, f2" as a second
    # relation named f2 rather than a two-field key.
    by_spec.length > 1 ? "BY (#{fields})" : "BY #{fields}"
  when :all   then 'ALL'
  when Symbol then "BY #{by_spec}"
  when String then by_spec
  when Hash   then make_by_clause(by_spec[:by])
  else raise "Don't know how to group on #{by_spec.inspect}"
  end
end
|
157
|
-
# Look up +field+ in the +members_types+ table of this relation's klass.
def types_for_fields(field)
  member_types = klass.members_types
  member_types[field]
end
|
160
|
-
# Result struct for grouping this relation alone by +group_by+; delegates to
# the class-level method of the same name with self as the only relation.
def l_klass_for_group(group_by)
  self.class.l_klass_for_group(group_by, self)
end
|
163
|
-
# Build the TypedStruct describing a GROUP/COGROUP result.
#
# The first member is :group, typed by looking +group_by+ up on the first
# relation (types_for_fields). It is followed by one member per relation in
# +rels+, named by the relation and typed by that relation's klass.
def self.l_klass_for_group(group_by, *rels)
  group_member = [:group, rels.first.types_for_fields(group_by)]
  rel_members  = rels.map { |rel| [rel.relation, rel.klass] }
  TypedStruct.new(group_member, *rel_members)
end
|
169
|
-
|
170
|
-
#
|
171
|
-
# COGROUP pig expression:
|
172
|
-
# UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
|
173
|
-
#
|
174
|
-
# COGROUP several relations and register the result under +lval+.
#
# Each entry of +by+ is a list whose first element is a relation and whose
# second is its grouping spec (a third element is accepted but unused).
# The result struct is keyed on the grouping spec of the first entry.
def self.cogroup(lval, *by)
  by_parts = by.map do |rel, group_by, _as|
    "#{rel.relation} #{make_by_clause(group_by)}"
  end
  by_clause = by_parts.join(", ")
  first_key = by[0][1]
  members   = by.map(&:first)
  l_klass   = l_klass_for_group(first_key, *members)
  rval      = new(l_klass, lval, "COGROUP #{by_clause}")
  set(lval, rval)
end
|
182
|
-
|
183
|
-
# Instance-side entry point: forwards to the class-level cogroup, passing
# self as its first argument followed by +args+.
def cogroup(*args)
  self.class.cogroup(self, *args)
end
|
186
|
-
|
187
|
-
|
188
|
-
# ===========================================================================
|
189
|
-
#
|
190
|
-
# JOIN
|
191
|
-
#
|
192
|
-
# Build the TypedStruct describing a JOIN result: one member per joined
# relation (the first element of each +by+ entry), paired with that
# relation's klass.
def self.klass_from_join(by)
  rels    = by.map(&:first)
  members = rels.zip(rels.map(&:klass))
  TypedStruct.new(*members)
end
|
196
|
-
|
197
|
-
# Render the "<relation> BY <field>" terms of a JOIN, comma-separated.
# Each +by+ entry yields a relation (rendered with relationize) and a field.
def self.join_by_clause(by)
  terms = by.map do |rel, field|
    "#{rel.relationize} BY #{field}"
  end
  terms.join(", ")
end
|
200
|
-
|
201
|
-
# JOIN relations and register the result under +lval+.
#
# +by+ maps each relation to its join field and may also carry a :parallel
# entry, which is split off and passed to parallelize! together with the
# command (presumably to add a PARALLEL clause -- confirm against
# parallelize!).
#
# The caller's +by+ is left untouched: the :parallel entry is removed from
# a private copy rather than from the argument itself.
def self.join(lval, by)
  by       = by.dup                    # don't strip :parallel from the caller's object
  parallel = by.delete(:parallel)
  cmd      = "JOIN " + join_by_clause(by)
  parallelize! cmd, :parallel => parallel
  l_klass  = klass_from_join(by)
  rval     = new(l_klass, lval, cmd)
  set(lval, rval)
end
|
209
|
-
|
210
|
-
end
|
211
|
-
end
|
212
|
-
end
|
@@ -1,65 +0,0 @@
|
|
1
|
-
# == Load/StoreFunctions ==
|
2
|
-
# BinaryDeserializer
|
3
|
-
# BinarySerializer
|
4
|
-
# BinStorage
|
5
|
-
# PigStorage
|
6
|
-
# PigDump
|
7
|
-
# TextLoader
|
8
|
-
|
9
|
-
module Wukong
|
10
|
-
module AndPig
|
11
|
-
class PigVar
|
12
|
-
#===========================================================================
|
13
|
-
#
|
14
|
-
# The "LOAD" pig expression:
|
15
|
-
# MyRelation = LOAD 'my_relation.tsv' AS (attr_a: int, attr_b: chararray) ;
|
16
|
-
#
|
17
|
-
# The AS type spec is generated from klass
|
18
|
-
#
|
19
|
-
# LOAD a relation from a file and register it under +rel+.
#
# options[:filename] names the file; when absent, default_filename(rel) is
# used. The AS type spec comes from klass.typify, which is passed
# options[:has_rsrc]. When :has_rsrc is set, the freshly registered
# relation additionally runs generate onto itself with all of its fields.
# Returns +rel+.
def self.pig_load(rel, klass, options = {})
  path     = options[:filename] || default_filename(rel)
  load_cmd = "LOAD '#{path}' AS #{klass.typify(options[:has_rsrc])}"
  self.set(rel, self.new(klass, rel, load_cmd))
  return rel unless options[:has_rsrc]

  loaded = self[rel]
  loaded.generate(loaded, *loaded.fields)
  rel
end
|
28
|
-
|
29
|
-
#===========================================================================
|
30
|
-
#
|
31
|
-
#
|
32
|
-
# The "STORE" pig imperative:
|
33
|
-
# STORE Relation INTO 'filename'
|
34
|
-
# If no filename is given, the relation's name is used
|
35
|
-
#
|
36
|
-
# STORE this relation: emits "STORE <relation> INTO '<filename>'" with the
# relation left-justified in a 19-character column. Without a filename the
# relation's default_filename is used. Returns self.
def store(filename = nil)
  target = filename || default_filename
  self.class.emit("STORE %-19s INTO '%s'" % [relation, target])
  self
end
|
41
|
-
|
42
|
-
# Store the relation, removing the existing file
|
43
|
-
# Store the relation, replacing any existing file: removes the target,
# makes its parent directory, then stores. Without a filename the
# relation's default_filename is used. Returns what store returns.
def store!(filename = nil)
  target = filename || default_filename
  rmf!(target)
  mkdir(File.dirname(target))
  store(target)
end
|
49
|
-
|
50
|
-
# Force a store to disk, then load (so all calculations proceed from there)
|
51
|
-
# Force a store to disk, then load the relation back so that later
# calculations proceed from the stored copy. options[:filename] defaults to
# default_filename (filled in with reverse_merge); the merged options are
# also passed on to pig_load.
def checkpoint!(options = {})
  opts = options.reverse_merge(:filename => default_filename)
  store!(opts[:filename])
  self.class.pig_load(self.name, klass, opts)
end
|
56
|
-
|
57
|
-
# Default file for this relation: the class-level default_filename applied
# to this relation's name.
def default_filename
  self.class.default_filename(self.name)
end
|
60
|
-
# Default file for a relation called +name+: the name (as a string) joined
# onto working_dir.
def self.default_filename(name)
  basename = name.to_s
  File.join(working_dir, basename)
end
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
# == DiagnosticOperators
|
2
|
-
# describe
|
3
|
-
# dump
|
4
|
-
# explain
|
5
|
-
# illustrate
|
6
|
-
# == UDFStatements
|
7
|
-
# define
|
8
|
-
# register
|
9
|
-
|
10
|
-
module Wukong
|
11
|
-
module AndPig
|
12
|
-
class PigVar
|
13
|
-
# DESCRIBE pig imperative
|
14
|
-
# DESCRIBE this relation; delegates to the class-level describe.
def describe
  self.class.describe(self)
end
|
17
|
-
# DESCRIBE pig imperative for +rel+.
#
# First emits a Pig comment carrying the schema predicted from rel.klass,
# then issues the :describe declaration on the relation. Returns +rel+.
def self.describe(rel)
  predicted = rel.klass.typify
  emit(" -- PREDICTED: #{predicted} ")
  simple_declaration(:describe, rel.relationize)
  rel
end
|
22
|
-
|
23
|
-
# DUMP pig imperative
|
24
|
-
# Runs simple_operation with :dump.
def dump
  simple_operation(:dump)
end
|
25
|
-
|
26
|
-
# EXPLAIN pig imperative
|
27
|
-
# Runs simple_operation with :explain.
def explain
  simple_operation(:explain)
end
|
28
|
-
|
29
|
-
# ILLUSTRATE pig imperative
|
30
|
-
# Runs simple_operation with :illustrate.
def illustrate
  simple_operation(:illustrate)
end
|
31
|
-
|
32
|
-
|
33
|
-
# DEFINE statement: passes :DEFINE, +pig_alias+ and the remaining arguments
# (as a single array) to emit_imperative.
def self.define(pig_alias, *args)
  emit_imperative(:DEFINE, pig_alias, args)
end
|
36
|
-
|
37
|
-
# REGISTER statement: passes :REGISTER and +path_to_jar+ to emit_imperative.
def self.register(path_to_jar)
  emit_imperative(:REGISTER, path_to_jar)
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|