wukong 0.1.4 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL.textile +89 -0
- data/README.textile +41 -74
- data/docpages/INSTALL.textile +94 -0
- data/{doc → docpages}/LICENSE.textile +0 -0
- data/{doc → docpages}/README-wulign.textile +6 -0
- data/docpages/UsingWukong-part1-get_ready.textile +17 -0
- data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
- data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
- data/docpages/_config.yml +39 -0
- data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
- data/{doc → docpages}/code/api_response_example.txt +0 -0
- data/{doc → docpages}/code/parser_skeleton.rb +0 -0
- data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +16 -0
- data/docpages/hadoop-tips.textile +83 -0
- data/docpages/index.textile +90 -0
- data/docpages/intro.textile +8 -0
- data/docpages/moreinfo.textile +174 -0
- data/docpages/news.html +24 -0
- data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
- data/docpages/tutorial.textile +283 -0
- data/docpages/usage.textile +195 -0
- data/docpages/wutils.textile +263 -0
- data/wukong.gemspec +80 -50
- metadata +87 -54
- data/doc/INSTALL.textile +0 -41
- data/doc/README-tutorial.textile +0 -163
- data/doc/README-wutils.textile +0 -128
- data/doc/TODO.textile +0 -61
- data/doc/UsingWukong-part1-setup.textile +0 -2
- data/doc/UsingWukong-part2-scraping.textile +0 -2
- data/doc/hadoop-nfs.textile +0 -51
- data/doc/hadoop-setup.textile +0 -29
- data/doc/index.textile +0 -124
- data/doc/links.textile +0 -42
- data/doc/usage.textile +0 -102
- data/doc/utils.textile +0 -48
- data/examples/and_pig/sample_queries.rb +0 -128
- data/lib/wukong/and_pig.rb +0 -62
- data/lib/wukong/and_pig/README.textile +0 -12
- data/lib/wukong/and_pig/as.rb +0 -37
- data/lib/wukong/and_pig/data_types.rb +0 -30
- data/lib/wukong/and_pig/functions.rb +0 -50
- data/lib/wukong/and_pig/generate.rb +0 -85
- data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
- data/lib/wukong/and_pig/junk.rb +0 -51
- data/lib/wukong/and_pig/operators.rb +0 -8
- data/lib/wukong/and_pig/operators/compound.rb +0 -29
- data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
- data/lib/wukong/and_pig/operators/execution.rb +0 -15
- data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
- data/lib/wukong/and_pig/operators/foreach.rb +0 -98
- data/lib/wukong/and_pig/operators/groupies.rb +0 -212
- data/lib/wukong/and_pig/operators/load_store.rb +0 -65
- data/lib/wukong/and_pig/operators/meta.rb +0 -42
- data/lib/wukong/and_pig/operators/relational.rb +0 -129
- data/lib/wukong/and_pig/pig_struct.rb +0 -48
- data/lib/wukong/and_pig/pig_var.rb +0 -95
- data/lib/wukong/and_pig/symbol.rb +0 -29
- data/lib/wukong/and_pig/utils.rb +0 -0
@@ -1,29 +0,0 @@
|
|
1
|
-
module Wukong
|
2
|
-
module AndPig
|
3
|
-
class PigVar
|
4
|
-
# ===========================================================================
|
5
|
-
#
|
6
|
-
# Pig expressions
|
7
|
-
#
|
8
|
-
|
9
|
-
#
|
10
|
-
# Emit one filesystem command line through the class-level +emit+.
#
# The line is the command left-justified in a 23-character column, then a
# tab and a space, then the filename. The filename is written bare: no
# quotes are placed around the path.
def dfs(cmd, filename)
  line = format("%-23s\t %s", cmd, filename)
  self.class.emit(line)
end
|
14
|
-
#
|
15
|
-
# remove the stored file
|
16
|
-
#
|
17
|
-
# Remove the stored file: issues an :rmf command for +filename+ via #dfs.
def rmf!(filename)
  dfs(:rmf, filename)
end
|
20
|
-
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
# Issue a :mkdir command for +filename+ via #dfs.
def mkdir(filename)
  dfs(:mkdir, filename)
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
@@ -1,98 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# The FOREACH relational operator
|
3
|
-
#
|
4
|
-
module Wukong
|
5
|
-
module AndPig
|
6
|
-
class PigVar
|
7
|
-
|
8
|
-
# ===========================================================================
|
9
|
-
#
|
10
|
-
# FOREACH
|
11
|
-
#
|
12
|
-
# Build a FOREACH ... GENERATE statement over this relation.
#
# Every entry of +field_specs+ is converted by #parse_gen_clause (a single
# spec may expand to several clauses, hence the flatten). The result class is
# a TypedStruct built from each clause's +name_type+, and the statement is
# handed to +new_in_chain+ under +lval+.
def generate(lval, *field_specs)
  clauses     = field_specs.map { |spec| parse_gen_clause(spec) }.flatten
  name_types  = clauses.map { |clause| clause.name_type }
  result_type = TypedStruct.new(*name_types)
  clause_list = clauses.join(",\n ")
  statement   = "FOREACH #{self.relation} GENERATE\n #{clause_list}"
  new_in_chain(lval, result_type, statement)
end
|
18
|
-
|
19
|
-
#
|
20
|
-
# for a list of GENERATE args, we need
|
21
|
-
#
|
22
|
-
# * gen_clauses, the clause to stuff into the GENERATE line
|
23
|
-
# gen_expr AS gen_field_name: gen_field_type
|
24
|
-
#
|
25
|
-
# * new_types, the resulting types for each
|
26
|
-
#
|
27
|
-
# gen_expr common cases include
|
28
|
-
#
|
29
|
-
# field
|
30
|
-
# Rel::field
|
31
|
-
# Rel.(field)
|
32
|
-
# "ComplicatedExpression"
|
33
|
-
#
|
34
|
-
#
|
35
|
-
# field_attrs
|
36
|
-
#
|
37
|
-
#
|
38
|
-
# Turn one GENERATE argument into the AS clause(s) it stands for.
#
# Accepted shapes of +field_spec+:
#
# * anything matched by +when AS+ -- returned unchanged
# * Symbol -- the field keeps its own name; its type comes from #field_type
# * Array  -- [alias_in, field_in, name, type]. +name+ defaults to +field_in+
#             and +type+ defaults to alias_in.field_type(field_in); the clause
#             also carries alias_in.relationize
# * Hash   -- { field_in => field_out, ... }; returns an Array holding one
#             clause per pair, each typed via #field_type(field_in)
#
# Raises RuntimeError for any other argument.
def parse_gen_clause(field_spec)
  case field_spec
  when AS
    field_spec
  when Symbol
    AS[field_spec, field_spec, field_type(field_spec)]
  when Array
    alias_in, field_in, name, type = field_spec
    name ||= field_in
    # Honor an explicitly supplied type; only look it up when none was given.
    type ||= alias_in.field_type(field_in)
    AS[field_in, name, type, alias_in.relationize]
  when Hash
    field_spec.map do |field_in, field_out|
      AS[field_in, field_out, field_type(field_in)]
    end
  else
    # Must reference the parameter (field_spec); the misspelled plural raised
    # NameError and hid this message.
    raise "Don't know how to specify type for #{field_spec.inspect}"
  end
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
# # when Array
|
67
|
-
# # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
|
68
|
-
# # field_expr, field_attr, field_type = field_spec
|
69
|
-
# # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
|
70
|
-
# # gen_clauses << "#{field_expr} AS #{field_as}"
|
71
|
-
# # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
|
72
|
-
|
73
|
-
# def prelimify *field_specs
|
74
|
-
# gen_clauses = []
|
75
|
-
# field_attrs = []
|
76
|
-
# field_specs.map do |field_spec|
|
77
|
-
# unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
|
78
|
-
# field_expr, field_attr = field_spec
|
79
|
-
# gen_clauses << "#{field_expr}"
|
80
|
-
# field_attrs += [field_attr].flatten
|
81
|
-
# end
|
82
|
-
# [ gen_clauses, field_attrs ]
|
83
|
-
# end
|
84
|
-
#
|
85
|
-
# # def generate *args
|
86
|
-
# # gen_clauses, field_attrs = self.class.fieldify *args
|
87
|
-
# # l_klass = TypedStruct.new(*field_attrs)
|
88
|
-
# # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
|
89
|
-
# # end
|
90
|
-
#
|
91
|
-
# def foreach *args
|
92
|
-
# generate_clause = args.pop
|
93
|
-
# prelim_exprs, prelim_attrs = prelimify *args
|
94
|
-
# prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
|
95
|
-
# gen_clauses, field_attrs = fieldify *generate_clause
|
96
|
-
# l_klass = TypedStruct.new(*field_attrs)
|
97
|
-
# new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
|
98
|
-
# end
|
@@ -1,212 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
#
|
3
|
-
# The GROUP, COGROUP and JOIN relational operators
|
4
|
-
#
|
5
|
-
module Wukong
|
6
|
-
module AndPig
|
7
|
-
class PigVar
|
8
|
-
|
9
|
-
#===========================================================================
|
10
|
-
#
|
11
|
-
# GROUP and COGROUP
|
12
|
-
#
|
13
|
-
|
14
|
-
#
|
15
|
-
# COGROUP - Groups the data in two or more relations.
|
16
|
-
#
|
17
|
-
# == Syntax
|
18
|
-
#
|
19
|
-
# alias = COGROUP alias1 BY field_alias [INNER | OUTER],
|
20
|
-
# aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
|
21
|
-
#
|
22
|
-
# == Structure
|
23
|
-
#
|
24
|
-
# { group, <structure of alias1>, <structure of alias2>, ... }
|
25
|
-
#
|
26
|
-
# == Terms
|
27
|
-
#
|
28
|
-
# * alias The name of a relation.
|
29
|
-
#
|
30
|
-
# * field_alias The name of one or more fields in a relation. If multiple
|
31
|
-
# fields are specified, separate with commas and enclose
|
32
|
-
# in parentheses. For example, X = COGROUP A BY (f1, f2);
|
33
|
-
#
|
34
|
-
# The number of fields specified in each BY clause must
|
35
|
-
# match. For example, X = COGROUP A BY (a1,a2,a3), B BY
|
36
|
-
# (b1,b2,b3);
|
37
|
-
#
|
38
|
-
# * BY Keyword.
|
39
|
-
#
|
40
|
-
# * INNER Eliminate NULLs on that grouping
|
41
|
-
# * OUTER Do not eliminate NULLs on that grouping (default)
|
42
|
-
#
|
43
|
-
# * PARALLEL n -- Increase the parallelism of a job by specifying the
|
44
|
-
# number of reduce tasks, n. The optimal number of
|
45
|
-
# parallel tasks depends on the amount of memory on each
|
46
|
-
# node and the memory required by each of the tasks. To
|
47
|
-
# determine n, use the following as a general guideline:
|
48
|
-
#
|
49
|
-
# n = (nr_nodes - 1) * 0.45 * nr_GB
|
50
|
-
#
|
51
|
-
# where nr_nodes is the number of nodes used and nr_GB is
|
52
|
-
# the amount of physical memory on each node.
|
53
|
-
#
|
54
|
-
# Note the following:
|
55
|
-
# - Parallel only affects the number of reduce tasks. Map
|
56
|
-
# parallelism is determined by the input file, one map
|
57
|
-
# for each HDFS block.
|
58
|
-
# - If you don’t specify parallel, you still get the same
|
59
|
-
# map parallelism but only one reduce task.
|
60
|
-
#
|
61
|
-
# == Usage
|
62
|
-
#
|
63
|
-
# The COGROUP operator groups the data in two or more relations based on
|
64
|
-
# the common field values.
|
65
|
-
#
|
66
|
-
# Note: The COGROUP and JOIN operators perform similar functions. COGROUP
|
67
|
-
# creates a nested set of output tuples while JOIN creates a flat set of
|
68
|
-
# output tuples with NULLs eliminated.
|
69
|
-
#
|
70
|
-
# == Examples
|
71
|
-
#
|
72
|
-
# Suppose we have two relations, A and B.
|
73
|
-
#
|
74
|
-
# A: (owner:chararray, pet:chararray)
|
75
|
-
# ---------------
|
76
|
-
# (Alice, cat)
|
77
|
-
# (Alice, goldfish)
|
78
|
-
# (Alice, turtle)
|
79
|
-
# (Bob, cat)
|
80
|
-
# (Bob, dog)
|
81
|
-
#
|
82
|
-
# B: (friend1:chararray, friend2:chararray)
|
83
|
-
# ---------------------
|
84
|
-
# (Cindy, Alice)
|
85
|
-
# (Mark, Alice)
|
86
|
-
# (Paul, Bob)
|
87
|
-
# (Paul, Jane)
|
88
|
-
#
|
89
|
-
# In this example tuples are co-grouped using field “owner” from relation
|
90
|
-
# A and field “friend2” from relation B as the key fields. The DESCRIBE
|
91
|
-
# operator shows the schema for relation X, which has three fields, "group",
|
92
|
-
# "A" and "B" (for an explanation, see GROUP).
|
93
|
-
#
|
94
|
-
# X = COGROUP A BY owner, B BY friend2;
|
95
|
-
# DESCRIBE X;
|
96
|
-
#
|
97
|
-
# X: {group: chararray,
|
98
|
-
# A: {owner: chararray,pet: chararray},
|
99
|
-
# B: {friend1: chararray,friend2: chararray}}
|
100
|
-
#
|
101
|
-
# Relation X looks like this. A tuple is created for each unique key
|
102
|
-
# field. The tuple includes the key field and two bags. The first bag is
|
103
|
-
# the tuples from the first relation with the matching key field. The
|
104
|
-
# second bag is the tuples from the second relation with the matching key
|
105
|
-
# field. If no tuples match the key field, the bag is empty.
|
106
|
-
#
|
107
|
-
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
108
|
-
# {(Cindy, Alice), (Mark, Alice)})
|
109
|
-
# (Bob, {(Bob, dog), (Bob, cat)},
|
110
|
-
# {(Paul, Bob)})
|
111
|
-
# (Jane, {},
|
112
|
-
# {(Paul, Jane)})
|
113
|
-
#
|
114
|
-
# In this example tuples are co-grouped and the INNER keyword is used to
|
115
|
-
# ensure that only bags with at least one tuple are returned.
|
116
|
-
#
|
117
|
-
# X = COGROUP A BY owner INNER, B BY friend2 INNER;
|
118
|
-
#
|
119
|
-
# Relation X looks like this.
|
120
|
-
#
|
121
|
-
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
122
|
-
# {(Cindy, Alice), (Mark, Alice)})
|
123
|
-
# (Bob, {(Bob, dog), (Bob, cat)},
|
124
|
-
# {(Paul, Bob)})
|
125
|
-
#
|
126
|
-
# In this example tuples are co-grouped and the INNER keyword is used
|
127
|
-
# asymmetrically on only one of the relations.
|
128
|
-
#
|
129
|
-
# X = COGROUP A BY owner, B BY friend2 INNER;
|
130
|
-
#
|
131
|
-
# Relation X looks like this.
|
132
|
-
#
|
133
|
-
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
134
|
-
# {(Cindy, Alice), (Mark, Alice)})
|
135
|
-
# (Bob, {(Bob, dog), (Bob, cat)},
|
136
|
-
# {(Paul, Bob)})
|
137
|
-
# (Jane, {},
|
138
|
-
# {(Paul, Jane)})
|
139
|
-
#
|
140
|
-
#
|
141
|
-
# GROUP this relation.
#
# The result class comes from #l_klass_for_group and the BY/ALL clause from
# the class-level make_by_clause; the statement is chained under +anon+.
def group(group_by)
  grouped_klass = l_klass_for_group(group_by)
  by            = self.class.make_by_clause(group_by)
  new_in_chain(anon, grouped_klass, "GROUP #{relation} #{by}")
end
|
146
|
-
|
147
|
-
# Render the grouping clause for GROUP / COGROUP.
#
# +by_spec+ may be:
#
# * Array  -- field names. Several fields are comma-separated and wrapped in
#             parentheses ("BY (f1, f2)"), as Pig requires for multi-field
#             keys; a single-element array renders as "BY f1".
# * :all   -- "ALL"
# * Symbol -- "BY <symbol>"
# * String -- returned verbatim (the caller supplies the full clause)
# * Hash   -- recurses on the value stored under :by
#
# Raises RuntimeError for anything else (including a Hash without :by).
def self.make_by_clause(by_spec)
  case by_spec
  when Array
    fields = by_spec.join(", ")
    # Only multi-field keys need the parentheses; leave the single-field
    # output exactly as it was.
    by_spec.length > 1 ? "BY (#{fields})" : "BY #{fields}"
  when :all   then 'ALL' # must precede the generic Symbol branch
  when Symbol then "BY #{by_spec}"
  when String then by_spec
  when Hash   then make_by_clause(by_spec[:by])
  else raise "Don't know how to group on #{by_spec.inspect}"
  end
end
|
157
|
-
# Look up +field+ in this relation's klass.members_types.
def types_for_fields(field)
  member_types = klass.members_types
  member_types[field]
end
|
160
|
-
# Result class for grouping this relation alone: delegates to the
# class-level l_klass_for_group with self as the only relation.
def l_klass_for_group(group_by)
  self.class.l_klass_for_group(group_by, self)
end
|
163
|
-
# Build the TypedStruct describing a GROUP/COGROUP result.
#
# The first member is :group, typed by the first relation's
# types_for_fields(group_by); it is followed by one [relation, klass] member
# for each relation in +rels+.
def self.l_klass_for_group(group_by, *rels)
  group_member = [:group, rels.first.types_for_fields(group_by)]
  rel_members  = rels.map { |rel| [rel.relation, rel.klass] }
  TypedStruct.new(group_member, *rel_members)
end
|
169
|
-
|
170
|
-
#
|
171
|
-
# COGROUP pig expression:
|
172
|
-
# UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
|
173
|
-
#
|
174
|
-
# COGROUP several relations and register the result under +lval+.
#
# Each entry of +by+ is destructured as (relation, group_by, as); only the
# first two are used here. The group key type is taken from the first
# entry's group_by.
def self.cogroup(lval, *by)
  clauses = by.map do |rel, group_by, _as|
    "#{rel.relation} #{make_by_clause(group_by)}"
  end
  by_clause = clauses.join(", ")
  relations = by.map { |entry| entry.first }
  l_klass   = l_klass_for_group(by[0][1], *relations)
  rval      = new(l_klass, lval, "COGROUP #{by_clause}")
  set(lval, rval)
end
|
182
|
-
|
183
|
-
# Instance-level convenience: forwards to the class-level cogroup, passing
# self as its first argument followed by +args+.
def cogroup(*args)
  self.class.cogroup(self, *args)
end
|
186
|
-
|
187
|
-
|
188
|
-
# ===========================================================================
|
189
|
-
#
|
190
|
-
# JOIN
|
191
|
-
#
|
192
|
-
# Build the TypedStruct for a JOIN result: one [relation, relation.klass]
# member per entry of +by+ (the relation is each entry's first element).
def self.klass_from_join(by)
  relations = by.map { |entry| entry.first }
  members   = relations.map { |rel| [rel, rel.klass] }
  TypedStruct.new(*members)
end
|
196
|
-
|
197
|
-
# Render the "<relation> BY <field>" list of a JOIN, comma-separated.
# Each relation is rendered through its +relationize+.
def self.join_by_clause(by)
  clauses = by.map do |rel, field|
    "#{rel.relationize} BY #{field}"
  end
  clauses.join(", ")
end
|
200
|
-
|
201
|
-
# JOIN relations and register the result under +lval+.
#
# +by+ pairs each relation with its join field and may also carry a
# :parallel entry, which is split off and passed to +parallelize!+ rather
# than rendered in the BY list. (Presumably a Hash of relation => field --
# confirm against callers.)
#
# The caller's +by+ is not modified: the :parallel entry is removed from a
# shallow copy only, so the same spec can safely be reused.
def self.join(lval, by)
  by       = by.dup # never mutate the argument we were handed
  parallel = by.delete(:parallel)
  cmd      = "JOIN " + join_by_clause(by)
  parallelize! cmd, :parallel => parallel
  l_klass  = klass_from_join(by)
  rval     = new(l_klass, lval, cmd)
  set lval, rval
end
|
209
|
-
|
210
|
-
end
|
211
|
-
end
|
212
|
-
end
|
@@ -1,65 +0,0 @@
|
|
1
|
-
# == Load/StoreFunctions ==
|
2
|
-
# BinaryDeserializer
|
3
|
-
# BinarySerializer
|
4
|
-
# BinStorage
|
5
|
-
# PigStorage
|
6
|
-
# PigDump
|
7
|
-
# TextLoader
|
8
|
-
|
9
|
-
module Wukong
|
10
|
-
module AndPig
|
11
|
-
class PigVar
|
12
|
-
#===========================================================================
|
13
|
-
#
|
14
|
-
# The "LOAD" pig expression:
|
15
|
-
# MyRelation = LOAD 'my_relation.tsv' AS (attr_a: int, attr_b: chararray) ;
|
16
|
-
#
|
17
|
-
# The AS type spec is generated from klass
|
18
|
-
#
|
19
|
-
# The "LOAD" pig expression; registers the loaded relation under +rel+.
#
# options[:filename] -- file to load; falls back to default_filename(rel)
# options[:has_rsrc] -- passed to klass.typify for the AS spec; when truthy,
#                       the freshly registered relation is also run through
#                       #generate over all of its fields
#
# Returns +rel+.
def self.pig_load(rel, klass, options = {})
  has_rsrc = options[:has_rsrc]
  filename = options[:filename] || default_filename(rel)
  load_cmd = "LOAD '#{filename}' AS #{klass.typify(has_rsrc)}"
  set(rel, new(klass, rel, load_cmd))
  return rel unless has_rsrc

  loaded = self[rel]
  loaded.generate(loaded, *loaded.fields)
  rel
end
|
28
|
-
|
29
|
-
#===========================================================================
|
30
|
-
#
|
31
|
-
#
|
32
|
-
# The "STORE" pig imperative:
|
33
|
-
# STORE Relation INTO 'filename'
|
34
|
-
# If no filename is given, the relation's name is used
|
35
|
-
#
|
36
|
-
# The "STORE" pig imperative: emits STORE <relation> INTO '<filename>'.
# With no filename, #default_filename is used. Returns self.
def store(filename = nil)
  target = filename || default_filename
  self.class.emit(format("STORE %-19s INTO '%s'", relation, target))
  self
end
|
41
|
-
|
42
|
-
# Store the relation, removing the existing file
|
43
|
-
# Store the relation after clearing the way: removes any existing file at
# the target, makes the target's parent directory, then calls #store.
def store!(filename = nil)
  target = filename || default_filename
  rmf!(target)
  mkdir(File.dirname(target))
  store(target)
end
|
49
|
-
|
50
|
-
# Force a store to disk, then load (so all calculations proceed from there)
|
51
|
-
# Force a store to disk, then load the relation back in from that file.
# options[:filename] defaults to #default_filename; the merged options are
# also forwarded to the class-level pig_load.
def checkpoint!(options = {})
  merged = options.reverse_merge(:filename => default_filename)
  store!(merged[:filename])
  self.class.pig_load(self.name, klass, merged)
end
|
56
|
-
|
57
|
-
# Default storage path for this relation: the class-level default_filename
# applied to this relation's name.
def default_filename
  self.class.default_filename(self.name)
end
|
60
|
-
# Default storage path for a relation called +name+: working_dir joined
# with the name's string form.
def self.default_filename(name)
  basename = name.to_s
  File.join(working_dir, basename)
end
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
# == DiagnosticOperators
|
2
|
-
# describe
|
3
|
-
# dump
|
4
|
-
# explain
|
5
|
-
# illustrate
|
6
|
-
# == UDFStatements
|
7
|
-
# define
|
8
|
-
# register
|
9
|
-
|
10
|
-
module Wukong
|
11
|
-
module AndPig
|
12
|
-
class PigVar
|
13
|
-
# DESCRIBE pig imperative
|
14
|
-
# DESCRIBE this relation; forwards to the class-level describe.
def describe
  self.class.describe(self)
end
|
17
|
-
# DESCRIBE +rel+: first emits a comment line carrying the schema predicted
# from rel.klass.typify, then the describe declaration itself. Returns +rel+.
def self.describe(rel)
  emit(" -- PREDICTED: #{rel.klass.typify} ")
  simple_declaration(:describe, rel.relationize)
  rel
end
|
22
|
-
|
23
|
-
# DUMP pig imperative
|
24
|
-
# Runs simple_operation with :dump.
def dump
  simple_operation(:dump)
end
|
25
|
-
|
26
|
-
# EXPLAIN pig imperative
|
27
|
-
# Runs simple_operation with :explain.
def explain
  simple_operation(:explain)
end
|
28
|
-
|
29
|
-
# ILLUSTRATE pig imperative
|
30
|
-
# Runs simple_operation with :illustrate.
def illustrate
  simple_operation(:illustrate)
end
|
31
|
-
|
32
|
-
|
33
|
-
# DEFINE statement: emits the :DEFINE imperative for +pig_alias+. The
# remaining arguments are passed along as one array, not splatted.
def self.define(pig_alias, *args)
  emit_imperative(:DEFINE, pig_alias, args)
end
|
36
|
-
|
37
|
-
# REGISTER statement: emits the :REGISTER imperative for +path_to_jar+.
def self.register(path_to_jar)
  emit_imperative(:REGISTER, path_to_jar)
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|