wukong 0.1.4 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/INSTALL.textile +89 -0
  2. data/README.textile +41 -74
  3. data/docpages/INSTALL.textile +94 -0
  4. data/{doc → docpages}/LICENSE.textile +0 -0
  5. data/{doc → docpages}/README-wulign.textile +6 -0
  6. data/docpages/UsingWukong-part1-get_ready.textile +17 -0
  7. data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
  8. data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
  9. data/docpages/_config.yml +39 -0
  10. data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
  11. data/{doc → docpages}/code/api_response_example.txt +0 -0
  12. data/{doc → docpages}/code/parser_skeleton.rb +0 -0
  13. data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
  14. data/docpages/favicon.ico +0 -0
  15. data/docpages/gem.css +16 -0
  16. data/docpages/hadoop-tips.textile +83 -0
  17. data/docpages/index.textile +90 -0
  18. data/docpages/intro.textile +8 -0
  19. data/docpages/moreinfo.textile +174 -0
  20. data/docpages/news.html +24 -0
  21. data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
  22. data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
  23. data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
  24. data/docpages/tutorial.textile +283 -0
  25. data/docpages/usage.textile +195 -0
  26. data/docpages/wutils.textile +263 -0
  27. data/wukong.gemspec +80 -50
  28. metadata +87 -54
  29. data/doc/INSTALL.textile +0 -41
  30. data/doc/README-tutorial.textile +0 -163
  31. data/doc/README-wutils.textile +0 -128
  32. data/doc/TODO.textile +0 -61
  33. data/doc/UsingWukong-part1-setup.textile +0 -2
  34. data/doc/UsingWukong-part2-scraping.textile +0 -2
  35. data/doc/hadoop-nfs.textile +0 -51
  36. data/doc/hadoop-setup.textile +0 -29
  37. data/doc/index.textile +0 -124
  38. data/doc/links.textile +0 -42
  39. data/doc/usage.textile +0 -102
  40. data/doc/utils.textile +0 -48
  41. data/examples/and_pig/sample_queries.rb +0 -128
  42. data/lib/wukong/and_pig.rb +0 -62
  43. data/lib/wukong/and_pig/README.textile +0 -12
  44. data/lib/wukong/and_pig/as.rb +0 -37
  45. data/lib/wukong/and_pig/data_types.rb +0 -30
  46. data/lib/wukong/and_pig/functions.rb +0 -50
  47. data/lib/wukong/and_pig/generate.rb +0 -85
  48. data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
  49. data/lib/wukong/and_pig/junk.rb +0 -51
  50. data/lib/wukong/and_pig/operators.rb +0 -8
  51. data/lib/wukong/and_pig/operators/compound.rb +0 -29
  52. data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
  53. data/lib/wukong/and_pig/operators/execution.rb +0 -15
  54. data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
  55. data/lib/wukong/and_pig/operators/foreach.rb +0 -98
  56. data/lib/wukong/and_pig/operators/groupies.rb +0 -212
  57. data/lib/wukong/and_pig/operators/load_store.rb +0 -65
  58. data/lib/wukong/and_pig/operators/meta.rb +0 -42
  59. data/lib/wukong/and_pig/operators/relational.rb +0 -129
  60. data/lib/wukong/and_pig/pig_struct.rb +0 -48
  61. data/lib/wukong/and_pig/pig_var.rb +0 -95
  62. data/lib/wukong/and_pig/symbol.rb +0 -29
  63. data/lib/wukong/and_pig/utils.rb +0 -0
@@ -1,7 +0,0 @@
1
- module Wukong
2
- module AndPig
3
- class PigVar
4
-
5
- end
6
- end
7
- end
@@ -1,15 +0,0 @@
1
- module Wukong
2
- module AndPig
3
- class PigVar
4
-
5
- # ===========================================================================
6
- #
7
- # STREAM
8
- #
9
- def stream options={}
10
- new_in_chain klass, "STREAM #{relation}"
11
- end
12
- end
13
- end
14
- end
15
-
@@ -1,29 +0,0 @@
1
- module Wukong
2
- module AndPig
3
- class PigVar
4
- # ===========================================================================
5
- #
6
- # Pig expressions
7
- #
8
-
9
- #
10
- def dfs cmd, filename
11
- # note == no '' on path
12
- self.class.emit "%-23s\t %s" % [cmd, filename]
13
- end
14
- #
15
- # remove the stored file
16
- #
17
- def rmf! filename
18
- dfs :rmf, filename
19
- end
20
-
21
- #
22
- #
23
- #
24
- def mkdir filename
25
- dfs :mkdir, filename
26
- end
27
- end
28
- end
29
- end
@@ -1,98 +0,0 @@
1
- #
2
- # The FOREACH relational operator
3
- #
4
- module Wukong
5
- module AndPig
6
- class PigVar
7
-
8
- # ===========================================================================
9
- #
10
- # FOREACH
11
- #
12
- def generate lval, *field_specs
13
- gen_clauses = field_specs.map{|field_spec| parse_gen_clause(field_spec)}.flatten
14
- l_klass = TypedStruct.new(* gen_clauses.map(&:name_type))
15
- l_cmd = "FOREACH #{self.relation} GENERATE\n #{gen_clauses.join(",\n ")}"
16
- new_in_chain(lval, l_klass, l_cmd)
17
- end
18
-
19
- #
20
- # for a list of GENERATE args, we need
21
- #
22
- # * gen_clauses, the clause to stuff into the GENERATE line
23
- # gen_expr AS gen_field_name: gen_field_type
24
- #
25
- # * new_types, the resulting types for each
26
- #
27
- # gen_expr common cases include
28
- #
29
- # field
30
- # Rel::field
31
- # Rel.(field)
32
- # "ComplicatedExpression"
33
- #
34
- #
35
- # field_attrs
36
- #
37
- #
38
- def parse_gen_clause field_spec
39
- case field_spec
40
- when AS
41
- field_spec
42
- when Symbol
43
- AS[field_spec, field_spec, field_type(field_spec)];
44
- when Array
45
- alias_in, field_in, name, type = field_spec
46
- name ||= field_in
47
- type = alias_in.field_type(field_in)
48
- AS[field_in, name, type, alias_in.relationize]
49
- when Hash
50
- field_spec.map do |field_in, field_out|
51
- AS[field_in, field_out, field_type(field_in)]
52
- end
53
- else raise "Don't know how to specify type for #{field_specs.inspect}"
54
- end
55
- end
56
- end
57
- end
58
- end
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
- # # when Array
67
- # # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
68
- # # field_expr, field_attr, field_type = field_spec
69
- # # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
70
- # # gen_clauses << "#{field_expr} AS #{field_as}"
71
- # # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
72
-
73
- # def prelimify *field_specs
74
- # gen_clauses = []
75
- # field_attrs = []
76
- # field_specs.map do |field_spec|
77
- # unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
78
- # field_expr, field_attr = field_spec
79
- # gen_clauses << "#{field_expr}"
80
- # field_attrs += [field_attr].flatten
81
- # end
82
- # [ gen_clauses, field_attrs ]
83
- # end
84
- #
85
- # # def generate *args
86
- # # gen_clauses, field_attrs = self.class.fieldify *args
87
- # # l_klass = TypedStruct.new(*field_attrs)
88
- # # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
89
- # # end
90
- #
91
- # def foreach *args
92
- # generate_clause = args.pop
93
- # prelim_exprs, prelim_attrs = prelimify *args
94
- # prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
95
- # gen_clauses, field_attrs = fieldify *generate_clause
96
- # l_klass = TypedStruct.new(*field_attrs)
97
- # new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
98
- # end
@@ -1,212 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- #
3
- # The GROUP, COGROUP and JOIN relational operators
4
- #
5
- module Wukong
6
- module AndPig
7
- class PigVar
8
-
9
- #===========================================================================
10
- #
11
- # GROUP and COGROUP
12
- #
13
-
14
- #
15
- # COGROUP - Groups the data in two or more relations.
16
- #
17
- # == Syntax
18
- #
19
- # alias = COGROUP alias1 BY field_alias [INNER | OUTER],
20
- # aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
21
- #
22
- # == Structure
23
- #
24
- # { group, <structure of alias1>, <structure of alias2>, ... }
25
- #
26
- # == Terms
27
- #
28
- # * alias The name of a relation.
29
- #
30
- # * field_alias The name of one or more fields in a relation. If multiple
31
- # fields are specified, separate with commas and enclose
32
- # in parentheses. For example, X = COGROUP A BY (f1, f2);
33
- #
34
- # The number of fields specified in each BY clause must
35
- # match. For example, X = COGROUP A BY (a1,a2,a3), B BY
36
- # (b1,b2,b3);
37
- #
38
- # * BY Keyword.
39
- #
40
- # * INNER Eliminate NULLs on that grouping
41
- # * OUTER Do not eliminate NULLs on that grouping (default)
42
- #
43
- # * PARALLEL n -- Increase the parallelism of a job by specifying the
44
- # number of reduce tasks, n. The optimal number of
45
- # parallel tasks depends on the amount of memory on each
46
- # node and the memory required by each of the tasks. To
47
- # determine n, use the following as a general guideline:
48
- #
49
- # n = (nr_nodes - 1) * 0.45 * nr_GB
50
- #
51
- # where nr_nodes is the number of nodes used and nr_GB is
52
- # the amount of physical memory on each node.
53
- #
54
- # Note the following:
55
- # - Parallel only affects the number of reduce tasks. Map
56
- # parallelism is determined by the input file, one map
57
- # for each HDFS block.
58
- # - If you don’t specify parallel, you still get the same
59
- # map parallelism but only one reduce task.
60
- #
61
- # == Usage
62
- #
63
- # The COGROUP operator groups the data in two or more relations based on
64
- # the common field values.
65
- #
66
- # Note: The COGROUP and JOIN operators perform similar functions. COGROUP
67
- # creates a nested set of output tuples while JOIN creates a flat set of
68
- # output tuples with NULLs eliminated.
69
- #
70
- # == Examples
71
- #
72
- # Suppose we have two relations, A and B.
73
- #
74
- # A: (owner:chararray, pet:chararray)
75
- # ---------------
76
- # (Alice, cat)
77
- # (Alice, goldfish)
78
- # (Alice, turtle)
79
- # (Bob, cat)
80
- # (Bob, dog)
81
- #
82
- # B: (friend1:chararray, friend2:chararray)
83
- # ---------------------
84
- # (Cindy, Alice)
85
- # (Mark, Alice)
86
- # (Paul, Bob)
87
- # (Paul, Jane)
88
- #
89
- # In this example tuples are co-grouped using field “owner” from relation
90
- # A and field “friend2” from relation B as the key fields. The DESCRIBE
91
- # operator shows the schema for relation X, which has three fields, "group",
92
- # "A" and "B" (for an explanation, see GROUP).
93
- #
94
- # X = COGROUP A BY owner, B BY friend2;
95
- # DESCRIBE X;
96
- #
97
- # X: {group: chararray,
98
- # A: {owner: chararray,pet: chararray},
99
- # B: {friend1: chararray,friend2: chararray}}
100
- #
101
- # Relation X looks like this. A tuple is created for each unique key
102
- # field. The tuple includes the key field and two bags. The first bag is
103
- # the tuples from the first relation with the matching key field. The
104
- # second bag is the tuples from the second relation with the matching key
105
- # field. If no tuples match the key field, the bag is empty.
106
- #
107
- # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
108
- # {(Cindy, Alice), (Mark, Alice)})
109
- # (Bob, {(Bob, dog), (Bob, cat)},
110
- # {(Paul, Bob)})
111
- # (Jane, {},
112
- # {(Paul, Jane)})
113
- #
114
- # In this example tuples are co-grouped and the INNER keyword is used to
115
- # ensure that only bags with at least one tuple are returned.
116
- #
117
- # X = COGROUP A BY owner INNER, B BY friend2 INNER;
118
- #
119
- # Relation X looks like this.
120
- #
121
- # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
122
- # {(Cindy, Alice), (Mark, Alice)})
123
- # (Bob, {(Bob, dog), (Bob, cat)},
124
- # {(Paul, Bob)})
125
- #
126
- # In this example tuples are co-grouped and the INNER keyword is used
127
- # asymmetrically on only one of the relations.
128
- #
129
- # X = COGROUP A BY owner, B BY friend2 INNER;
130
- #
131
- # Relation X looks like this.
132
- #
133
- # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
134
- # {(Cindy, Alice), (Mark, Alice)})
135
- # (Bob, {(Bob, dog), (Bob, cat)},
136
- # {(Paul, Bob)})
137
- # (Jane, {},
138
- # {(Paul, Jane)})
139
- #
140
- #
141
- def group group_by
142
- l_klass = l_klass_for_group group_by
143
- by_clause = self.class.make_by_clause(group_by)
144
- new_in_chain anon, l_klass, "GROUP #{relation} #{by_clause}"
145
- end
146
-
147
- def self.make_by_clause by_spec
148
- case by_spec
149
- when Array then 'BY ' + by_spec.join(", ")
150
- when :all then 'ALL'
151
- when Symbol then "BY #{by_spec}"
152
- when String then by_spec
153
- when Hash then make_by_clause(by_spec[:by])
154
- else raise "Don't know how to group on #{by_spec.inspect}"
155
- end
156
- end
157
- def types_for_fields field
158
- klass.members_types[field]
159
- end
160
- def l_klass_for_group group_by
161
- self.class.l_klass_for_group group_by, self
162
- end
163
- def self.l_klass_for_group group_by, *rels
164
- TypedStruct.new(
165
- [:group, rels.first.types_for_fields(group_by)],
166
- *rels.map{|rel| [rel.relation, rel.klass] }
167
- )
168
- end
169
-
170
- #
171
- # COGROUP pig expression:
172
- # UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
173
- #
174
- def self.cogroup lval, *by
175
- by_clause = by.map do |relation, group_by, as|
176
- "%s %s" % [relation.relation, make_by_clause(group_by)]
177
- end.join(", ")
178
- l_klass = l_klass_for_group by[0][1], *by.map(&:first)
179
- rval = new l_klass, lval, "COGROUP #{by_clause}"
180
- set lval, rval
181
- end
182
-
183
- def cogroup *args
184
- self.class.cogroup self, *args
185
- end
186
-
187
-
188
- # ===========================================================================
189
- #
190
- # JOIN
191
- #
192
- def self.klass_from_join by
193
- klasses = by.map(&:first)
194
- TypedStruct.new(*klasses.zip(klasses.map(&:klass)))
195
- end
196
-
197
- def self.join_by_clause by
198
- by.map{|rel, field| "#{rel.relationize} BY #{field}" }.join(", ")
199
- end
200
-
201
- def self.join lval, by
202
- parallel = by.delete(:parallel)
203
- cmd = "JOIN " + join_by_clause(by)
204
- parallelize! cmd, :parallel => parallel
205
- l_klass = klass_from_join(by)
206
- rval = new(l_klass, lval, cmd)
207
- set lval, rval
208
- end
209
-
210
- end
211
- end
212
- end
@@ -1,65 +0,0 @@
1
- # == Load/StoreFunctions ==
2
- # BinaryDeserializer
3
- # BinarySerializer
4
- # BinStorage
5
- # PigStorage
6
- # PigDump
7
- # TextLoader
8
-
9
- module Wukong
10
- module AndPig
11
- class PigVar
12
- #===========================================================================
13
- #
14
- # The "LOAD" pig expression:
15
- # MyRelation = LOAD 'my_relation.tsv' AS (attr_a: int, attr_b: chararray) ;
16
- #
17
- # The AS type spec is generated from klass
18
- #
19
- def self.pig_load rel, klass, options={ }
20
- filename = options[:filename] || default_filename(rel)
21
- self.set rel, self.new(klass, rel, "LOAD '#{filename}' AS #{klass.typify(options[:has_rsrc])}")
22
- if options[:has_rsrc]
23
- lval = self[rel]
24
- lval.generate lval, *lval.fields
25
- end
26
- rel
27
- end
28
-
29
- #===========================================================================
30
- #
31
- #
32
- # The "STORE" pig imperative:
33
- # STORE Relation INTO 'filename'
34
- # If no filename is given, the relation's name is used
35
- #
36
- def store filename=nil
37
- filename ||= default_filename
38
- self.class.emit "STORE %-19s INTO '%s'" % [relation, filename]
39
- self
40
- end
41
-
42
- # Store the relation, removing the existing file
43
- def store! filename=nil
44
- filename ||= default_filename
45
- rmf! filename
46
- mkdir File.dirname(filename)
47
- store filename
48
- end
49
-
50
- # Force a store to disk, then load (so all calculations proceed from there)
51
- def checkpoint! options={}
52
- options = options.reverse_merge :filename => default_filename
53
- store! options[:filename]
54
- self.class.pig_load(self.name, klass, options)
55
- end
56
-
57
- def default_filename
58
- self.class.default_filename self.name
59
- end
60
- def self.default_filename name
61
- File.join(working_dir, name.to_s)
62
- end
63
- end
64
- end
65
- end
@@ -1,42 +0,0 @@
1
- # == DiagnosticOperators
2
- # describe
3
- # dump
4
- # explain
5
- # illustrate
6
- # == UDFStatements
7
- # define
8
- # register
9
-
10
- module Wukong
11
- module AndPig
12
- class PigVar
13
- # DESCRIBE pig imperative
14
- def describe
15
- self.class.describe self
16
- end
17
- def self.describe rel
18
- emit %Q{ -- PREDICTED: #{rel.klass.typify} }
19
- simple_declaration :describe, rel.relationize
20
- rel
21
- end
22
-
23
- # DUMP pig imperative
24
- def dump() simple_operation :dump end
25
-
26
- # EXPLAIN pig imperative
27
- def explain() simple_operation :explain end
28
-
29
- # ILLUSTRATE pig imperative
30
- def illustrate() simple_operation :illustrate end
31
-
32
-
33
- def self.define pig_alias, *args
34
- emit_imperative :DEFINE, pig_alias, args
35
- end
36
-
37
- def self.register path_to_jar
38
- emit_imperative :REGISTER, path_to_jar
39
- end
40
- end
41
- end
42
- end