wukong 0.1.4 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/INSTALL.textile +89 -0
  2. data/README.textile +41 -74
  3. data/docpages/INSTALL.textile +94 -0
  4. data/{doc → docpages}/LICENSE.textile +0 -0
  5. data/{doc → docpages}/README-wulign.textile +6 -0
  6. data/docpages/UsingWukong-part1-get_ready.textile +17 -0
  7. data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
  8. data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
  9. data/docpages/_config.yml +39 -0
  10. data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
  11. data/{doc → docpages}/code/api_response_example.txt +0 -0
  12. data/{doc → docpages}/code/parser_skeleton.rb +0 -0
  13. data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
  14. data/docpages/favicon.ico +0 -0
  15. data/docpages/gem.css +16 -0
  16. data/docpages/hadoop-tips.textile +83 -0
  17. data/docpages/index.textile +90 -0
  18. data/docpages/intro.textile +8 -0
  19. data/docpages/moreinfo.textile +174 -0
  20. data/docpages/news.html +24 -0
  21. data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
  22. data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
  23. data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
  24. data/docpages/tutorial.textile +283 -0
  25. data/docpages/usage.textile +195 -0
  26. data/docpages/wutils.textile +263 -0
  27. data/wukong.gemspec +80 -50
  28. metadata +87 -54
  29. data/doc/INSTALL.textile +0 -41
  30. data/doc/README-tutorial.textile +0 -163
  31. data/doc/README-wutils.textile +0 -128
  32. data/doc/TODO.textile +0 -61
  33. data/doc/UsingWukong-part1-setup.textile +0 -2
  34. data/doc/UsingWukong-part2-scraping.textile +0 -2
  35. data/doc/hadoop-nfs.textile +0 -51
  36. data/doc/hadoop-setup.textile +0 -29
  37. data/doc/index.textile +0 -124
  38. data/doc/links.textile +0 -42
  39. data/doc/usage.textile +0 -102
  40. data/doc/utils.textile +0 -48
  41. data/examples/and_pig/sample_queries.rb +0 -128
  42. data/lib/wukong/and_pig.rb +0 -62
  43. data/lib/wukong/and_pig/README.textile +0 -12
  44. data/lib/wukong/and_pig/as.rb +0 -37
  45. data/lib/wukong/and_pig/data_types.rb +0 -30
  46. data/lib/wukong/and_pig/functions.rb +0 -50
  47. data/lib/wukong/and_pig/generate.rb +0 -85
  48. data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
  49. data/lib/wukong/and_pig/junk.rb +0 -51
  50. data/lib/wukong/and_pig/operators.rb +0 -8
  51. data/lib/wukong/and_pig/operators/compound.rb +0 -29
  52. data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
  53. data/lib/wukong/and_pig/operators/execution.rb +0 -15
  54. data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
  55. data/lib/wukong/and_pig/operators/foreach.rb +0 -98
  56. data/lib/wukong/and_pig/operators/groupies.rb +0 -212
  57. data/lib/wukong/and_pig/operators/load_store.rb +0 -65
  58. data/lib/wukong/and_pig/operators/meta.rb +0 -42
  59. data/lib/wukong/and_pig/operators/relational.rb +0 -129
  60. data/lib/wukong/and_pig/pig_struct.rb +0 -48
  61. data/lib/wukong/and_pig/pig_var.rb +0 -95
  62. data/lib/wukong/and_pig/symbol.rb +0 -29
  63. data/lib/wukong/and_pig/utils.rb +0 -0
@@ -1,7 +0,0 @@
1
- module Wukong
2
- module AndPig
3
- class PigVar
4
-
5
- end
6
- end
7
- end
@@ -1,15 +0,0 @@
1
- module Wukong
2
- module AndPig
3
- class PigVar
4
-
5
- # ===========================================================================
6
- #
7
- # STREAM
8
- #
9
- def stream options={}
10
- new_in_chain klass, "STREAM #{relation}"
11
- end
12
- end
13
- end
14
- end
15
-
@@ -1,29 +0,0 @@
1
- module Wukong
2
- module AndPig
3
- class PigVar
4
- # ===========================================================================
5
- #
6
- # Pig expressions
7
- #
8
-
9
- #
10
- def dfs cmd, filename
11
- # note: the path is emitted as-is, without surrounding quotes
12
- self.class.emit "%-23s\t %s" % [cmd, filename]
13
- end
14
- #
15
- # remove the stored file
16
- #
17
- def rmf! filename
18
- dfs :rmf, filename
19
- end
20
-
21
- #
22
- #
23
- #
24
- def mkdir filename
25
- dfs :mkdir, filename
26
- end
27
- end
28
- end
29
- end
@@ -1,98 +0,0 @@
1
- #
2
- # The FOREACH relational operator
3
- #
4
- module Wukong
5
- module AndPig
6
- class PigVar
7
-
8
- # ===========================================================================
9
- #
10
- # FOREACH
11
- #
12
- def generate lval, *field_specs
13
- gen_clauses = field_specs.map{|field_spec| parse_gen_clause(field_spec)}.flatten
14
- l_klass = TypedStruct.new(* gen_clauses.map(&:name_type))
15
- l_cmd = "FOREACH #{self.relation} GENERATE\n #{gen_clauses.join(",\n ")}"
16
- new_in_chain(lval, l_klass, l_cmd)
17
- end
18
-
19
- #
20
- # for a list of GENERATE args, we need
21
- #
22
- # * gen_clauses, the clause to stuff into the GENERATE line
23
- # gen_expr AS gen_field_name: gen_field_type
24
- #
25
- # * new_types, the resulting types for each
26
- #
27
- # gen_expr common cases include
28
- #
29
- # field
30
- # Rel::field
31
- # Rel.(field)
32
- # "ComplicatedExpression"
33
- #
34
- #
35
- # field_attrs
36
- #
37
- #
38
- def parse_gen_clause field_spec
39
- case field_spec
40
- when AS
41
- field_spec
42
- when Symbol
43
- AS[field_spec, field_spec, field_type(field_spec)];
44
- when Array
45
- alias_in, field_in, name, type = field_spec
46
- name ||= field_in
47
- type = alias_in.field_type(field_in)
48
- AS[field_in, name, type, alias_in.relationize]
49
- when Hash
50
- field_spec.map do |field_in, field_out|
51
- AS[field_in, field_out, field_type(field_in)]
52
- end
53
- else raise "Don't know how to specify type for #{field_specs.inspect}"
54
- end
55
- end
56
- end
57
- end
58
- end
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
- # # when Array
67
- # # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
68
- # # field_expr, field_attr, field_type = field_spec
69
- # # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
70
- # # gen_clauses << "#{field_expr} AS #{field_as}"
71
- # # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
72
-
73
- # def prelimify *field_specs
74
- # gen_clauses = []
75
- # field_attrs = []
76
- # field_specs.map do |field_spec|
77
- # unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
78
- # field_expr, field_attr = field_spec
79
- # gen_clauses << "#{field_expr}"
80
- # field_attrs += [field_attr].flatten
81
- # end
82
- # [ gen_clauses, field_attrs ]
83
- # end
84
- #
85
- # # def generate *args
86
- # # gen_clauses, field_attrs = self.class.fieldify *args
87
- # # l_klass = TypedStruct.new(*field_attrs)
88
- # # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
89
- # # end
90
- #
91
- # def foreach *args
92
- # generate_clause = args.pop
93
- # prelim_exprs, prelim_attrs = prelimify *args
94
- # prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
95
- # gen_clauses, field_attrs = fieldify *generate_clause
96
- # l_klass = TypedStruct.new(*field_attrs)
97
- # new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
98
- # end
@@ -1,212 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- #
3
- # The GROUP, COGROUP and JOIN relational operators
4
- #
5
- module Wukong
6
- module AndPig
7
- class PigVar
8
-
9
- #===========================================================================
10
- #
11
- # GROUP and COGROUP
12
- #
13
-
14
- #
15
- # COGROUP - Groups the data in two or more relations.
16
- #
17
- # == Syntax
18
- #
19
- # alias = COGROUP alias1 BY field_alias [INNER | OUTER],
20
- # aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
21
- #
22
- # == Structure
23
- #
24
- # { group, <structure of alias1>, <structure of alias2>, ... }
25
- #
26
- # == Terms
27
- #
28
- # * alias The name of a relation.
29
- #
30
- # * field_alias The name of one or more fields in a relation. If multiple
31
- # fields are specified, separate with commas and enclose
32
- # in parentheses. For example, X = COGROUP A BY (f1, f2);
33
- #
34
- # The number of fields specified in each BY clause must
35
- # match. For example, X = COGROUP A BY (a1,a2,a3), B BY
36
- # (b1,b2,b3);
37
- #
38
- # * BY Keyword.
39
- #
40
- # * INNER Eliminate NULLs on that grouping
41
- # * OUTER Do not eliminate NULLs on that grouping (default)
42
- #
43
- # * PARALLEL n -- Increase the parallelism of a job by specifying the
44
- # number of reduce tasks, n. The optimal number of
45
- # parallel tasks depends on the amount of memory on each
46
- # node and the memory required by each of the tasks. To
47
- # determine n, use the following as a general guideline:
48
- #
49
- # n = (nr_nodes - 1) * 0.45 * nr_GB
50
- #
51
- # where nr_nodes is the number of nodes used and nr_GB is
52
- # the amount of physical memory on each node.
53
- #
54
- # Note the following:
55
- # - Parallel only affects the number of reduce tasks. Map
56
- # parallelism is determined by the input file, one map
57
- # for each HDFS block.
58
- # - If you don’t specify parallel, you still get the same
59
- # map parallelism but only one reduce task.
60
- #
61
- # == Usage
62
- #
63
- # The COGROUP operator groups the data in two or more relations based on
64
- # the common field values.
65
- #
66
- # Note: The COGROUP and JOIN operators perform similar functions. COGROUP
67
- # creates a nested set of output tuples while JOIN creates a flat set of
68
- # output tuples with NULLs eliminated.
69
- #
70
- # == Examples
71
- #
72
- # Suppose we have two relations, A and B.
73
- #
74
- # A: (owner:chararray, pet:chararray)
75
- # ---------------
76
- # (Alice, cat)
77
- # (Alice, goldfish)
78
- # (Alice, turtle)
79
- # (Bob, cat)
80
- # (Bob, dog)
81
- #
82
- # B: (friend1:chararray, friend2:chararray)
83
- # ---------------------
84
- # (Cindy, Alice)
85
- # (Mark, Alice)
86
- # (Paul, Bob)
87
- # (Paul, Jane)
88
- #
89
- # In this example tuples are co-grouped using field “owner” from relation
90
- # A and field “friend2” from relation B as the key fields. The DESCRIBE
91
- # operator shows the schema for relation X, which has three fields, "group",
92
- # "A" and "B" (for an explanation, see GROUP).
93
- #
94
- # X = COGROUP A BY owner, B BY friend2;
95
- # DESCRIBE X;
96
- #
97
- # X: {group: chararray,
98
- # A: {owner: chararray,pet: chararray},
99
- # B: {friend1: chararray,friend2: chararray}}
100
- #
101
- # Relation X looks like this. A tuple is created for each unique key
102
- # field. The tuple includes the key field and two bags. The first bag is
103
- # the tuples from the first relation with the matching key field. The
104
- # second bag is the tuples from the second relation with the matching key
105
- # field. If no tuples match the key field, the bag is empty.
106
- #
107
- # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
108
- # {(Cindy, Alice), (Mark, Alice)})
109
- # (Bob, {(Bob, dog), (Bob, cat)},
110
- # {(Paul, Bob)})
111
- # (Jane, {},
112
- # {(Paul, Jane)})
113
- #
114
- # In this example tuples are co-grouped and the INNER keyword is used to
115
- # ensure that only bags with at least one tuple are returned.
116
- #
117
- # X = COGROUP A BY owner INNER, B BY friend2 INNER;
118
- #
119
- # Relation X looks like this.
120
- #
121
- # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
122
- # {(Cindy, Alice), (Mark, Alice)})
123
- # (Bob, {(Bob, dog), (Bob, cat)},
124
- # {(Paul, Bob)})
125
- #
126
- # In this example tuples are co-grouped and the INNER keyword is used
127
- # asymmetrically on only one of the relations.
128
- #
129
- # X = COGROUP A BY owner, B BY friend2 INNER;
130
- #
131
- # Relation X looks like this.
132
- #
133
- # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
134
- # {(Cindy, Alice), (Mark, Alice)})
135
- # (Bob, {(Bob, dog), (Bob, cat)},
136
- # {(Paul, Bob)})
137
- # (Jane, {},
138
- # {(Paul, Jane)})
139
- #
140
- #
141
- def group group_by
142
- l_klass = l_klass_for_group group_by
143
- by_clause = self.class.make_by_clause(group_by)
144
- new_in_chain anon, l_klass, "GROUP #{relation} #{by_clause}"
145
- end
146
-
147
- def self.make_by_clause by_spec
148
- case by_spec
149
- when Array then 'BY ' + by_spec.join(", ")
150
- when :all then 'ALL'
151
- when Symbol then "BY #{by_spec}"
152
- when String then by_spec
153
- when Hash then make_by_clause(by_spec[:by])
154
- else raise "Don't know how to group on #{by_spec.inspect}"
155
- end
156
- end
157
- def types_for_fields field
158
- klass.members_types[field]
159
- end
160
- def l_klass_for_group group_by
161
- self.class.l_klass_for_group group_by, self
162
- end
163
- def self.l_klass_for_group group_by, *rels
164
- TypedStruct.new(
165
- [:group, rels.first.types_for_fields(group_by)],
166
- *rels.map{|rel| [rel.relation, rel.klass] }
167
- )
168
- end
169
-
170
- #
171
- # COGROUP pig expression:
172
- # UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
173
- #
174
- def self.cogroup lval, *by
175
- by_clause = by.map do |relation, group_by, as|
176
- "%s %s" % [relation.relation, make_by_clause(group_by)]
177
- end.join(", ")
178
- l_klass = l_klass_for_group by[0][1], *by.map(&:first)
179
- rval = new l_klass, lval, "COGROUP #{by_clause}"
180
- set lval, rval
181
- end
182
-
183
- def cogroup *args
184
- self.class.cogroup self, *args
185
- end
186
-
187
-
188
- # ===========================================================================
189
- #
190
- # JOIN
191
- #
192
- def self.klass_from_join by
193
- klasses = by.map(&:first)
194
- TypedStruct.new(*klasses.zip(klasses.map(&:klass)))
195
- end
196
-
197
- def self.join_by_clause by
198
- by.map{|rel, field| "#{rel.relationize} BY #{field}" }.join(", ")
199
- end
200
-
201
- def self.join lval, by
202
- parallel = by.delete(:parallel)
203
- cmd = "JOIN " + join_by_clause(by)
204
- parallelize! cmd, :parallel => parallel
205
- l_klass = klass_from_join(by)
206
- rval = new(l_klass, lval, cmd)
207
- set lval, rval
208
- end
209
-
210
- end
211
- end
212
- end
@@ -1,65 +0,0 @@
1
- # == Load/StoreFunctions ==
2
- # BinaryDeserializer
3
- # BinarySerializer
4
- # BinStorage
5
- # PigStorage
6
- # PigDump
7
- # TextLoader
8
-
9
- module Wukong
10
- module AndPig
11
- class PigVar
12
- #===========================================================================
13
- #
14
- # The "LOAD" pig expression:
15
- # MyRelation = LOAD 'my_relation.tsv' AS (attr_a: int, attr_b: chararray) ;
16
- #
17
- # The AS type spec is generated from klass
18
- #
19
- def self.pig_load rel, klass, options={ }
20
- filename = options[:filename] || default_filename(rel)
21
- self.set rel, self.new(klass, rel, "LOAD '#{filename}' AS #{klass.typify(options[:has_rsrc])}")
22
- if options[:has_rsrc]
23
- lval = self[rel]
24
- lval.generate lval, *lval.fields
25
- end
26
- rel
27
- end
28
-
29
- #===========================================================================
30
- #
31
- #
32
- # The "STORE" pig imperative:
33
- # STORE Relation INTO 'filename'
34
- # If no filename is given, the relation's name is used
35
- #
36
- def store filename=nil
37
- filename ||= default_filename
38
- self.class.emit "STORE %-19s INTO '%s'" % [relation, filename]
39
- self
40
- end
41
-
42
- # Store the relation, removing the existing file
43
- def store! filename=nil
44
- filename ||= default_filename
45
- rmf! filename
46
- mkdir File.dirname(filename)
47
- store filename
48
- end
49
-
50
- # Force a store to disk, then load (so all calculations proceed from there)
51
- def checkpoint! options={}
52
- options = options.reverse_merge :filename => default_filename
53
- store! options[:filename]
54
- self.class.pig_load(self.name, klass, options)
55
- end
56
-
57
- def default_filename
58
- self.class.default_filename self.name
59
- end
60
- def self.default_filename name
61
- File.join(working_dir, name.to_s)
62
- end
63
- end
64
- end
65
- end
@@ -1,42 +0,0 @@
1
- # == DiagnosticOperators
2
- # describe
3
- # dump
4
- # explain
5
- # illustrate
6
- # == UDFStatements
7
- # define
8
- # register
9
-
10
- module Wukong
11
- module AndPig
12
- class PigVar
13
- # DESCRIBE pig imperative
14
- def describe
15
- self.class.describe self
16
- end
17
- def self.describe rel
18
- emit %Q{ -- PREDICTED: #{rel.klass.typify} }
19
- simple_declaration :describe, rel.relationize
20
- rel
21
- end
22
-
23
- # DUMP pig imperative
24
- def dump() simple_operation :dump end
25
-
26
- # EXPLAIN pig imperative
27
- def explain() simple_operation :explain end
28
-
29
- # ILLUSTRATE pig imperative
30
- def illustrate() simple_operation :illustrate end
31
-
32
-
33
- def self.define pig_alias, *args
34
- emit_imperative :DEFINE, pig_alias, args
35
- end
36
-
37
- def self.register path_to_jar
38
- emit_imperative :REGISTER, path_to_jar
39
- end
40
- end
41
- end
42
- end