mrflip-wukong 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,85 @@
1
+ require 'rubygems'
2
+ require 'active_support'
3
+
4
String.class_eval do
  #
  # Relation name for a handle: the camelized form of this string.
  #
  def relationize
    camelize
  end
end
10
Symbol.class_eval do
  #
  # Relation name for a handle: delegates to String#relationize
  # on this symbol's string form.
  #
  def relationize() to_s.relationize end
end
18
+
19
Object.class_eval do
  # Default type for any object: its own class.
  def typify
    self.class
  end

  # Symbol handle for this object: the underscored string form with
  # everything up to and including the last '/' stripped off.
  def symbolize
    underscored = self.to_s.underscore
    underscored.gsub(%r{.*/}, '').to_sym
  end
end
26
+
27
# Pig type for each Ruby class that may be used as a field type.
# These are singleton methods on the classes themselves (Integer.typify),
# not on their instances.
class << Integer ; def typify() 'int'       end ; end
# Bignum was unified into Integer in Ruby 2.4 and the constant was removed in
# Ruby 3.2. Patch it only where it is still a distinct class: otherwise this
# line either raises NameError or silently turns Integer.typify into 'long'.
if Object.const_defined?(:Bignum) && !Object.const_get(:Bignum).equal?(Integer)
  class << Bignum ; def typify() 'long'     end ; end
end
class << Float   ; def typify() 'float'     end ; end
class << String  ; def typify() 'chararray' end ; end
class << Symbol  ; def typify() self        end ; end
class << Date    ; def typify() 'long'      end ; end
33
+
34
+ # Array.class_eval do
35
+ # def typify()
36
+ # "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
37
+ # end
38
+ # end
39
+ # class Tuple
40
+ # attr_accessor :contents
41
+ # def initialize *args
42
+ # self.contents = args
43
+ # end
44
+ # def typify
45
+ # "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
46
+ # end
47
+ # #
48
+ # # Sugar for creating a new bag. The following are equivalent:
49
+ # #
50
+ # # Bag[:foo]
51
+ # # Bag.new :foo
52
+ # #
53
+ # def self.[] *args
54
+ # new *args
55
+ # end
56
+ # end
57
+
58
module BagMethods
  # Including BagMethods extends the including class with ClassMethods.
  def self.included(base)
    base.extend(ClassMethods)
  end

  module ClassMethods
    #
    # Pig type string for the bag: each member paired with the pig type
    # string of its type, comma-separated and wrapped in braces.
    #
    def typify
      fields = members.zip(mtypes).map { |attr, mtype| "#{attr}: #{mtype.typify}" }
      "{ " + fields.join(', ') + " }"
    end
  end
end
75
+
76
class Bag < TypedStruct
  class << self
    # Build the struct class via the superclass, then mix BagMethods into it.
    # The return value is whatever the class_eval block returns, i.e. the
    # result of the include call.
    def new(*args)
      super(*args).class_eval { include BagMethods }
    end

    # Sugar for creating a new bag: Bag[...] is the same as Bag.new(...).
    def [](*args)
      new(*args)
    end
  end
end
85
+
@@ -0,0 +1,51 @@
1
+
2
+
3
module Wukong
  module AndPig

    #
    # Load the main class definitions:
    # prints the contents of init_load.pig (looked up under PIG_DEFS_DIR).
    #
    def self.init_load
      # File.read opens, reads and closes in one call; File.open(...).read
      # left the file handle open until garbage collection.
      puts File.read(PIG_DEFS_DIR+"/init_load.pig")
    end

    #
    # OK we're going to cheat here:
    # just cat the file in, and treat it as a scalar
    #
    # NOTE(review): the hadoop call is commented out, so +path+ is currently
    # ignored and the placeholder value "636" is always returned.
    #
    def load_scalar path
      # var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
      "636"
    end

    #
    # Count the distinct values of +attr+ within each +group_by+ group.
    # * dest_rel - relation to store the result in; intermediate steps use
    #              temp_rel(dest_rel)
    # * attr     - attribute whose distinct values are counted; the count is
    #              emitted as n_<attr>
    # * group_by - grouping spec, passed through to group
    #
    def count_distinct dest_rel, attr, group_by
      distincted =
        generate(temp_rel(dest_rel), attr).
        distinct(temp_rel(dest_rel), :parallel => 10)
      distincted.
        group(  temp_rel(dest_rel), group_by).
        foreach(dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
    end

    #
    # Group a relation into bins, and return the counts for each bin
    # * dest_rel - relation to store the result in; each row is
    #              {<bin_attr>_bin, <bin_attr>_count}
    # * bin_attr - attribute being binned; used to name the output fields
    # * bin_expr - expression that computes the bin (defaults to bin_attr)
    #
    def histogram dest_rel, bin_attr, bin_expr=nil
      bin_expr ||= bin_attr
      bin_name   = "#{bin_attr}_bin"
      binned     = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
      binned.
        group(  temp_rel(dest_rel), :by => bin_name).
        foreach(dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
    end

  end
end
@@ -0,0 +1,8 @@
1
+ require 'wukong/and_pig/operators/evaluators'
2
+ require 'wukong/and_pig/operators/foreach'
3
+ require 'wukong/and_pig/operators/groupies'
4
+ require 'wukong/and_pig/operators/load_store'
5
+ require 'wukong/and_pig/operators/meta'
6
+ require 'wukong/and_pig/operators/relational'
7
+ require 'wukong/and_pig/operators/file_methods'
8
+ require 'wukong/and_pig/operators/compound'
@@ -0,0 +1,29 @@
1
+ #
2
+ # Compound operations built from JOIN and GENERATE
3
+ #
4
module Wukong
  module AndPig
    class PigVar
      #
      # Select all elements in the source relation that match on the selecting relation,
      # creating a relation with the same type as the source relation.
      #
      # For example,
      #
      #   PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
      #
      # returns a relation IsolatedCvals, whose type is identical to
      # MyComplicatedValues' type, with only the elements having an id also
      # present in MyIds.
      #
      # * lval       - handle for the resulting relation
      # * on         - selecting relation, joined on +on_field+
      # * from       - source relation, joined on +from_field+
      # * options    - only :parallel is read; it is passed through to join
      #
      def self.isolate lval, on, on_field, from, from_field, options={ }
        # Read :parallel instead of deleting it, so the caller's hash is not modified.
        joined   = join anon(lval), on => on_field, from => from_field, :parallel => options[:parallel]
        isolated = joined.generate lval, { "'#{from}'" => :rsrc}, *PV[from].fields.map{|field| [from, field]}
        isolated.klass = from.klass
        isolated
      end

    end
  end
end
@@ -0,0 +1,7 @@
1
+ module Wukong
2
+ module AndPig
3
+ class PigVar
4
+
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,15 @@
1
module Wukong
  module AndPig
    class PigVar

      # ===========================================================================
      #
      # STREAM
      #
      # Chains a "STREAM <relation>" statement that keeps this relation's klass.
      # NOTE(review): +options+ is accepted but not used by the current body.
      #
      def stream(options = {})
        struct = klass
        new_in_chain(struct, "STREAM #{relation}")
      end
    end
  end
end
15
+
@@ -0,0 +1,29 @@
1
module Wukong
  module AndPig
    class PigVar
      # ===========================================================================
      #
      # Pig expressions
      #

      #
      # Emit a filesystem command line: the command left-justified in a
      # 23-character column, followed by the filename.
      #
      def dfs(cmd, filename)
        # note == no '' on path
        line = format("%-23s\t %s", cmd, filename)
        self.class.emit(line)
      end

      #
      # remove the stored file
      #
      def rmf!(filename)
        dfs(:rmf, filename)
      end

      #
      # create the directory
      #
      def mkdir(filename)
        dfs(:mkdir, filename)
      end
    end
  end
end
@@ -0,0 +1,98 @@
1
+ #
2
+ # The FOREACH relational operator
3
+ #
4
module Wukong
  module AndPig
    class PigVar

      # ===========================================================================
      #
      # FOREACH
      #
      # Chain a FOREACH ... GENERATE statement over this relation.
      #
      # * lval        - handle for the new relation
      # * field_specs - one spec per generated field; see parse_gen_clause
      #
      # The new relation's type is a TypedStruct built from each clause's
      # name_type. Returns the result of new_in_chain.
      #
      def generate lval, *field_specs
        gen_clauses = field_specs.map{|field_spec| parse_gen_clause(field_spec)}.flatten
        l_klass     = TypedStruct.new(* gen_clauses.map(&:name_type))
        l_cmd       = "FOREACH #{self.relation} GENERATE\n #{gen_clauses.join(",\n ")}"
        new_in_chain(lval, l_klass, l_cmd)
      end

      #
      # Turn one GENERATE argument into an AS clause (or, for a Hash, an array
      # of AS clauses).
      #
      # Accepted forms of field_spec:
      #
      # * AS      - used as is
      # * Symbol  - a field of this relation, kept under the same name
      # * Array   - [alias_in, field_in, name, type]: field_in taken from the
      #             relation alias_in. name defaults to field_in; type defaults
      #             to alias_in's type for field_in.
      # * Hash    - { field_in => field_out } pairs: fields of this relation,
      #             renamed
      #
      # Raises RuntimeError for any other kind of spec.
      #
      def parse_gen_clause field_spec
        case field_spec
        when AS
          field_spec
        when Symbol
          AS[field_spec, field_spec, field_type(field_spec)]
        when Array
          alias_in, field_in, name, type = field_spec
          name ||= field_in
          # An explicitly supplied type wins; only look it up when none was given.
          type ||= alias_in.field_type(field_in)
          AS[field_in, name, type, alias_in.relationize]
        when Hash
          field_spec.map do |field_in, field_out|
            AS[field_in, field_out, field_type(field_in)]
          end
        else raise "Don't know how to specify type for #{field_spec.inspect}"
        end
      end
    end
  end
end
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+ # # when Array
67
+ # # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
68
+ # # field_expr, field_attr, field_type = field_spec
69
+ # # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
70
+ # # gen_clauses << "#{field_expr} AS #{field_as}"
71
+ # # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
72
+
73
+ # def prelimify *field_specs
74
+ # gen_clauses = []
75
+ # field_attrs = []
76
+ # field_specs.map do |field_spec|
77
+ # unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
78
+ # field_expr, field_attr = field_spec
79
+ # gen_clauses << "#{field_expr}"
80
+ # field_attrs += [field_attr].flatten
81
+ # end
82
+ # [ gen_clauses, field_attrs ]
83
+ # end
84
+ #
85
+ # # def generate *args
86
+ # # gen_clauses, field_attrs = self.class.fieldify *args
87
+ # # l_klass = TypedStruct.new(*field_attrs)
88
+ # # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
89
+ # # end
90
+ #
91
+ # def foreach *args
92
+ # generate_clause = args.pop
93
+ # prelim_exprs, prelim_attrs = prelimify *args
94
+ # prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
95
+ # gen_clauses, field_attrs = fieldify *generate_clause
96
+ # l_klass = TypedStruct.new(*field_attrs)
97
+ # new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
98
+ # end
@@ -0,0 +1,212 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # The GROUP, COGROUP and JOIN relational operators
4
+ #
5
+ module Wukong
6
+ module AndPig
7
+ class PigVar
8
+
9
+ #===========================================================================
10
+ #
11
+ # GROUP and COGROUP
12
+ #
13
+
14
+ #
15
+ # COGROUP - Groups the data in two or more relations.
16
+ #
17
+ # == Syntax
18
+ #
19
+ # alias = COGROUP alias1 BY field_alias [INNER | OUTER],
20
+ # aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
21
+ #
22
+ # == Structure
23
+ #
24
+ # { group, <structure of alias1>, <structure of alias2>, ... }
25
+ #
26
+ # == Terms
27
+ #
28
+ # * alias The name of a relation.
29
+ #
30
+ # * field_alias The name of one or more fields in a relation. If multiple
31
+ # fields are specified, separate with commas and enclose
32
+ # in parentheses. For example, X = COGROUP A BY (f1, f2);
33
+ #
34
+ # The number of fields specified in each BY clause must
35
+ # match. For example, X = COGROUP A BY (a1,a2,a3), B BY
36
+ # (b1,b2,b3);
37
+ #
38
+ # * BY Keyword.
39
+ #
40
+ # * INNER Eliminate NULLs on that grouping
41
+ # * OUTER Do not eliminate NULLs on that grouping (default)
42
+ #
43
+ # * PARALLEL n -- Increase the parallelism of a job by specifying the
44
+ # number of reduce tasks, n. The optimal number of
45
+ # parallel tasks depends on the amount of memory on each
46
+ # node and the memory required by each of the tasks. To
47
+ # determine n, use the following as a general guideline:
48
+ #
49
+ # n = (nr_nodes - 1) * 0.45 * nr_GB
50
+ #
51
+ # where nr_nodes is the number of nodes used and nr_GB is
52
+ # the amount of physical memory on each node.
53
+ #
54
+ # Note the following:
55
+ # - Parallel only affects the number of reduce tasks. Map
56
+ # parallelism is determined by the input file, one map
57
+ # for each HDFS block.
58
+ # - If you don’t specify parallel, you still get the same
59
+ # map parallelism but only one reduce task.
60
+ #
61
+ # == Usage
62
+ #
63
+ # The COGROUP operator groups the data in two or more relations based on
64
+ # the common field values.
65
+ #
66
+ # Note: The COGROUP and JOIN operators perform similar functions. COGROUP
67
+ # creates a nested set of output tuples while JOIN creates a flat set of
68
+ # output tuples with NULLs eliminated.
69
+ #
70
+ # == Examples
71
+ #
72
+ # Suppose we have two relations, A and B.
73
+ #
74
+ # A: (owner:chararray, pet:chararray)
75
+ # ---------------
76
+ # (Alice, cat)
77
+ # (Alice, goldfish)
78
+ # (Alice, turtle)
79
+ # (Bob, cat)
80
+ # (Bob, dog)
81
+ #
82
+ # B: (friend1:chararray, friend2:chararray)
83
+ # ---------------------
84
+ # (Cindy, Alice)
85
+ # (Mark, Alice)
86
+ # (Paul, Bob)
87
+ # (Paul, Jane)
88
+ #
89
+ # In this example tuples are co-grouped using field “owner” from relation
90
+ # A and field “friend2” from relation B as the key fields. The DESCRIBE
91
+ # operator shows the schema for relation X, which has two fields, "group"
92
+ # and "A" (for an explanation, see GROUP).
93
+ #
94
+ # X = COGROUP A BY owner, B BY friend2;
95
+ # DESCRIBE X;
96
+ #
97
+ # X: {group: chararray,
98
+ # A: {owner: chararray,pet: chararray},
99
+ # B: {friend1: chararray,friend2: chararray}}
100
+ #
101
+ # Relation X looks like this. A tuple is created for each unique key
102
+ # field. The tuple includes the key field and two bags. The first bag is
103
+ # the tuples from the first relation with the matching key field. The
104
+ # second bag is the tuples from the second relation with the matching key
105
+ # field. If no tuples match the key field, the bag is empty.
106
+ #
107
+ # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
108
+ # {(Cindy, Alice), (Mark, Alice)})
109
+ # (Bob, {(Bob, dog), (Bob, cat)},
110
+ # {(Paul, Bob)})
111
+ # (Jane, {},
112
+ # {(Paul, Jane)})
113
+ #
114
+ # In this example tuples are co-grouped and the INNER keyword is used to
115
+ # ensure that only bags with at least one tuple are returned.
116
+ #
117
+ # X = COGROUP A BY owner INNER, B BY friend2 INNER;
118
+ #
119
+ # Relation X looks like this.
120
+ #
121
+ # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
122
+ # {(Cindy, Alice), (Mark, Alice)})
123
+ # (Bob, {(Bob, dog), (Bob, cat)},
124
+ # {(Paul, Bob)})
125
+ #
126
+ # In this example tuples are co-grouped and the INNER keyword is used
127
+ # asymmetrically on only one of the relations.
128
+ #
129
+ # X = COGROUP A BY owner, B BY friend2 INNER;
130
+ #
131
+ # Relation X looks like this.
132
+ #
133
+ # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
134
+ # {(Cindy, Alice), (Mark, Alice)})
135
+ # (Bob, {(Bob, dog), (Bob, cat)},
136
+ # {(Paul, Bob)})
137
+ # (Jane, {},
138
+ # {(Paul, Jane)})
139
+ #
140
+ #
141
# GROUP this relation by +group_by+ (any spec make_by_clause understands),
# chaining the result under an anonymous handle.
def group(group_by)
  grouped_klass = l_klass_for_group(group_by)
  by            = self.class.make_by_clause(group_by)
  new_in_chain(anon, grouped_klass, "GROUP #{relation} #{by}")
end
146
+
147
# Build the BY clause for a GROUP / COGROUP statement.
#
# * Array  - field names; more than one field is wrapped in parentheses,
#            as Pig requires: BY (f1, f2)
# * :all   - the literal ALL
# * Symbol - a single field: BY field
# * String - used verbatim
# * Hash   - the spec under its :by key
#
# Raises RuntimeError for anything else.
def self.make_by_clause by_spec
  case by_spec
  when Array
    fields = by_spec.join(", ")
    # A single field needs no parentheses; keep that output unchanged.
    by_spec.length > 1 ? "BY (#{fields})" : "BY #{fields}"
  when :all   then 'ALL'
  when Symbol then "BY #{by_spec}"
  when String then by_spec
  when Hash   then make_by_clause(by_spec[:by])
  else raise "Don't know how to group on #{by_spec.inspect}"
  end
end
157
# Type recorded for +field+ in this relation's klass.
def types_for_fields(field)
  member_types = klass.members_types
  member_types[field]
end
160
# Result type for grouping this relation alone: delegates to the class-level
# l_klass_for_group with this relation as the only one.
def l_klass_for_group(group_by)
  self.class.l_klass_for_group(group_by, self)
end
163
# Result type for a GROUP / COGROUP: a TypedStruct whose first member is
# :group (typed from the first relation's grouping field), followed by one
# member per relation, named after the relation and typed by its klass.
def self.l_klass_for_group(group_by, *rels)
  group_member = [:group, rels.first.types_for_fields(group_by)]
  rel_members  = rels.map { |rel| [rel.relation, rel.klass] }
  TypedStruct.new(group_member, *rel_members)
end
169
+
170
+ #
171
+ # COGROUP pig expression:
172
+ # UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
173
+ #
174
# Build a COGROUP statement and register it under +lval+.
#
# * lval - handle for the resulting relation
# * by   - one [relation, group_by] entry per co-grouped relation (a third
#          element, if present, is ignored)
#
# The result type is derived from the first entry's group_by and all of the
# relations. Returns the result of set.
def self.cogroup(lval, *by)
  clauses = by.map do |relation, group_by, _as|
    "#{relation.relation} #{make_by_clause(group_by)}"
  end
  by_clause = clauses.join(", ")
  l_klass   = l_klass_for_group(by[0][1], *by.map(&:first))
  rval      = new(l_klass, lval, "COGROUP #{by_clause}")
  set(lval, rval)
end
182
+
183
# Instance-level convenience: forwards to the class-level cogroup, passing
# this relation as the first argument.
def cogroup(*args)
  self.class.cogroup(self, *args)
end
186
+
187
+
188
+ # ===========================================================================
189
+ #
190
+ # JOIN
191
+ #
192
# Result type for a JOIN: a TypedStruct with one member per joined relation,
# named by the relation and typed by that relation's klass.
def self.klass_from_join(by)
  rels      = by.map(&:first)
  rel_types = rels.map(&:klass)
  members   = rels.zip(rel_types)
  TypedStruct.new(*members)
end
196
+
197
# Comma-separated "<Relation> BY <field>" clauses, one per (relation, field)
# pair in +by+.
def self.join_by_clause(by)
  clauses = by.map { |rel, field| "#{rel.relationize} BY #{field}" }
  clauses.join(", ")
end
200
+
201
# Build a JOIN statement and register it under +lval+.
#
# * lval - handle for the resulting relation
# * by   - relation => field pairs to join on; a :parallel entry, if present,
#          is removed from +by+ and handed to parallelize!
#
# Returns the result of set.
def self.join(lval, by)
  n_parallel = by.delete(:parallel)
  cmd        = "JOIN " + join_by_clause(by)
  parallelize!(cmd, :parallel => n_parallel)
  joined_klass = klass_from_join(by)
  set(lval, new(joined_klass, lval, cmd))
end
209
+
210
+ end
211
+ end
212
+ end