wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,85 @@
1
+ require 'wukong/and_pig/generate/variable_inflections'
2
+
3
+ module Wukong
4
+ module AndPig
5
+
6
+ mattr_accessor :comments
7
+ self.comments = true
8
+ # send output to stdout or to captured pig instance
9
+ mattr_accessor :emit_dest
10
+ # full pathname to the pig executable
11
+ PIG_EXECUTABLE = '/usr/local/bin/pig'
12
+
13
# Close the handle used to talk to a captured pig process, if there is one.
#
# PigVar.pig_in_poke hands back $stdout when output is not being captured;
# that stream is deliberately left open so anything printed after +finish+
# still works. Handles that do not respond to +close+ are ignored.
#
# Returns nil.
def self.finish
  handle = PigVar.pig_in_poke
  # never close the process' own standard output
  return if handle.equal?($stdout) || handle.equal?(STDOUT)
  handle.close if handle.respond_to?(:close)
end
16
+
17
+ #
18
+ # All the embarrassing magick to pretend ruby symbols are pig relations
19
+ #
20
class PigVar

  #
  # Output a pig command.
  #
  # * cmd       - the command text
  # * semicolon - when true (the default) ' ;' is appended to the command
  #
  # When Wukong::AndPig.emit_dest is :captured the command is written to the
  # pig_in_poke handle, the handle is flushed, and one line read back from it
  # is printed. Otherwise the command is simply printed with puts.
  #
  def self.emit cmd, semicolon=true
    cmd = cmd + ' ;' if semicolon
    case Wukong::AndPig.emit_dest
    when :captured
      pig_in_poke.puts(cmd)
      pig_in_poke.flush
      puts pig_in_poke.gets
    else
      puts(cmd)
    end
  end

  #
  # Emit an assignment "relation = <rval.cmd>" (relation left-justified to
  # 23 columns) and return rval.
  #
  def self.emit_setter relation, rval
    emit "%-23s\t= %s" % [relation, rval.cmd]
    rval
  end

  #
  # Emit an imperative statement: the imperative left-justified to 14
  # columns followed by each remaining argument (to_s, right-justified to 14
  # columns, tab-separated). Returns the first of the remaining arguments.
  #
  def self.emit_imperative imperative, *rest
    cmd_part = "%-14s \t" % imperative
    arg_part = rest.map{|s| "%14s" % s.to_s }.join(" \t")
    emit cmd_part+arg_part
    rest.first
  end

  #
  # The memoized output handle.
  #
  # On first use: when Wukong::AndPig.emit_dest is :captured, a read/write
  # pipe to PIG_EXECUTABLE is opened (with sync enabled); otherwise the
  # handle is $stdout.
  #
  def self.pig_in_poke
    return @pig_in_poke if @pig_in_poke
    case Wukong::AndPig.emit_dest
    when :captured
      @pig_in_poke = IO.popen(PIG_EXECUTABLE, "w+")
      @pig_in_poke.sync = true
      @pig_in_poke
    else @pig_in_poke = $stdout
    end
  end

  #
  # Reset the captured pig instance
  #
  # Forgets the memoized handle and closes it on a best-effort basis: errors
  # raised by +close+ are swallowed. The handle is only closed when one was
  # actually opened (no pig process is started just to be closed), and
  # standard output is never closed. Returns nil.
  #
  def self.reset_pig_in_poke!
    handle, @pig_in_poke = @pig_in_poke, nil
    unless handle.nil? || handle.equal?($stdout) || handle.equal?(STDOUT)
      begin
        handle.close
      rescue StandardError
        # best effort: a handle that cannot be closed is simply dropped
      end
    end
    nil
  end

  #
  # Emit the assignment of this object to its relation
  # (see emit_setter); returns self.
  #
  def set!
    self.class.emit_setter(relation, self)
  end

  #
  # Emit a comment
  # skips if Wukong::AndPig.comments is false
  #
  # The text is rewritten so it is introduced by pig's "-- " comment marker
  # (an optional leading "# " is dropped) and is emitted without a
  # terminating semicolon.
  #
  def self.rem comment
    return unless Wukong::AndPig.comments
    PigVar.emit comment.gsub(/(^|\n)(#([\t ]|$))?/, "\n-- "), false
  end
end
81
+
82
+ end
83
+ end
84
+
85
+
@@ -0,0 +1,82 @@
1
# Adds String#relationize, which returns the result of calling camelize on
# the string. camelize is not defined in this file -- presumably an
# inflection helper loaded elsewhere; confirm.
+ String.class_eval do
2
+ #
3
+ # Generate relation name from a handle
4
+ #
5
+ def relationize() camelize end
6
+ end
7
# Adds Symbol#relationize: converts the symbol to a string and delegates to
# String#relationize.
+ Symbol.class_eval do
8
+ #
9
+ # Generate relation name from a handle
10
+ #
11
+ def relationize
12
+ to_s.relationize
13
+ end
14
+ end
15
+
16
# Adds two methods to every object:
#  * typify    - returns the receiver's class
#  * symbolize - to_s, then underscore, then everything up to and including
#                the last '/' is removed, and the result is made a symbol.
#                underscore is not defined in this file -- presumably an
#                inflection helper loaded elsewhere; confirm.
+ Object.class_eval do
17
+ def typify() self.class ; end
18
+
19
+ def symbolize
20
+ self.to_s.underscore.gsub(%r{.*/}, '').to_sym
21
+ end
22
+ end
23
+
24
# Pig type names for core Ruby classes, defined as singleton methods on the
# classes themselves (so subclasses inherit them).
#
# Bignum only gets its own 'long' mapping on rubies where it is a class
# distinct from Integer. Where Bignum is merely an alias of Integer the
# override would turn Integer.typify into 'long'; where the constant is gone
# entirely, referencing it would raise NameError while loading this file.
class << Integer ; def typify() 'int'       end ; end
if defined?(Bignum) && !Bignum.equal?(Integer)
  class << Bignum ; def typify() 'long'     end ; end
end
class << Float   ; def typify() 'float'     end ; end
class << String  ; def typify() 'chararray' end ; end
class << Symbol  ; def typify() self        end ; end
class << Date    ; def typify() 'long'      end ; end
30
+
31
+ # Array.class_eval do
32
+ # def typify()
33
+ # "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
34
+ # end
35
+ # end
36
+ # class Tuple
37
+ # attr_accessor :contents
38
+ # def initialize *args
39
+ # self.contents = args
40
+ # end
41
+ # def typify
42
+ # "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
43
+ # end
44
+ # #
45
+ # # Sugar for creating a new bag. The following are equivalent:
46
+ # #
47
+ # # Bag[:foo]
48
+ # # Bag.new :foo
49
+ # #
50
+ # def self.[] *args
51
+ # new *args
52
+ # end
53
+ # end
54
+
55
# Mixin that gives a class a pig type string. Including BagMethods extends
# the including class with ClassMethods (see self.included below).
# ClassMethods#typify pairs each entry of members with the matching entry of
# mtypes, renders each pair as "name: <mtype.typify>", and wraps the
# comma-separated list in "{ ... }". members and mtypes are not defined here;
# they must be supplied by the class being extended.
+ module BagMethods
56
+ module ClassMethods
57
+ #
58
+ # Pig type string --
59
+ # the pig type strings for each sub-element.
60
+ #
61
+ def typify
62
+ vars_str = members.zip(mtypes).map do |attr, mtype|
63
+ "%s: %s" % [attr, mtype.typify]
64
+ end
65
+ "{ #{vars_str.join(', ')} }"
66
+ end
67
+ end
68
+ def self.included base
69
+ base.extend ClassMethods
70
+ end
71
+ end
72
+
73
# Bag.new forwards its arguments to TypedStruct's new, then includes
# BagMethods into the object that comes back via class_eval. The method's
# return value is the value of that class_eval block, i.e. of the include
# call. Bag[...] is sugar for Bag.new(...).
# NOTE(review): presumably TypedStruct.new returns a freshly built struct
# class -- TypedStruct is not defined in this file; confirm.
+ class Bag < TypedStruct
74
+ def self.new *args
75
+ bag = super *args
76
+ bag.class_eval{ include BagMethods }
77
+ end
78
+ def self.[] *args
79
+ new *args
80
+ end
81
+ end
82
+
@@ -0,0 +1,51 @@
1
+
2
+
3
+ module Wukong
4
+ module AndPig
5
+
6
+ #
7
+ # Load the main class definitions
8
+ #
9
# Print the contents of init_load.pig, found under PIG_DEFS_DIR, to stdout.
#
# File.read is used so the file handle is closed as soon as the contents
# have been read, rather than being left open until garbage collection.
def self.init_load
  puts File.read(PIG_DEFS_DIR+"/init_load.pig")
end
12
+
13
+
14
+
15
+
16
+ #
17
+ # OK we're going to cheat here:
18
+ # just cat the file in, and treat it as a scalar
19
+ #
20
# Placeholder implementation: the path argument is ignored and the string
# "636" is always returned. The real lookup (reading the first line of the
# part files via hadoop dfs -cat) is the commented-out line below.
+ def load_scalar path
21
+ # var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
22
+ var = "636"
23
+ end
24
+
25
+
26
+
27
# Chains four operations: generate attr into a temp relation, distinct it
# (with :parallel fixed at 10), group the result by group_by, then foreach
# into dest_rel with "GENERATE COUNT(<distinct relation>.<attr>) AS n_<attr>".
# Returns whatever the final foreach call returns.
# generate, distinct, group, foreach and temp_rel are defined elsewhere.
# NOTE(review): group is called here with two arguments; confirm against the
# group definition actually in scope.
+ def count_distinct dest_rel, attr, group_by
28
+ distincted =
29
+ generate(temp_rel(dest_rel), attr).
30
+ distinct(temp_rel(dest_rel), :parallel => 10)
31
+ distincted.
32
+ group( temp_rel(dest_rel), group_by).
33
+ foreach( dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
34
+ end
35
+
36
+ #
37
+ # Group a relation into bins, and return the counts for each bin
38
+ # * dest_rel - Relation to store
39
+ # {bin,
40
+ #
41
# * dest_rel - passed to the final foreach as the destination relation
# * bin_attr - names the output columns: "<bin_attr>_bin" and
#              "<bin_attr>_count"
# * bin_expr - expression generated as the bin value; defaults to bin_attr
# Steps: foreach generates the bin column into a temp relation, the result is
# grouped :by that column, and a last foreach emits the group key plus
# COUNT(<binned relation>). Returns whatever that last foreach returns.
# foreach, group and temp_rel are defined elsewhere.
+ def histogram dest_rel, bin_attr, bin_expr=nil
42
+ bin_expr ||= bin_attr
43
+ bin_name = "#{bin_attr}_bin"
44
+ binned = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
45
+ binned. group( temp_rel(dest_rel), :by => bin_name).
46
+ foreach( dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
47
+ end
48
+
49
+
50
+ end
51
+ end
@@ -0,0 +1,8 @@
1
+ require 'wukong/and_pig/operators/evaluators'
2
+ require 'wukong/and_pig/operators/foreach'
3
+ require 'wukong/and_pig/operators/groupies'
4
+ require 'wukong/and_pig/operators/load_store'
5
+ require 'wukong/and_pig/operators/meta'
6
+ require 'wukong/and_pig/operators/relational'
7
+ require 'wukong/and_pig/operators/file_methods'
8
+ require 'wukong/and_pig/operators/compound'
@@ -0,0 +1,29 @@
1
+ #
2
+ # Compound operations built from several relational operators (see PigVar.isolate)
3
+ #
4
+ module Wukong
5
+ module AndPig
6
+ class PigVar
7
+ #
8
+ # Select all elements in the source relation that match on the selecting relation,
9
+ # creating a relation with the same type as the source relation.
10
+ #
11
+ # For example,
12
+ #
13
+ # PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
14
+ #
15
+ # returns a relation IsolatedCvals, whose type is identical to
16
+ # MyComplicatedValues' type, with only the elements having an id also
17
+ # present in MyIds.
18
+ #
19
+ #
20
# Steps: join `on` (by on_field) with `from` (by from_field) into an anonymous
# relation, generate lval from the join using the quoted `from` name (mapped
# to :rsrc) followed by every field of PV[from] as a [from, field] pair, then
# set the result's klass to from.klass and return the result.
# Note: the :parallel entry is removed from the options hash that was passed
# in (options.delete), so the caller's hash is modified.
# join, anon and PV are defined elsewhere.
+ def self.isolate lval, on, on_field, from, from_field, options={ }
21
+ joined = join anon(lval), on => on_field, from => from_field, :parallel => options.delete(:parallel)
22
+ isolated = joined.generate lval, { "'#{from}'" => :rsrc}, *PV[from].fields.map{|field| [from, field]}
23
+ isolated.klass = from.klass
24
+ isolated
25
+ end
26
+
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,7 @@
1
+ module Wukong
2
+ module AndPig
3
+ class PigVar
4
+
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,15 @@
1
+ module Wukong
2
+ module AndPig
3
+ class PigVar
4
+
5
+ # ===========================================================================
6
+ #
7
+ # STREAM
8
+ #
9
# Builds a "STREAM <relation>" command and hands it, with this object's
# klass, to new_in_chain. The options argument is accepted but not used.
# NOTE(review): generate and group in this codebase call new_in_chain with
# three arguments (lval, klass, cmd); only two are passed here -- confirm.
+ def stream options={}
10
+ new_in_chain klass, "STREAM #{relation}"
11
+ end
12
+ end
13
+ end
14
+ end
15
+
@@ -0,0 +1,29 @@
1
+ module Wukong
2
+ module AndPig
3
+ class PigVar
4
+ # ===========================================================================
5
+ #
6
+ # Pig expressions
7
+ #
8
+
9
+ #
10
# Emit a filesystem command line through the class-level emit: cmd
# left-justified to 23 columns, a tab and a space, then filename. The
# filename is written as given, without surrounding quotes.
+ def dfs cmd, filename
11
+ # note == no '' on path
12
+ self.class.emit "%-23s\t %s" % [cmd, filename]
13
+ end
14
+ #
15
+ # remove the stored file
16
+ #
17
# Emit an "rmf <filename>" command via dfs.
+ def rmf! filename
18
+ dfs :rmf, filename
19
+ end
20
+
21
+ #
22
+ #
23
+ #
24
# Emit a "mkdir <filename>" command via dfs.
+ def mkdir filename
25
+ dfs :mkdir, filename
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,98 @@
1
+ #
2
+ # The FOREACH relational operator
3
+ #
4
+ module Wukong
5
+ module AndPig
6
+ class PigVar
7
+
8
+ # ===========================================================================
9
+ #
10
+ # FOREACH
11
+ #
12
# Build a "FOREACH <relation> GENERATE ..." command.
# * lval        - passed through to new_in_chain as the first argument
# * field_specs - each is turned into one or more clauses by
#                 parse_gen_clause; the results are flattened
# The new relation's type is a TypedStruct built from each clause's
# name_type, and the clauses are joined with ",\n " in the command text.
# Returns whatever new_in_chain returns.
+ def generate lval, *field_specs
13
+ gen_clauses = field_specs.map{|field_spec| parse_gen_clause(field_spec)}.flatten
14
+ l_klass = TypedStruct.new(* gen_clauses.map(&:name_type))
15
+ l_cmd = "FOREACH #{self.relation} GENERATE\n #{gen_clauses.join(",\n ")}"
16
+ new_in_chain(lval, l_klass, l_cmd)
17
+ end
18
+
19
+ #
20
+ # for a list of GENERATE args, we need
21
+ #
22
+ # * gen_clauses, the clause to stuff into the GENERATE line
23
+ # gen_expr AS gen_field_name: gen_field_type
24
+ #
25
+ # * new_types, the resulting types for each
26
+ #
27
+ # gen_expr common cases include
28
+ #
29
+ # field
30
+ # Rel::field
31
+ # Rel.(field)
32
+ # "ComplicatedExpression"
33
+ #
34
+ #
35
+ # field_attrs
36
+ #
37
+ #
38
# Turn one GENERATE field spec into AS clause(s).
#
# field_spec may be:
# * an AS      - returned unchanged
# * a Symbol   - the field keeps its name; its type comes from field_type
# * an Array   - [alias, field, name, type]; name defaults to the field, and
#                type defaults to alias.field_type(field) when not supplied.
#                The clause is qualified with alias.relationize.
# * a Hash     - { field_in => field_out, ... }; returns an Array with one
#                AS per pair, typed via field_type(field_in)
#
# Raises a RuntimeError naming the offending spec for anything else.
def parse_gen_clause field_spec
  case field_spec
  when AS
    field_spec
  when Symbol
    AS[field_spec, field_spec, field_type(field_spec)]
  when Array
    alias_in, field_in, name, type = field_spec
    name ||= field_in
    # only look the type up when the spec did not carry an explicit one
    type ||= alias_in.field_type(field_in)
    AS[field_in, name, type, alias_in.relationize]
  when Hash
    field_spec.map do |field_in, field_out|
      AS[field_in, field_out, field_type(field_in)]
    end
  else raise "Don't know how to specify type for #{field_spec.inspect}"
  end
end
56
+ end
57
+ end
58
+ end
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+ # # when Array
67
+ # # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
68
+ # # field_expr, field_attr, field_type = field_spec
69
+ # # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
70
+ # # gen_clauses << "#{field_expr} AS #{field_as}"
71
+ # # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
72
+
73
+ # def prelimify *field_specs
74
+ # gen_clauses = []
75
+ # field_attrs = []
76
+ # field_specs.map do |field_spec|
77
+ # unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
78
+ # field_expr, field_attr = field_spec
79
+ # gen_clauses << "#{field_expr}"
80
+ # field_attrs += [field_attr].flatten
81
+ # end
82
+ # [ gen_clauses, field_attrs ]
83
+ # end
84
+ #
85
+ # # def generate *args
86
+ # # gen_clauses, field_attrs = self.class.fieldify *args
87
+ # # l_klass = TypedStruct.new(*field_attrs)
88
+ # # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
89
+ # # end
90
+ #
91
+ # def foreach *args
92
+ # generate_clause = args.pop
93
+ # prelim_exprs, prelim_attrs = prelimify *args
94
+ # prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
95
+ # gen_clauses, field_attrs = fieldify *generate_clause
96
+ # l_klass = TypedStruct.new(*field_attrs)
97
+ # new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
98
+ # end
@@ -0,0 +1,212 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # The GROUP, COGROUP and JOIN relational operators
4
+ #
5
+ module Wukong
6
+ module AndPig
7
+ class PigVar
8
+
9
+ #===========================================================================
10
+ #
11
+ # GROUP and COGROUP
12
+ #
13
+
14
+ #
15
+ # COGROUP - Groups the data in two or more relations.
16
+ #
17
+ # == Syntax
18
+ #
19
+ # alias = COGROUP alias1 BY field_alias [INNER | OUTER],
20
+ # aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
21
+ #
22
+ # == Structure
23
+ #
24
+ # { group, <structure of alias1>, <structure of alias2>, ... }
25
+ #
26
+ # == Terms
27
+ #
28
+ # * alias The name of a relation.
29
+ #
30
+ # * field_alias The name of one or more fields in a relation. If multiple
31
+ # fields are specified, separate with commas and enclose
32
+ # in parentheses. For example, X = COGROUP A BY (f1, f2);
33
+ #
34
+ # The number of fields specified in each BY clause must
35
+ # match. For example, X = COGROUP A BY (a1,a2,a3), B BY
36
+ # (b1,b2,b3);
37
+ #
38
+ # * BY Keyword.
39
+ #
40
+ # * INNER Eliminate NULLs on that grouping
41
+ # * OUTER Do not eliminate NULLs on that grouping (default)
42
+ #
43
+ # * PARALLEL n -- Increase the parallelism of a job by specifying the
44
+ # number of reduce tasks, n. The optimal number of
45
+ # parallel tasks depends on the amount of memory on each
46
+ # node and the memory required by each of the tasks. To
47
+ # determine n, use the following as a general guideline:
48
+ #
49
+ # n = (nr_nodes - 1) * 0.45 * nr_GB
50
+ #
51
+ # where nr_nodes is the number of nodes used and nr_GB is
52
+ # the amount of physical memory on each node.
53
+ #
54
+ # Note the following:
55
+ # - Parallel only affects the number of reduce tasks. Map
56
+ # parallelism is determined by the input file, one map
57
+ # for each HDFS block.
58
+ # - If you don’t specify parallel, you still get the same
59
+ # map parallelism but only one reduce task.
60
+ #
61
+ # == Usage
62
+ #
63
+ # The COGROUP operator groups the data in two or more relations based on
64
+ # the common field values.
65
+ #
66
+ # Note: The COGROUP and JOIN operators perform similar functions. COGROUP
67
+ # creates a nested set of output tuples while JOIN creates a flat set of
68
+ # output tuples with NULLs eliminated.
69
+ #
70
+ # == Examples
71
+ #
72
+ # Suppose we have two relations, A and B.
73
+ #
74
+ # A: (owner:chararray, pet:chararray)
75
+ # ---------------
76
+ # (Alice, cat)
77
+ # (Alice, goldfish)
78
+ # (Alice, turtle)
79
+ # (Bob, cat)
80
+ # (Bob, dog)
81
+ #
82
+ # B: (friend1:chararray, friend2:chararray)
83
+ # ---------------------
84
+ # (Cindy, Alice)
85
+ # (Mark, Alice)
86
+ # (Paul, Bob)
87
+ # (Paul, Jane)
88
+ #
89
+ # In this example tuples are co-grouped using field “owner” from relation
90
+ # A and field “friend2” from relation B as the key fields. The DESCRIBE
91
+ # operator shows the schema for relation X, which has three fields, "group",
92
+ # "A" and "B" (for an explanation, see GROUP).
93
+ #
94
+ # X = COGROUP A BY owner, B BY friend2;
95
+ # DESCRIBE X;
96
+ #
97
+ # X: {group: chararray,
98
+ # A: {owner: chararray,pet: chararray},
99
+ # B: {friend1: chararray,friend2: chararray}}
100
+ #
101
+ # Relation X looks like this. A tuple is created for each unique key
102
+ # field. The tuple includes the key field and two bags. The first bag is
103
+ # the tuples from the first relation with the matching key field. The
104
+ # second bag is the tuples from the second relation with the matching key
105
+ # field. If no tuples match the key field, the bag is empty.
106
+ #
107
+ # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
108
+ # {(Cindy, Alice), (Mark, Alice)})
109
+ # (Bob, {(Bob, dog), (Bob, cat)},
110
+ # {(Paul, Bob)})
111
+ # (Jane, {},
112
+ # {(Paul, Jane)})
113
+ #
114
+ # In this example tuples are co-grouped and the INNER keyword is used to
115
+ # ensure that only bags with at least one tuple are returned.
116
+ #
117
+ # X = COGROUP A BY owner INNER, B BY friend2 INNER;
118
+ #
119
+ # Relation X looks like this.
120
+ #
121
+ # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
122
+ # {(Cindy, Alice), (Mark, Alice)})
123
+ # (Bob, {(Bob, dog), (Bob, cat)},
124
+ # {(Paul, Bob)})
125
+ #
126
+ # In this example tuples are co-grouped and the INNER keyword is used
127
+ # asymmetrically on only one of the relations.
128
+ #
129
+ # X = COGROUP A BY owner, B BY friend2 INNER;
130
+ #
131
+ # Relation X looks like this.
132
+ #
133
+ # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
134
+ # {(Cindy, Alice), (Mark, Alice)})
135
+ # (Bob, {(Bob, dog), (Bob, cat)},
136
+ # {(Paul, Bob)})
137
+ # (Jane, {},
138
+ # {(Paul, Jane)})
139
+ #
140
+ #
141
# Build a "GROUP <relation> <by clause>" command for this relation.
# The result type comes from l_klass_for_group, the BY text from the
# class-level make_by_clause, and the new relation is created through
# new_in_chain with the value of anon as its first argument. Returns
# whatever new_in_chain returns.
+ def group group_by
142
+ l_klass = l_klass_for_group group_by
143
+ by_clause = self.class.make_by_clause(group_by)
144
+ new_in_chain anon, l_klass, "GROUP #{relation} #{by_clause}"
145
+ end
146
+
147
# Render the grouping part of a GROUP / COGROUP statement.
#
# by_spec may be:
# * an Array  - field names; several fields are comma-separated and enclosed
#               in parentheses ("BY (f1, f2)"), as pig requires for
#               multi-field keys; a single field is rendered bare ("BY f1")
# * :all      - "ALL"
# * a Symbol  - "BY <symbol>"
# * a String  - used verbatim (the caller supplies the whole clause)
# * a Hash    - its :by entry is rendered by the rules above
#
# Raises a RuntimeError for anything else (including a Hash without :by).
def self.make_by_clause by_spec
  case by_spec
  when Array
    fields = by_spec.join(", ")
    by_spec.length > 1 ? "BY (#{fields})" : "BY #{fields}"
  when :all   then 'ALL'
  when Symbol then "BY #{by_spec}"
  when String then by_spec
  when Hash   then make_by_clause(by_spec[:by])
  else raise "Don't know how to group on #{by_spec.inspect}"
  end
end
157
# Look up the entry for field in this relation's klass.members_types.
+ def types_for_fields field
158
+ klass.members_types[field]
159
+ end
160
# Instance-level convenience: delegates to the class-level
# l_klass_for_group with this relation as the only relation.
+ def l_klass_for_group group_by
161
+ self.class.l_klass_for_group group_by, self
162
+ end
163
# Build the TypedStruct describing a grouped result: a :group member typed
# by the first relation's types_for_fields(group_by), followed by one
# [relation, klass] member per relation in rels.
+ def self.l_klass_for_group group_by, *rels
164
+ TypedStruct.new(
165
+ [:group, rels.first.types_for_fields(group_by)],
166
+ *rels.map{|rel| [rel.relation, rel.klass] }
167
+ )
168
+ end
169
+
170
+ #
171
+ # COGROUP pig expression:
172
+ # UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
173
+ #
174
# Each entry of by is destructured as (relation, group_by, as); the third
# element is not used. The clause text is "<relation> <by clause>" for each
# entry, joined with ", ". The result type is built from the first entry's
# group_by and the first element of every entry. A new object is created
# with (l_klass, lval, "COGROUP ...") and handed to set together with lval;
# the return value is whatever set returns (set is defined elsewhere).
+ def self.cogroup lval, *by
175
+ by_clause = by.map do |relation, group_by, as|
176
+ "%s %s" % [relation.relation, make_by_clause(group_by)]
177
+ end.join(", ")
178
+ l_klass = l_klass_for_group by[0][1], *by.map(&:first)
179
+ rval = new l_klass, lval, "COGROUP #{by_clause}"
180
+ set lval, rval
181
+ end
182
+
183
# Instance-level wrapper around the class-level cogroup.
# NOTE(review): self is passed in the position the class method names lval,
# with args becoming the by list -- confirm this is the intended call shape.
+ def cogroup *args
184
+ self.class.cogroup self, *args
185
+ end
186
+
187
+
188
+ # ===========================================================================
189
+ #
190
+ # JOIN
191
+ #
192
# Build the TypedStruct for a join result: takes the first element of each
# entry in by, and pairs each one with the value of its klass method.
+ def self.klass_from_join by
193
+ klasses = by.map(&:first)
194
+ TypedStruct.new(*klasses.zip(klasses.map(&:klass)))
195
+ end
196
+
197
# Render each (rel, field) entry of by as "<rel.relationize> BY <field>" and
# join the pieces with ", ".
+ def self.join_by_clause by
198
+ by.map{|rel, field| "#{rel.relationize} BY #{field}" }.join(", ")
199
+ end
200
+
201
# JOIN pig expression.
#
# * lval - the relation to assign the join to
# * by   - hash of relation => join field, plus an optional :parallel entry
#
# The :parallel entry is split off from a copy of +by+, so the hash passed
# in by the caller is left untouched. The command text is "JOIN " followed
# by join_by_clause of the remaining entries; parallelize! is then given the
# command and the :parallel value. A new object is created with
# (l_klass, lval, cmd) and handed to set together with lval; the return
# value is whatever set returns.
def self.join lval, by
  by       = by.dup
  parallel = by.delete(:parallel)
  cmd = "JOIN " + join_by_clause(by)
  parallelize! cmd, :parallel => parallel
  l_klass = klass_from_join(by)
  rval = new(l_klass, lval, cmd)
  set lval, rval
end
209
+
210
+ end
211
+ end
212
+ end