mrflip-wukong 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +202 -0
- data/README-tutorial.textile +163 -0
- data/README.textile +165 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/UsingWukong-part1.textile +2 -0
- data/doc/UsingWukong-part2.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-setup.textile +21 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +65 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +112 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +40 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +39 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +20 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +173 -0
- metadata +208 -0
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'active_support'
|
3
|
+
|
4
|
+
String.class_eval do
  #
  # Generate relation name from a handle: the camelized form of the string.
  # +camelize+ is not defined in this block; it is expected to come from the
  # ActiveSupport string extensions.
  #
  def relationize
    camelize
  end
end
|
10
|
+
Symbol.class_eval do
  #
  # Generate relation name from a handle, by delegating to
  # String#relationize on the symbol's string form.
  #
  def relationize
    to_s.relationize
  end
end
|
18
|
+
|
19
|
+
Object.class_eval do
  #
  # Default type of any object: its class.
  #
  def typify
    self.class
  end

  #
  # Symbol built from the object's string form: underscored, with everything
  # up to and including the last "/" stripped. +underscore+ is not defined in
  # this block; it is expected to come from the ActiveSupport string
  # extensions.
  #
  def symbolize
    to_s.underscore.gsub(%r{.*/}, '').to_sym
  end
end
|
26
|
+
|
27
|
+
# Pig type names for core Ruby classes, defined as singleton methods on the
# classes themselves (so that e.g. Integer.typify gives 'int').
class << Integer ; def typify() 'int' end ; end
# Bignum only gets its own 'long' mapping where it is a distinct class. On
# Rubies where Bignum is merely an alias of Integer, reopening it would
# overwrite the 'int' mapping above; where the constant no longer exists,
# referencing it would raise NameError.
if defined?(Bignum) && !Bignum.equal?(Integer)
  class << Bignum ; def typify() 'long' end ; end
end
class << Float ; def typify() 'float' end ; end
class << String ; def typify() 'chararray' end ; end
# Symbol maps to itself (the class), not to a pig type string.
class << Symbol ; def typify() self end ; end
class << Date ; def typify() 'long' end ; end
|
33
|
+
|
34
|
+
# Array.class_eval do
|
35
|
+
# def typify()
|
36
|
+
# "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
|
37
|
+
# end
|
38
|
+
# end
|
39
|
+
# class Tuple
|
40
|
+
# attr_accessor :contents
|
41
|
+
# def initialize *args
|
42
|
+
# self.contents = args
|
43
|
+
# end
|
44
|
+
# def typify
|
45
|
+
# "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
|
46
|
+
# end
|
47
|
+
# #
|
48
|
+
# # Sugar for creating a new bag. The following are equivalent:
|
49
|
+
# #
|
50
|
+
# # Bag[:foo]
|
51
|
+
# # Bag.new :foo
|
52
|
+
# #
|
53
|
+
# def self.[] *args
|
54
|
+
# new *args
|
55
|
+
# end
|
56
|
+
# end
|
57
|
+
|
58
|
+
#
# Mixin that gives a class a class-level +typify+. Including the module
# extends the including class with ClassMethods.
#
module BagMethods
  module ClassMethods
    #
    # Pig type string --
    # "{ attr: type, ... }", pairing each entry of +members+ with the pig type
    # string (+typify+) of the matching entry of +mtypes+. Both +members+ and
    # +mtypes+ must be provided by the extended class.
    #
    def typify
      vars = members.zip(mtypes).map { |attr, mtype| "#{attr}: #{mtype.typify}" }
      "{ #{vars.join(', ')} }"
    end
  end

  # Hook run on +include+: adds the ClassMethods to the including class.
  def self.included base
    base.extend ClassMethods
  end
end
|
75
|
+
|
76
|
+
#
# TypedStruct variant whose generated classes also include BagMethods.
#
class Bag < TypedStruct
  #
  # Build the result via TypedStruct.new, include BagMethods into it, and
  # return it.
  # NOTE(review): assumes TypedStruct.new returns a class/module (it must
  # respond to class_eval) -- confirm against TypedStruct.
  #
  def self.new *args
    bag = super(*args)
    bag.class_eval { include BagMethods }
    bag
  end

  #
  # Sugar for creating a new bag. The following are equivalent:
  #
  #   Bag[:foo]
  #   Bag.new :foo
  #
  def self.[] *args
    new(*args)
  end
end
|
85
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
module AndPig
|
5
|
+
|
6
|
+
#
|
7
|
+
# Load the main class definitions
|
8
|
+
#
|
9
|
+
def self.init_load
  # File.read opens, reads and closes the file in one call, so no file
  # handle is left open (File.open(...).read never closed it).
  puts File.read(PIG_DEFS_DIR+"/init_load.pig")
end
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
#
|
17
|
+
# OK we're going to cheat here:
|
18
|
+
# just cat the file in, and treat it as a scalar
|
19
|
+
#
|
20
|
+
def load_scalar path
  # Stub: the real lookup below is disabled, so +path+ is currently ignored
  # and a fixed placeholder value is returned.
  # var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
  "636"
end
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
#
# Chain generate -> distinct -> group -> foreach to produce, in +dest_rel+,
# COUNT(<distinct relation>.<attr>) AS n_<attr>. The intermediate steps are
# all stored under temp_rel(dest_rel).
#
# NOTE(review): +group+ is called here with two arguments (a relation and
# the grouping spec) -- confirm this matches the signature of the +group+
# operator actually mixed in.
#
def count_distinct dest_rel, attr, group_by
  distincted = generate(temp_rel(dest_rel), attr).
    distinct(temp_rel(dest_rel), :parallel => 10)
  grouped = distincted.group(temp_rel(dest_rel), group_by)
  grouped.foreach(dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
end
|
35
|
+
|
36
|
+
#
# Group a relation into bins, and return the counts for each bin
#
# * dest_rel - Relation to store; the final FOREACH generates
#              {<bin_attr>_bin, <bin_attr>_count}
# * bin_attr - attribute the output field names are derived from
# * bin_expr - expression generated as the bin value; defaults to bin_attr
#
def histogram dest_rel, bin_attr, bin_expr=nil
  bin_expr ||= bin_attr
  bin_name = "#{bin_attr}_bin"
  binned = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
  grouped = binned.group(temp_rel(dest_rel), :by => bin_name)
  grouped.foreach(dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
end
|
48
|
+
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require 'wukong/and_pig/operators/evaluators'
|
2
|
+
require 'wukong/and_pig/operators/foreach'
|
3
|
+
require 'wukong/and_pig/operators/groupies'
|
4
|
+
require 'wukong/and_pig/operators/load_store'
|
5
|
+
require 'wukong/and_pig/operators/meta'
|
6
|
+
require 'wukong/and_pig/operators/relational'
|
7
|
+
require 'wukong/and_pig/operators/file_methods'
|
8
|
+
require 'wukong/and_pig/operators/compound'
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#
|
2
|
+
# The FOREACH relational operator
|
3
|
+
#
|
4
|
+
module Wukong
|
5
|
+
module AndPig
|
6
|
+
class PigVar
|
7
|
+
#
|
8
|
+
# Select all elements in the source relation that match on the selecting relation,
|
9
|
+
# creating a relation with the same type as the source relation.
|
10
|
+
#
|
11
|
+
# For example,
|
12
|
+
#
|
13
|
+
# PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
|
14
|
+
#
|
15
|
+
# returns a relation IsolatedCvals, whose type is identical to
|
16
|
+
# MyComplicatedValues' type, with only the elements having an id also
|
17
|
+
# present in MyIds.
|
18
|
+
#
|
19
|
+
#
|
20
|
+
def self.isolate lval, on, on_field, from, from_field, options={ }
  # Join +on+ and +from+ on their respective fields into an anonymous
  # relation. :parallel is read (not deleted) from +options+ so the
  # caller's hash is left untouched.
  joined = join anon(lval), on => on_field, from => from_field, :parallel => options[:parallel]
  # Project the literal source name (as :rsrc) plus every field of +from+.
  from_fields = PV[from].fields.map{|field| [from, field]}
  isolated = joined.generate lval, { "'#{from}'" => :rsrc}, *from_fields
  # The result is given the same klass as +from+.
  isolated.klass = from.klass
  isolated
end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Wukong
|
2
|
+
module AndPig
|
3
|
+
class PigVar
|
4
|
+
# ===========================================================================
|
5
|
+
#
|
6
|
+
# Pig expressions
|
7
|
+
#
|
8
|
+
|
9
|
+
#
# Emit a filesystem command line through the class-level +emit+: the command
# left-justified in a 23-character column, then a tab, a space and the
# filename.
#
def dfs cmd, filename
  # note == no '' on path
  self.class.emit "%-23s\t %s" % [cmd, filename]
end
|
14
|
+
#
# remove the stored file: issues the :rmf command for +filename+ via +dfs+
#
def rmf! filename
  dfs :rmf, filename
end
|
20
|
+
|
21
|
+
#
# make a directory: issues the :mkdir command for +filename+ via +dfs+
#
def mkdir filename
  dfs :mkdir, filename
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
#
|
2
|
+
# The FOREACH relational operator
|
3
|
+
#
|
4
|
+
module Wukong
|
5
|
+
module AndPig
|
6
|
+
class PigVar
|
7
|
+
|
8
|
+
# ===========================================================================
|
9
|
+
#
|
10
|
+
# FOREACH
|
11
|
+
#
|
12
|
+
def generate lval, *field_specs
  # Each spec parses to one clause or a list of clauses; flatten to one list.
  gen_clauses = field_specs.map{|field_spec| parse_gen_clause(field_spec)}.flatten
  # The result type is built from each clause's name_type.
  l_klass = TypedStruct.new(* gen_clauses.map(&:name_type))
  l_cmd = "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
  new_in_chain(lval, l_klass, l_cmd)
end
|
18
|
+
|
19
|
+
#
# for a list of GENERATE args, we need
#
# * gen_clauses, the clause to stuff into the GENERATE line
#     gen_expr AS gen_field_name: gen_field_type
#
# * new_types, the resulting types for each
#
# gen_expr common cases include
#
#     field
#     Rel::field
#     Rel.(field)
#     "ComplicatedExpression"
#
# Accepted forms of +field_spec+:
#
# * AS     - returned unchanged
# * Symbol - AS[field, field, field_type(field)]
# * Array  - [alias_in, field_in, name]; name defaults to field_in, and the
#            type is looked up on alias_in
# * Hash   - { field_in => field_out, ... }; returns an Array with one AS
#            per pair
#
# Raises a RuntimeError for any other kind of spec.
#
def parse_gen_clause field_spec
  case field_spec
  when AS
    field_spec
  when Symbol
    AS[field_spec, field_spec, field_type(field_spec)]
  when Array
    # NOTE(review): a fourth element (an explicit type) in the spec is
    # ignored -- the type is always looked up on alias_in. Confirm whether
    # an explicit type was meant to take precedence.
    alias_in, field_in, name = field_spec
    name ||= field_in
    type = alias_in.field_type(field_in)
    AS[field_in, name, type, alias_in.relationize]
  when Hash
    field_spec.map do |field_in, field_out|
      AS[field_in, field_out, field_type(field_in)]
    end
  else
    # Report the offending spec (the parameter is field_spec, singular).
    raise "Don't know how to specify type for #{field_spec.inspect}"
  end
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
# # when Array
|
67
|
+
# # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
|
68
|
+
# # field_expr, field_attr, field_type = field_spec
|
69
|
+
# # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
|
70
|
+
# # gen_clauses << "#{field_expr} AS #{field_as}"
|
71
|
+
# # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
|
72
|
+
|
73
|
+
# def prelimify *field_specs
|
74
|
+
# gen_clauses = []
|
75
|
+
# field_attrs = []
|
76
|
+
# field_specs.map do |field_spec|
|
77
|
+
# unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
|
78
|
+
# field_expr, field_attr = field_spec
|
79
|
+
# gen_clauses << "#{field_expr}"
|
80
|
+
# field_attrs += [field_attr].flatten
|
81
|
+
# end
|
82
|
+
# [ gen_clauses, field_attrs ]
|
83
|
+
# end
|
84
|
+
#
|
85
|
+
# # def generate *args
|
86
|
+
# # gen_clauses, field_attrs = self.class.fieldify *args
|
87
|
+
# # l_klass = TypedStruct.new(*field_attrs)
|
88
|
+
# # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
|
89
|
+
# # end
|
90
|
+
#
|
91
|
+
# def foreach *args
|
92
|
+
# generate_clause = args.pop
|
93
|
+
# prelim_exprs, prelim_attrs = prelimify *args
|
94
|
+
# prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
|
95
|
+
# gen_clauses, field_attrs = fieldify *generate_clause
|
96
|
+
# l_klass = TypedStruct.new(*field_attrs)
|
97
|
+
# new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
|
98
|
+
# end
|
@@ -0,0 +1,212 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
# The GROUP, COGROUP and JOIN relational operators
|
4
|
+
#
|
5
|
+
module Wukong
|
6
|
+
module AndPig
|
7
|
+
class PigVar
|
8
|
+
|
9
|
+
#===========================================================================
|
10
|
+
#
|
11
|
+
# GROUP and COGROUP
|
12
|
+
#
|
13
|
+
|
14
|
+
#
|
15
|
+
# COGROUP - Groups the data in two or more relations.
|
16
|
+
#
|
17
|
+
# == Syntax
|
18
|
+
#
|
19
|
+
# alias = COGROUP alias1 BY field_alias [INNER | OUTER],
|
20
|
+
# aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
|
21
|
+
#
|
22
|
+
# == Structure
|
23
|
+
#
|
24
|
+
# { group, <structure of alias1>, <structure of alias2>, ... }
|
25
|
+
#
|
26
|
+
# == Terms
|
27
|
+
#
|
28
|
+
# * alias The name of a relation.
|
29
|
+
#
|
30
|
+
# * field_alias The name of one or more fields in a relation. If multiple
|
31
|
+
# fields are specified, separate with commas and enclose
|
32
|
+
# in parentheses. For example, X = COGROUP A BY (f1, f2);
|
33
|
+
#
|
34
|
+
# The number of fields specified in each BY clause must
|
35
|
+
# match. For example, X = COGROUP A BY (a1,a2,a3), B BY
|
36
|
+
# (b1,b2,b3);
|
37
|
+
#
|
38
|
+
# * BY Keyword.
|
39
|
+
#
|
40
|
+
# * INNER Eliminate NULLs on that grouping
|
41
|
+
# * OUTER Do not eliminate NULLs on that grouping (default)
|
42
|
+
#
|
43
|
+
# * PARALLEL n -- Increase the parallelism of a job by specifying the
|
44
|
+
# number of reduce tasks, n. The optimal number of
|
45
|
+
# parallel tasks depends on the amount of memory on each
|
46
|
+
# node and the memory required by each of the tasks. To
|
47
|
+
# determine n, use the following as a general guideline:
|
48
|
+
#
|
49
|
+
# n = (nr_nodes - 1) * 0.45 * nr_GB
|
50
|
+
#
|
51
|
+
# where nr_nodes is the number of nodes used and nr_GB is
|
52
|
+
# the amount of physical memory on each node.
|
53
|
+
#
|
54
|
+
# Note the following:
|
55
|
+
# - Parallel only affects the number of reduce tasks. Map
|
56
|
+
# parallelism is determined by the input file, one map
|
57
|
+
# for each HDFS block.
|
58
|
+
# - If you don’t specify parallel, you still get the same
|
59
|
+
# map parallelism but only one reduce task.
|
60
|
+
#
|
61
|
+
# == Usage
|
62
|
+
#
|
63
|
+
# The COGROUP operator groups the data in two or more relations based on
|
64
|
+
# the common field values.
|
65
|
+
#
|
66
|
+
# Note: The COGROUP and JOIN operators perform similar functions. COGROUP
|
67
|
+
# creates a nested set of output tuples while JOIN creates a flat set of
|
68
|
+
# output tuples with NULLs eliminated.
|
69
|
+
#
|
70
|
+
# == Examples
|
71
|
+
#
|
72
|
+
# Suppose we have two relations, A and B.
|
73
|
+
#
|
74
|
+
# A: (owner:chararray, pet:chararray)
|
75
|
+
# ---------------
|
76
|
+
# (Alice, cat)
|
77
|
+
# (Alice, goldfish)
|
78
|
+
# (Alice, turtle)
|
79
|
+
# (Bob, cat)
|
80
|
+
# (Bob, dog)
|
81
|
+
#
|
82
|
+
# B: (friend1:chararray, friend2:chararray)
|
83
|
+
# ---------------------
|
84
|
+
# (Cindy, Alice)
|
85
|
+
# (Mark, Alice)
|
86
|
+
# (Paul, Bob)
|
87
|
+
# (Paul, Jane)
|
88
|
+
#
|
89
|
+
# In this example tuples are co-grouped using field “owner” from relation
|
90
|
+
# A and field “friend2” from relation B as the key fields. The DESCRIBE
|
91
|
+
# operator shows the schema for relation X, which has three fields, "group",
|
92
|
+
# "A" and "B" (for an explanation, see GROUP).
|
93
|
+
#
|
94
|
+
# X = COGROUP A BY owner, B BY friend2;
|
95
|
+
# DESCRIBE X;
|
96
|
+
#
|
97
|
+
# X: {group: chararray,
|
98
|
+
# A: {owner: chararray,pet: chararray},
|
99
|
+
# B: {friend1: chararray,friend2: chararray}}
|
100
|
+
#
|
101
|
+
# Relation X looks like this. A tuple is created for each unique key
|
102
|
+
# field. The tuple includes the key field and two bags. The first bag is
|
103
|
+
# the tuples from the first relation with the matching key field. The
|
104
|
+
# second bag is the tuples from the second relation with the matching key
|
105
|
+
# field. If no tuples match the key field, the bag is empty.
|
106
|
+
#
|
107
|
+
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
108
|
+
# {(Cindy, Alice), (Mark, Alice)})
|
109
|
+
# (Bob, {(Bob, dog), (Bob, cat)},
|
110
|
+
# {(Paul, Bob)})
|
111
|
+
# (Jane, {},
|
112
|
+
# {(Paul, Jane)})
|
113
|
+
#
|
114
|
+
# In this example tuples are co-grouped and the INNER keyword is used to
|
115
|
+
# ensure that only bags with at least one tuple are returned.
|
116
|
+
#
|
117
|
+
# X = COGROUP A BY owner INNER, B BY friend2 INNER;
|
118
|
+
#
|
119
|
+
# Relation X looks like this.
|
120
|
+
#
|
121
|
+
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
122
|
+
# {(Cindy, Alice), (Mark, Alice)})
|
123
|
+
# (Bob, {(Bob, dog), (Bob, cat)},
|
124
|
+
# {(Paul, Bob)})
|
125
|
+
#
|
126
|
+
# In this example tuples are co-grouped and the INNER keyword is used
|
127
|
+
# asymmetrically on only one of the relations.
|
128
|
+
#
|
129
|
+
# X = COGROUP A BY owner, B BY friend2 INNER;
|
130
|
+
#
|
131
|
+
# Relation X looks like this.
|
132
|
+
#
|
133
|
+
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
134
|
+
# {(Cindy, Alice), (Mark, Alice)})
|
135
|
+
# (Bob, {(Bob, dog), (Bob, cat)},
|
136
|
+
# {(Paul, Bob)})
|
137
|
+
# (Jane, {},
|
138
|
+
# {(Paul, Jane)})
|
139
|
+
#
|
140
|
+
#
|
141
|
+
#
# GROUP this relation by +group_by+ into a new anonymous relation.
# +group_by+ is turned into the BY clause by the class-level make_by_clause,
# and the result type comes from l_klass_for_group.
#
def group group_by
  l_klass = l_klass_for_group group_by
  by_clause = self.class.make_by_clause(group_by)
  new_in_chain anon, l_klass, "GROUP #{relation} #{by_clause}"
end
|
146
|
+
|
147
|
+
#
# Build the grouping clause for a GROUP / COGROUP statement.
#
# * Array  - "BY f" for one field; "BY (f1, f2, ...)" for several, since
#            multiple fields must be comma-separated and enclosed in
#            parentheses
# * :all   - "ALL"
# * Symbol - "BY <symbol>"
# * String - used verbatim
# * Hash   - the clause for its :by entry
#
# Raises a RuntimeError for anything else.
#
def self.make_by_clause by_spec
  case by_spec
  when Array
    fields = by_spec.join(", ")
    by_spec.length > 1 ? "BY (#{fields})" : "BY #{fields}"
  when :all then 'ALL' # must precede the generic Symbol case
  when Symbol then "BY #{by_spec}"
  when String then by_spec
  when Hash then make_by_clause(by_spec[:by])
  else raise "Don't know how to group on #{by_spec.inspect}"
  end
end
|
157
|
+
# Look up the entry for +field+ in this relation's klass.members_types.
def types_for_fields field
  klass.members_types[field]
end
|
160
|
+
# Result type for grouping this relation alone: delegates to the class-level
# l_klass_for_group with self as the only relation.
def l_klass_for_group group_by
  self.class.l_klass_for_group group_by, self
end
|
163
|
+
#
# Result type for a GROUP / COGROUP over +rels+: a TypedStruct with a :group
# member (typed from the first relation's types_for_fields(group_by)),
# followed by one [relation, klass] member per relation.
#
def self.l_klass_for_group group_by, *rels
  group_member = [:group, rels.first.types_for_fields(group_by)]
  rel_members = rels.map{|rel| [rel.relation, rel.klass] }
  TypedStruct.new(group_member, *rel_members)
end
|
169
|
+
|
170
|
+
#
# COGROUP pig expression:
#   UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
#
# Each entry of +by+ is [relation, group_by]; a third element, if given, is
# not used here. The result type is built from the first entry's group_by
# and all the relations; the new relation is created and registered under
# +lval+ via +set+.
#
def self.cogroup lval, *by
  by_clause = by.map do |relation, group_by, _as|
    "%s %s" % [relation.relation, make_by_clause(group_by)]
  end.join(", ")
  l_klass = l_klass_for_group by[0][1], *by.map(&:first)
  rval = new l_klass, lval, "COGROUP #{by_clause}"
  set lval, rval
end
|
182
|
+
|
183
|
+
# Instance-level convenience: calls the class-level cogroup with self as the
# first argument, followed by +args+.
def cogroup *args
  self.class.cogroup self, *args
end
|
186
|
+
|
187
|
+
|
188
|
+
# ===========================================================================
|
189
|
+
#
|
190
|
+
# JOIN
|
191
|
+
#
|
192
|
+
#
# Result type for a JOIN: a TypedStruct with one [relation, relation.klass]
# member per entry of +by+ (the first element of each entry is the relation).
#
def self.klass_from_join by
  rels = by.map(&:first)
  TypedStruct.new(*rels.zip(rels.map(&:klass)))
end
|
196
|
+
|
197
|
+
# Build the "Rel BY field, Rel2 BY field2" part of a JOIN statement; each
# relation is named by its +relationize+.
def self.join_by_clause by
  clauses = by.map{|rel, field| "#{rel.relationize} BY #{field}" }
  clauses.join(", ")
end
|
200
|
+
|
201
|
+
#
# JOIN the relations in +by+ (relation => field pairs) into +lval+.
# An optional :parallel entry in +by+ is split off and handed to
# +parallelize!+ together with the command string.
#
def self.join lval, by
  # Work on a copy so the caller's hash keeps its :parallel entry.
  by = by.dup
  parallel = by.delete(:parallel)
  cmd = "JOIN " + join_by_clause(by)
  parallelize! cmd, :parallel => parallel
  l_klass = klass_from_join(by)
  rval = new(l_klass, lval, cmd)
  set lval, rval
end
|
209
|
+
|
210
|
+
end
|
211
|
+
end
|
212
|
+
end
|