wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
require 'wukong/and_pig/generate/variable_inflections'
|
|
2
|
+
|
|
3
|
+
module Wukong
|
|
4
|
+
module AndPig
|
|
5
|
+
|
|
6
|
+
mattr_accessor :comments
|
|
7
|
+
self.comments = true
|
|
8
|
+
# send output to stdout or to captured pig instance
|
|
9
|
+
mattr_accessor :emit_dest
|
|
10
|
+
# full pathname to the pig executable
|
|
11
|
+
PIG_EXECUTABLE = '/usr/local/bin/pig'
|
|
12
|
+
|
|
13
|
+
# Close the channel that PigVar writes pig commands to, if it can be closed.
#
# NOTE(review): when emit_dest is not :captured, PigVar.pig_in_poke hands
# back $stdout, so this closes standard output -- confirm that is intended.
def self.finish
  channel = PigVar.pig_in_poke
  channel.close if channel.respond_to?(:close)
end
|
|
16
|
+
|
|
17
|
+
#
|
|
18
|
+
# All the embarrassing magick to pretend ruby symbols are pig relations
|
|
19
|
+
#
|
|
20
|
+
class PigVar
|
|
21
|
+
|
|
22
|
+
# Output a command
|
|
23
|
+
# Output a single pig command.
#
# cmd       - the command text
# semicolon - when true (the default) ' ;' is appended to the command
#
# With emit_dest == :captured the command is written to the captured pig
# process and one line of its reply is echoed to stdout; otherwise the
# command itself is printed to stdout.
def self.emit cmd, semicolon=true
  statement = semicolon ? cmd + ' ;' : cmd
  return puts(statement) unless Wukong::AndPig.emit_dest == :captured
  pipe = pig_in_poke
  pipe.puts(statement)
  pipe.flush
  puts pipe.gets
end
|
|
34
|
+
|
|
35
|
+
# generate the code
|
|
36
|
+
# Emit the pig assignment "relation = <rval.cmd>" (relation name padded to
# 23 columns) and hand back rval so calls can be chained.
def self.emit_setter relation, rval
  emit format("%-23s\t= %s", relation, rval.cmd)
  rval
end
|
|
40
|
+
|
|
41
|
+
# generate the code
|
|
42
|
+
# Emit an imperative pig statement: the imperative left-justified in 14
# columns, followed by each remaining argument right-justified in 14
# columns. Returns the first of the remaining arguments.
def self.emit_imperative imperative, *rest
  padded_args = rest.map { |arg| arg.to_s.rjust(14) }
  emit(("%-14s \t" % imperative) + padded_args.join(" \t"))
  rest.first
end
|
|
48
|
+
|
|
49
|
+
# The IO object pig commands are written to, created on first use and
# memoized afterwards.
#
# With emit_dest == :captured this is a read/write pipe to a freshly spawned
# PIG_EXECUTABLE (put in sync mode); for any other destination it is $stdout.
def self.pig_in_poke
  @pig_in_poke ||= begin
    if Wukong::AndPig.emit_dest == :captured
      pipe = IO.popen(PIG_EXECUTABLE, "w+")
      pipe.sync = true
      pipe
    else
      $stdout
    end
  end
end
|
|
59
|
+
|
|
60
|
+
#
|
|
61
|
+
# Reset the captured pig instance
|
|
62
|
+
#
|
|
63
|
+
# Forget the current pig channel so the next pig_in_poke call builds a new one.
#
# Only a channel that was actually created is closed: the previous version
# went through pig_in_poke, which lazily created (or spawned) a channel just
# to close it. Standard output is never closed, since emit keeps printing to
# it when output is not captured. A close failure (IOError) is ignored; the
# earlier `rescue nil` form raised a TypeError instead of swallowing it.
#
# Returns nil.
def self.reset_pig_in_poke!
  channel = @pig_in_poke
  @pig_in_poke = nil
  return nil if channel.nil? || channel.equal?($stdout) || channel.equal?(STDOUT)
  begin
    channel.close
  rescue IOError
    nil # best effort: an already-broken channel is simply dropped
  end
  nil
end
|
|
67
|
+
|
|
68
|
+
# Emit the assignment of this object's command to its own relation name,
# by delegating to the class-level emit_setter. Returns what emit_setter returns.
def set!
  owner = self.class
  owner.emit_setter relation, self
end
|
|
71
|
+
|
|
72
|
+
#
|
|
73
|
+
# Emit a comment
|
|
74
|
+
# skips if Wukong::AndPig.comments is false
|
|
75
|
+
#
|
|
76
|
+
# Emit a comment into the pig output.
#
# Each line start of the text (together with a leading "#" marker, if
# present) is rewritten to a newline followed by pig's "-- " comment prefix;
# the result is emitted without a trailing semicolon.
# Does nothing (returns nil) when Wukong::AndPig.comments is false.
def self.rem comment
  if Wukong::AndPig.comments
    line_start = /(^|\n)(#([\t ]|$))?/
    PigVar.emit(comment.gsub(line_start, "\n-- "), false)
  end
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
class String
  #
  # Relation name for this handle: the camelized form of the string.
  #
  def relationize
    camelize
  end
end
|
|
7
|
+
class Symbol
  #
  # Relation name for this handle; delegates to String#relationize.
  #
  def relationize() to_s.relationize end
end
|
|
15
|
+
|
|
16
|
+
class Object
  # Default type of any object: its own class.
  def typify
    self.class
  end

  # Symbol handle for this object: its string form, underscored, with
  # everything up to the last "/" removed.
  def symbolize
    underscored = self.to_s.underscore
    underscored.gsub(%r{.*/}, '').to_sym
  end
end
|
|
23
|
+
|
|
24
|
+
# Pig type names for the core ruby classes, exposed as Klass.typify.
class << Integer ; def typify() 'int' end ; end
# Bignum only exists as a distinct class on old rubies. On newer ones it is
# either an alias of Integer (where redefining typify would silently turn
# Integer into 'long') or missing altogether (where naming it raises
# NameError), so it is only patched when it is a class of its own.
legacy_bignum = Object.const_defined?(:Bignum) ? Object.const_get(:Bignum) : nil
if legacy_bignum && !legacy_bignum.equal?(Integer)
  class << legacy_bignum ; def typify() 'long' end ; end
end
class << Float ; def typify() 'float' end ; end
class << String ; def typify() 'chararray' end ; end
class << Symbol ; def typify() self end ; end
# Date lives in the standard library and is not loaded by default.
require 'date'
class << Date ; def typify() 'long' end ; end
|
|
30
|
+
|
|
31
|
+
# Array.class_eval do
|
|
32
|
+
# def typify()
|
|
33
|
+
# "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
|
|
34
|
+
# end
|
|
35
|
+
# end
|
|
36
|
+
# class Tuple
|
|
37
|
+
# attr_accessor :contents
|
|
38
|
+
# def initialize *args
|
|
39
|
+
# self.contents = args
|
|
40
|
+
# end
|
|
41
|
+
# def typify
|
|
42
|
+
# "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
|
|
43
|
+
# end
|
|
44
|
+
# #
|
|
45
|
+
# # Sugar for creating a new bag. The following are equivalent:
|
|
46
|
+
# #
|
|
47
|
+
# # Bag[:foo]
|
|
48
|
+
# # Bag.new :foo
|
|
49
|
+
# #
|
|
50
|
+
# def self.[] *args
|
|
51
|
+
# new *args
|
|
52
|
+
# end
|
|
53
|
+
# end
|
|
54
|
+
|
|
55
|
+
module BagMethods
  # Including BagMethods adds ClassMethods to the including class itself.
  def self.included base
    base.extend ClassMethods
  end

  module ClassMethods
    #
    # Pig type string --
    # "{ name: type, ... }" built from each member paired with the pig type
    # string of its entry in mtypes.
    #
    def typify
      described = members.zip(mtypes).map { |attr, mtype| "#{attr}: #{mtype.typify}" }
      "{ #{described.join(', ')} }"
    end
  end
end
|
|
72
|
+
|
|
73
|
+
# A TypedStruct whose generated classes also get the BagMethods mixin.
class Bag < TypedStruct
  # Build the struct class via TypedStruct.new, then mix BagMethods into it.
  # The value returned is whatever the include inside class_eval evaluates to
  # (presumably the new class itself -- confirm against TypedStruct).
  def self.new *args
    built = super(*args)
    built.class_eval { include BagMethods }
  end

  # Sugar: Bag[...] is the same as Bag.new(...).
  def self.[] *args
    new(*args)
  end
end
|
|
82
|
+
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
module Wukong
|
|
4
|
+
module AndPig
|
|
5
|
+
|
|
6
|
+
#
|
|
7
|
+
# Load the main class definitions
|
|
8
|
+
#
|
|
9
|
+
#
# Print the contents of init_load.pig (found under PIG_DEFS_DIR) to stdout.
#
# The file is read with File.read so its handle is closed right away,
# instead of being left open until garbage collection.
#
def self.init_load
  puts File.read(PIG_DEFS_DIR+"/init_load.pig")
end
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
#
|
|
17
|
+
# OK we're going to cheat here:
|
|
18
|
+
# just cat the file in, and treat it as a scalar
|
|
19
|
+
#
|
|
20
|
+
#
# Placeholder: the real lookup is disabled, and a fixed value is returned
# no matter which path is given.
#
def load_scalar path
  # var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
  "636"
end
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
#
# Chain generate -> distinct -> group -> foreach so that dest_rel holds
# COUNT of the distinct attr values (named n_<attr>), grouped by group_by.
# Each intermediate step is stored under temp_rel(dest_rel); the distinct
# step is run with :parallel => 10.
#
def count_distinct dest_rel, attr, group_by
  projected  = generate(temp_rel(dest_rel), attr)
  distincted = projected.distinct(temp_rel(dest_rel), :parallel => 10)
  grouped    = distincted.group(temp_rel(dest_rel), group_by)
  grouped.foreach(dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
end
|
|
35
|
+
|
|
36
|
+
#
|
|
37
|
+
# Group a relation into bins, and return the counts for each bin
|
|
38
|
+
# * dest_rel - Relation to store
|
|
39
|
+
# {bin,
|
|
40
|
+
#
|
|
41
|
+
#
# Group a relation into bins and count the members of each bin.
# * dest_rel - relation that receives the (bin, count) result
# * bin_attr - attribute the bin and count columns are named after
# * bin_expr - expression that computes the bin; defaults to bin_attr
#
def histogram dest_rel, bin_attr, bin_expr=nil
  bin_expr = bin_attr unless bin_expr
  bin_name = "#{bin_attr}_bin"
  binned  = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
  grouped = binned.group(temp_rel(dest_rel), :by => bin_name)
  grouped.foreach(dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
end
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
require 'wukong/and_pig/operators/evaluators'
|
|
2
|
+
require 'wukong/and_pig/operators/foreach'
|
|
3
|
+
require 'wukong/and_pig/operators/groupies'
|
|
4
|
+
require 'wukong/and_pig/operators/load_store'
|
|
5
|
+
require 'wukong/and_pig/operators/meta'
|
|
6
|
+
require 'wukong/and_pig/operators/relational'
|
|
7
|
+
require 'wukong/and_pig/operators/file_methods'
|
|
8
|
+
require 'wukong/and_pig/operators/compound'
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Compound operators, built on top of the basic relational operators
|
|
3
|
+
#
|
|
4
|
+
module Wukong
|
|
5
|
+
module AndPig
|
|
6
|
+
class PigVar
|
|
7
|
+
#
|
|
8
|
+
# Select all elements in the source relation that match on the selecting relation,
|
|
9
|
+
# creating a relation with the same type as the source relation.
|
|
10
|
+
#
|
|
11
|
+
# For example,
|
|
12
|
+
#
|
|
13
|
+
# PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
|
|
14
|
+
#
|
|
15
|
+
# returns a relation IsolatedCvals, whose type is identical to
|
|
16
|
+
# MyComplicatedValues' type, with only the elements having an id also
|
|
17
|
+
# present in MyIds.
|
|
18
|
+
#
|
|
19
|
+
#
|
|
20
|
+
# Join +on+ (by on_field) with +from+ (by from_field), then generate into
# lval a source tag ('<from>' AS rsrc) plus every field of +from+. The
# result is given the same klass as +from+.
#
# options - only :parallel is used; it is forwarded to the join. The hash
#           is read, not modified, so a caller can reuse it afterwards.
def self.isolate lval, on, on_field, from, from_field, options={ }
  joined = join anon(lval), on => on_field, from => from_field, :parallel => options[:parallel]
  isolated = joined.generate lval, { "'#{from}'" => :rsrc}, *PV[from].fields.map{|field| [from, field]}
  isolated.klass = from.klass
  isolated
end
|
|
26
|
+
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module Wukong
|
|
2
|
+
module AndPig
|
|
3
|
+
class PigVar
|
|
4
|
+
# ===========================================================================
|
|
5
|
+
#
|
|
6
|
+
# Pig expressions
|
|
7
|
+
#
|
|
8
|
+
|
|
9
|
+
#
|
|
10
|
+
# Emit a filesystem command followed by the file name it applies to.
# The command is padded to 23 columns; the path is deliberately written
# without surrounding quotes.
def dfs cmd, filename
  line = format("%-23s\t %s", cmd, filename)
  self.class.emit line
end
|
|
14
|
+
#
|
|
15
|
+
# remove the stored file
|
|
16
|
+
#
|
|
17
|
+
# Emit an "rmf" command for filename (removes the stored file).
def rmf!(filename)
  dfs(:rmf, filename)
end
|
|
20
|
+
|
|
21
|
+
#
|
|
22
|
+
#
|
|
23
|
+
#
|
|
24
|
+
# Emit a "mkdir" command for filename.
def mkdir(filename)
  dfs(:mkdir, filename)
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#
|
|
2
|
+
# The FOREACH relational operator
|
|
3
|
+
#
|
|
4
|
+
module Wukong
|
|
5
|
+
module AndPig
|
|
6
|
+
class PigVar
|
|
7
|
+
|
|
8
|
+
# ===========================================================================
|
|
9
|
+
#
|
|
10
|
+
# FOREACH
|
|
11
|
+
#
|
|
12
|
+
# Build a FOREACH ... GENERATE over this relation.
#
# lval        - name of the resulting relation
# field_specs - anything parse_gen_clause accepts; the parsed clauses are
#               flattened into one list
#
# The result type is a TypedStruct made from each clause's name_type, and
# the new relation is created through new_in_chain.
def generate lval, *field_specs
  gen_clauses = field_specs.map { |spec| parse_gen_clause(spec) }.flatten
  member_types = gen_clauses.map { |clause| clause.name_type }
  l_klass = TypedStruct.new(*member_types)
  clause_list = gen_clauses.join(",\n ")
  new_in_chain(lval, l_klass, "FOREACH #{self.relation} GENERATE\n #{clause_list}")
end
|
|
18
|
+
|
|
19
|
+
#
|
|
20
|
+
# for a list of GENERATE args, we need
|
|
21
|
+
#
|
|
22
|
+
# * gen_clauses, the clause to stuff into the GENERATE line
|
|
23
|
+
# gen_expr AS gen_field_name: gen_field_type
|
|
24
|
+
#
|
|
25
|
+
# * new_types, the resulting types for each
|
|
26
|
+
#
|
|
27
|
+
# gen_expr common cases include
|
|
28
|
+
#
|
|
29
|
+
# field
|
|
30
|
+
# Rel::field
|
|
31
|
+
# Rel.(field)
|
|
32
|
+
# "ComplicatedExpression"
|
|
33
|
+
#
|
|
34
|
+
#
|
|
35
|
+
# field_attrs
|
|
36
|
+
#
|
|
37
|
+
#
|
|
38
|
+
# Turn one GENERATE field spec into an AS clause (or a list of them).
#
# field_spec may be
# * an AS      - returned untouched
# * a Symbol   - the field under its own name, typed via field_type
# * an Array   - [alias_in, field_in, name]: field_in taken from alias_in,
#                output as name (defaults to field_in), typed by
#                alias_in.field_type
# * a Hash     - { field_in => field_out, ... }, giving one AS per pair
#
# Raises RuntimeError for any other kind of spec.
def parse_gen_clause field_spec
  case field_spec
  when AS
    field_spec
  when Symbol
    AS[field_spec, field_spec, field_type(field_spec)]
  when Array
    # NOTE(review): a fourth array element (a type) is accepted but ignored;
    # the type always comes from alias_in -- confirm that is intended.
    alias_in, field_in, name, _type = field_spec
    name ||= field_in
    type = alias_in.field_type(field_in)
    AS[field_in, name, type, alias_in.relationize]
  when Hash
    field_spec.map do |field_in, field_out|
      AS[field_in, field_out, field_type(field_in)]
    end
  else
    # was `field_specs`, an undefined name that raised NameError instead
    raise "Don't know how to specify type for #{field_spec.inspect}"
  end
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# # when Array
|
|
67
|
+
# # unless [2,3].include?(field_spec.length) then raise "Complex fields must be (field_spec, as_name) or (field_spec, as_name, as_type)" end
|
|
68
|
+
# # field_expr, field_attr, field_type = field_spec
|
|
69
|
+
# # field_as = field_attr.is_a?(Array) ? "(#{field_attr.join(", ")})" : field_attr
|
|
70
|
+
# # gen_clauses << "#{field_expr} AS #{field_as}"
|
|
71
|
+
# # field_attrs << [field_attr, field_type || klass.members_types[field_expr]]
|
|
72
|
+
|
|
73
|
+
# def prelimify *field_specs
|
|
74
|
+
# gen_clauses = []
|
|
75
|
+
# field_attrs = []
|
|
76
|
+
# field_specs.map do |field_spec|
|
|
77
|
+
# unless field_spec.length == 2 then raise "Complex fields must be a pair (field_spec, as_name)" end
|
|
78
|
+
# field_expr, field_attr = field_spec
|
|
79
|
+
# gen_clauses << "#{field_expr}"
|
|
80
|
+
# field_attrs += [field_attr].flatten
|
|
81
|
+
# end
|
|
82
|
+
# [ gen_clauses, field_attrs ]
|
|
83
|
+
# end
|
|
84
|
+
#
|
|
85
|
+
# # def generate *args
|
|
86
|
+
# # gen_clauses, field_attrs = self.class.fieldify *args
|
|
87
|
+
# # l_klass = TypedStruct.new(*field_attrs)
|
|
88
|
+
# # new_in_chain l_klass, "FOREACH #{relation} GENERATE\n #{gen_clauses.join(",\n ")}"
|
|
89
|
+
# # end
|
|
90
|
+
#
|
|
91
|
+
# def foreach *args
|
|
92
|
+
# generate_clause = args.pop
|
|
93
|
+
# prelim_exprs, prelim_attrs = prelimify *args
|
|
94
|
+
# prelims = prelim_exprs.zip(prelim_attrs).map{|e,a| "#{a} = #{e}" }.join(";\n ")+";"
|
|
95
|
+
# gen_clauses, field_attrs = fieldify *generate_clause
|
|
96
|
+
# l_klass = TypedStruct.new(*field_attrs)
|
|
97
|
+
# new_in_chain l_klass, %Q{FOREACH #{relation} {\n #{prelims}\n GENERATE\n #{gen_clauses.join(",\n ")} ; } }
|
|
98
|
+
# end
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# The GROUP, COGROUP and JOIN relational operators
|
|
4
|
+
#
|
|
5
|
+
module Wukong
|
|
6
|
+
module AndPig
|
|
7
|
+
class PigVar
|
|
8
|
+
|
|
9
|
+
#===========================================================================
|
|
10
|
+
#
|
|
11
|
+
# GROUP and COGROUP
|
|
12
|
+
#
|
|
13
|
+
|
|
14
|
+
#
|
|
15
|
+
# COGROUP - Groups the data in two or more relations.
|
|
16
|
+
#
|
|
17
|
+
# == Syntax
|
|
18
|
+
#
|
|
19
|
+
# alias = COGROUP alias1 BY field_alias [INNER | OUTER],
|
|
20
|
+
# aliasN BY field_alias [INNER | OUTER] [PARALLEL n] ;
|
|
21
|
+
#
|
|
22
|
+
# == Structure
|
|
23
|
+
#
|
|
24
|
+
# { group, <structure of alias1>, <structure of alias2>, ... }
|
|
25
|
+
#
|
|
26
|
+
# == Terms
|
|
27
|
+
#
|
|
28
|
+
# * alias The name of a relation.
|
|
29
|
+
#
|
|
30
|
+
# * field_alias The name of one or more fields in a relation. If multiple
|
|
31
|
+
# fields are specified, separate with commas and enclose
|
|
32
|
+
# in parentheses. For example, X = COGROUP A BY (f1, f2);
|
|
33
|
+
#
|
|
34
|
+
# The number of fields specified in each BY clause must
|
|
35
|
+
# match. For example, X = COGROUP A BY (a1,a2,a3), B BY
|
|
36
|
+
# (b1,b2,b3);
|
|
37
|
+
#
|
|
38
|
+
# * BY Keyword.
|
|
39
|
+
#
|
|
40
|
+
# * INNER Eliminate NULLs on that grouping
|
|
41
|
+
# * OUTER Do not eliminate NULLs on that grouping (default)
|
|
42
|
+
#
|
|
43
|
+
# * PARALLEL n -- Increase the parallelism of a job by specifying the
|
|
44
|
+
# number of reduce tasks, n. The optimal number of
|
|
45
|
+
# parallel tasks depends on the amount of memory on each
|
|
46
|
+
# node and the memory required by each of the tasks. To
|
|
47
|
+
# determine n, use the following as a general guideline:
|
|
48
|
+
#
|
|
49
|
+
# n = (nr_nodes - 1) * 0.45 * nr_GB
|
|
50
|
+
#
|
|
51
|
+
# where nr_nodes is the number of nodes used and nr_GB is
|
|
52
|
+
# the amount of physical memory on each node.
|
|
53
|
+
#
|
|
54
|
+
# Note the following:
|
|
55
|
+
# - Parallel only affects the number of reduce tasks. Map
|
|
56
|
+
# parallelism is determined by the input file, one map
|
|
57
|
+
# for each HDFS block.
|
|
58
|
+
# - If you don’t specify parallel, you still get the same
|
|
59
|
+
# map parallelism but only one reduce task.
|
|
60
|
+
#
|
|
61
|
+
# == Usage
|
|
62
|
+
#
|
|
63
|
+
# The COGROUP operator groups the data in two or more relations based on
|
|
64
|
+
# the common field values.
|
|
65
|
+
#
|
|
66
|
+
# Note: The COGROUP and JOIN operators perform similar functions. COGROUP
|
|
67
|
+
# creates a nested set of output tuples while JOIN creates a flat set of
|
|
68
|
+
# output tuples with NULLs eliminated.
|
|
69
|
+
#
|
|
70
|
+
# == Examples
|
|
71
|
+
#
|
|
72
|
+
# Suppose we have two relations, A and B.
|
|
73
|
+
#
|
|
74
|
+
# A: (owner:chararray, pet:chararray)
|
|
75
|
+
# ---------------
|
|
76
|
+
# (Alice, cat)
|
|
77
|
+
# (Alice, goldfish)
|
|
78
|
+
# (Alice, turtle)
|
|
79
|
+
# (Bob, cat)
|
|
80
|
+
# (Bob, dog)
|
|
81
|
+
#
|
|
82
|
+
# B: (friend1:chararray, friend2:chararray)
|
|
83
|
+
# ---------------------
|
|
84
|
+
# (Cindy, Alice)
|
|
85
|
+
# (Mark, Alice)
|
|
86
|
+
# (Paul, Bob)
|
|
87
|
+
# (Paul, Jane)
|
|
88
|
+
#
|
|
89
|
+
# In this example tuples are co-grouped using field “owner” from relation
|
|
90
|
+
# A and field “friend2” from relation B as the key fields. The DESCRIBE
|
|
91
|
+
# operator shows the schema for relation X, which has two fields, "group"
|
|
92
|
+
# and "A" (for an explanation, see GROUP).
|
|
93
|
+
#
|
|
94
|
+
# X = COGROUP A BY owner, B BY friend2;
|
|
95
|
+
# DESCRIBE X;
|
|
96
|
+
#
|
|
97
|
+
# X: {group: chararray,
|
|
98
|
+
# A: {owner: chararray,pet: chararray},
|
|
99
|
+
# B: {friend1: chararray,friend2: chararray}}
|
|
100
|
+
#
|
|
101
|
+
# Relation X looks like this. A tuple is created for each unique key
|
|
102
|
+
# field. The tuple includes the key field and two bags. The first bag is
|
|
103
|
+
# the tuples from the first relation with the matching key field. The
|
|
104
|
+
# second bag is the tuples from the second relation with the matching key
|
|
105
|
+
# field. If no tuples match the key field, the bag is empty.
|
|
106
|
+
#
|
|
107
|
+
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
|
108
|
+
# {(Cindy, Alice), (Mark, Alice)})
|
|
109
|
+
# (Bob, {(Bob, dog), (Bob, cat)},
|
|
110
|
+
# {(Paul, Bob)})
|
|
111
|
+
# (Jane, {},
|
|
112
|
+
# {(Paul, Jane)})
|
|
113
|
+
#
|
|
114
|
+
# In this example tuples are co-grouped and the INNER keyword is used to
|
|
115
|
+
# ensure that only bags with at least one tuple are returned.
|
|
116
|
+
#
|
|
117
|
+
# X = COGROUP A BY owner INNER, B BY friend2 INNER;
|
|
118
|
+
#
|
|
119
|
+
# Relation X looks like this.
|
|
120
|
+
#
|
|
121
|
+
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
|
122
|
+
# {(Cindy, Alice), (Mark, Alice)})
|
|
123
|
+
# (Bob, {(Bob, dog), (Bob, cat)},
|
|
124
|
+
# {(Paul, Bob)})
|
|
125
|
+
#
|
|
126
|
+
# In this example tuples are co-grouped and the INNER keyword is used
|
|
127
|
+
# asymmetrically on only one of the relations.
|
|
128
|
+
#
|
|
129
|
+
# X = COGROUP A BY owner, B BY friend2 INNER;
|
|
130
|
+
#
|
|
131
|
+
# Relation X looks like this.
|
|
132
|
+
#
|
|
133
|
+
# (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)},
|
|
134
|
+
# {(Cindy, Alice), (Mark, Alice)})
|
|
135
|
+
# (Bob, {(Bob, dog), (Bob, cat)},
|
|
136
|
+
# {(Paul, Bob)})
|
|
137
|
+
# (Jane, {},
|
|
138
|
+
# {(Paul, Jane)})
|
|
139
|
+
#
|
|
140
|
+
#
|
|
141
|
+
# GROUP this relation.
#
# group_by - any spec understood by make_by_clause
#
# The result goes into an anonymous relation whose type comes from
# l_klass_for_group.
def group group_by
  l_klass = l_klass_for_group(group_by)
  by_clause = self.class.make_by_clause(group_by)
  statement = "GROUP #{relation} #{by_clause}"
  new_in_chain(anon, l_klass, statement)
end
|
|
146
|
+
|
|
147
|
+
# Build the grouping clause of a GROUP / COGROUP statement.
#
# by_spec may be
# * an Array   - field names; several fields are wrapped in parentheses
#                ("BY (f1, f2)") as pig requires, a single one is left bare
# * :all       - 'ALL'
# * a Symbol   - "BY <field>"
# * a String   - used verbatim, so it must already be a complete clause
# * a Hash     - its :by entry is converted by the rules above
#
# Raises RuntimeError for anything else.
def self.make_by_clause by_spec
  case by_spec
  when Array
    fields = by_spec.join(", ")
    by_spec.size > 1 ? "BY (#{fields})" : 'BY ' + fields
  when :all then 'ALL' # must precede the generic Symbol branch
  when Symbol then "BY #{by_spec}"
  when String then by_spec
  when Hash then make_by_clause(by_spec[:by])
  else raise "Don't know how to group on #{by_spec.inspect}"
  end
end
|
|
157
|
+
# Look up the entry for +field+ in this relation's klass.members_types.
def types_for_fields(field)
  member_types = klass.members_types
  member_types[field]
end
|
|
160
|
+
# Result type for grouping this relation by group_by; delegates to the
# class-level l_klass_for_group with self as the only relation.
def l_klass_for_group(group_by)
  self.class.l_klass_for_group(group_by, self)
end
|
|
163
|
+
# Result type of a GROUP / COGROUP over rels: a TypedStruct whose first
# member is :group (typed by the first relation's types_for_fields for
# group_by), followed by one member per relation, named after the relation
# and typed by its klass.
def self.l_klass_for_group group_by, *rels
  group_member = [:group, rels.first.types_for_fields(group_by)]
  relation_members = rels.map { |rel| [rel.relation, rel.klass] }
  TypedStruct.new(group_member, *relation_members)
end
|
|
169
|
+
|
|
170
|
+
#
|
|
171
|
+
# COGROUP pig expression:
|
|
172
|
+
# UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
|
|
173
|
+
#
|
|
174
|
+
#
# COGROUP pig expression:
# UserPosts = COGROUP Posts BY user_id, Users BY user_id ;
#
# lval - name of the resulting relation
# by   - one [relation, group_by] entry per relation taking part; a third
#        element of an entry is accepted but not used here
#
def self.cogroup lval, *by
  clauses = by.map do |relation, group_by, _as|
    "#{relation.relation} #{make_by_clause(group_by)}"
  end
  l_klass = l_klass_for_group(by[0][1], *by.map(&:first))
  rval = new(l_klass, lval, "COGROUP #{clauses.join(', ')}")
  set lval, rval
end
|
|
182
|
+
|
|
183
|
+
# Instance-level COGROUP: delegates to the class-level cogroup, passing
# self as the first argument followed by args.
def cogroup(*args)
  self.class.cogroup(self, *args)
end
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ===========================================================================
|
|
189
|
+
#
|
|
190
|
+
# JOIN
|
|
191
|
+
#
|
|
192
|
+
# Result type of a JOIN: a TypedStruct with one member per joined relation,
# named by the relation handle and typed by that handle's klass.
#
# by - pairs whose first element is the relation handle
def self.klass_from_join by
  members = by.map(&:first).map { |rel| [rel, rel.klass] }
  TypedStruct.new(*members)
end
|
|
196
|
+
|
|
197
|
+
# Text after JOIN: "<Relation> BY <field>" for every (relation, field) pair
# in +by+, separated by ", ". Relation names come from relationize.
def self.join_by_clause by
  clauses = by.map do |rel, field|
    format("%s BY %s", rel.relationize, field)
  end
  clauses.join(", ")
end
|
|
200
|
+
|
|
201
|
+
# JOIN pig expression.
#
# lval - name of the resulting relation
# by   - hash of relation => field; a :parallel entry, if present, is taken
#        out of the hash (the hash is modified) and handed to parallelize!
def self.join lval, by
  reducers = by.delete(:parallel)
  statement = "JOIN " + join_by_clause(by)
  parallelize!(statement, :parallel => reducers)
  rval = new(klass_from_join(by), lval, statement)
  set lval, rval
end
|
|
209
|
+
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
end
|