wukong 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
data/examples/run_all.sh
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# Drives the wukong example scripts through the hdp-* helper commands.
#
# Most stages are currently commented out; only the count_keys stage is live.
# That stage reads the word_count output directory, so the word_count stage
# (commented out below) must already have produced it.
#

src_path="tmp/README.textile"
out_root="tmp/test"
# Expanded unquoted further down on purpose, so it splits into two flags.
hdp_opts="--map_tasks=1 --reduce_tasks=1"

# ---------------------------------------------------------------------------
#
# Set up directories and copy over sample input
#

# hdp-rm ${src_path}
# hdp-put `dirname $0`/../README.textile tmp/
# hdp-mkdir $out_root

# ---------------------------------------------------------------------------
#
# Run scripts
#

cmd="word_count"
# hdp-rm -r ${out_root}/${cmd}
# ./examples/${cmd}.rb --run $hdp_opts $src_path ${out_root}/${cmd}
# hdp-catd ${out_root}/${cmd} | head -n 20
word_count="${out_root}/${cmd}"

cmd="sample_records"
# hdp-rm -r ${out_root}/${cmd}
# ./examples/${cmd}.rb --sampling_fraction=0.8 \
#   --run $hdp_opts $src_path ${out_root}/${cmd}
# hdp-catd ${out_root}/${cmd} | head -n 200 | tail -n 20
sample_records="${out_root}/${cmd}"


# cmd="size"
# hdp-rm -r ${out_root}/${cmd}
# ./examples/${cmd}.rb --run $hdp_opts $src_path ${out_root}/${cmd}
# hdp-catd ${out_root}/${cmd}
# size=${out_root}/${cmd}


cmd="count_keys"
# Clear any previous output, run the job on the word_count results, then
# show a 20-line slice of what it produced.
hdp-rm -r "${out_root}/${cmd}"
"./examples/${cmd}.rb" --run $hdp_opts "${word_count}" "${out_root}/${cmd}"
hdp-catd "${out_root}/${cmd}" | head -n 200 | tail -n 20
count_keys="${out_root}/${cmd}"
|
|
47
|
+
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
# Probabilistically emit some fraction of record/lines
|
|
7
|
+
#
|
|
8
|
+
# Set the sampling fraction at the command line using the
|
|
9
|
+
# --sampling_fraction=
|
|
10
|
+
# option: for example, to take a random 1/1000th of the lines in huge_files,
|
|
11
|
+
# ./examples/sample_records.rb --sampling_fraction=0.001 --go huge_files sampled_files
|
|
12
|
+
#
|
|
13
|
+
#
# Line-oriented mapper that keeps a random subset of its input.
# NOTE(review): Wukong::Streamer::Filter is presumably what consults +emit?+
# for each line -- confirm against the mixin.
#
class Mapper < Wukong::Streamer::LineStreamer
  include Wukong::Streamer::Filter

  #
  # Fraction of lines to keep, as a float: at 1 every line is emitted,
  # at 0 none are.
  #
  # Read from the mandatory --sampling_fraction= command-line option and
  # memoized after the first successful lookup. Raises a RuntimeError when
  # the option was not supplied.
  #
  def sampling_fraction
    return @sampling_fraction if @sampling_fraction

    requested = options[:sampling_fraction]
    unless requested
      raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1"
    end
    @sampling_fraction = requested.to_f
  end

  #
  # True for roughly +sampling_fraction+ of the calls: draws a uniform random
  # number in [0, 1) and compares it with the configured fraction. The line
  # itself is not inspected.
  #
  def emit?(line)
    draw = rand
    draw < sampling_fraction
  end
end
|
|
34
|
+
|
|
35
|
+
#
# Script wrapper for the sampler. Sets :reduce_tasks to 0 on top of the
# inherited defaults, as this job is run with a mapper and no reducer.
#
class Script < Wukong::Script
  # Inherited defaults with :reduce_tasks forced to 0.
  def default_options
    inherited = super
    inherited.merge(:reduce_tasks => 0)
  end
end
|
|
40
|
+
|
|
41
|
+
#
|
|
42
|
+
# Executes the script
|
|
43
|
+
#
|
|
44
|
+
Script.new( Mapper, nil ).run
|
data/examples/size.rb
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
module Size
  #
  # Pipes the whole dataset through +wc+ in the map phase; the reducer then
  # adds the partial results together.
  #
  class Script < Wukong::Script
    #
    # If a unix command already does the job faster, don't write a wukong
    # script for it: override map_command (or reduce_command) in your
    # Wukong::Script subclass and return the complete command line.
    #
    def map_command
      '/usr/bin/wc'
    end

    # A single reducer, so that every record lands in the same place.
    def default_options
      inherited = super
      inherited.merge(:reduce_tasks => 1)
    end
  end

  #
  # Adds up, column by column, the numeric fields of every input line.
  #
  class Reducer < Wukong::Streamer::Base
    # Running per-column totals; nil until the first record is processed.
    attr_accessor :sums

    #
    # +wc+ separates its columns with runs of whitespace rather than tabs,
    # so split on whitespace after trimming the line.
    #
    def recordize(line)
      trimmed = line.strip
      trimmed.split(/\s+/)
    end

    #
    # Fold one record into the running totals. The result has one entry per
    # column of the current record; a column with no previous total starts
    # from zero.
    #
    def process(*vals)
      running = sums || []
      self.sums = (0...vals.size).map { |idx| vals[idx].to_i + running[idx].to_i }
    end

    #
    # Consume the entire reduce input via the inherited stream, then emit
    # the totals once at the end.
    #
    def stream(*args)
      super(*args)
      emit sums
    end
  end
end
|
|
55
|
+
|
|
56
|
+
# Execute the script
|
|
57
|
+
Size::Script.new(
|
|
58
|
+
nil,
|
|
59
|
+
Size::Reducer
|
|
60
|
+
).run
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
|
3
|
+
require 'wukong'
|
|
4
|
+
|
|
5
|
+
module WordCount
  #
  # Emits a [word, 1] record for every word of every input line.
  #
  class Mapper < Wukong::Streamer::LineStreamer
    #
    # Split a string into its constituent words.
    #
    # This is pretty simpleminded:
    # * downcase the word
    # * Split at any non-alphanumeric boundary, including '_'
    # * However, preserve the special cases of 's or 't at the end of a
    #   word.
    #
    #   tokenize("Jim's dawg won't hunt: dawg_hunt error #3007a4")
    #   # => ["jim's", "dawg", "won't", "hunt", "dawg", "hunt", "error", "3007a4"]
    #
    # Returns an empty array when +str+ is nil.
    #
    def tokenize(str)
      return [] unless str
      str = str.downcase
      # kill off all punctuation except [stuff]'s or [stuff]'t
      # this includes hyphens (words are split).
      # Apostrophes worth keeping are parked as '!' while the rest are
      # blanked out, then restored.
      str = str.
        gsub(/[^a-zA-Z0-9\']+/, ' ').
        gsub(/(\w)\'([st])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
      # Busticate at whitespace
      words = str.strip.split(/\s+/)
      words.reject!{|w| w.blank? }
      words
    end

    #
    # Emit each word in each line.
    #
    def process(line)
      tokenize(line).each{|word| yield [word, 1] }
    end
  end

  #
  # Accumulate the sum record-by-record, by hand.
  #
  # Counts consecutive records that share a key and yields [word, count]
  # whenever the key changes. The count for the final key is emitted once
  # the input is exhausted.
  #
  class Reducer0 < Wukong::Streamer::Base
    # Records seen so far for the current key; nil before the first record.
    attr_accessor :key_count

    #
    # +count+ is accepted for signature compatibility but not used: each
    # record contributes 1, as in Reducer2 below.
    #
    def process(word, count)
      if key_count && (@last_word == word)
        self.key_count += 1
      else
        # Key changed: flush the finished key (if any), then start the new
        # key at 1 so its first record is counted.
        yield [ @last_word, key_count ] if key_count
        @last_word     = word
        self.key_count = 1
      end
    end

    #
    # Run the inherited stream so the input is actually consumed, then flush
    # the last key as a single record. Nothing is emitted for empty input.
    #
    def stream(*args)
      super(*args)
      emit [ @last_word, key_count ] if key_count
    end
  end

  #
  # You can stack up all the values in a list then sum them at once:
  #
  require 'active_support/core_ext/enumerable'
  class Reducer1 < Wukong::Streamer::ListReducer
    # Takes the last field of each record held in +values+, converts it to an
    # integer, and yields [key, total].
    def finalize
      yield [ key, values.map(&:last).map(&:to_i).sum ]
    end
  end

  #
  # A bit kinder to your memory manager: accumulate the sum record-by-record:
  #
  class Reducer2 < Wukong::Streamer::AccumulatingReducer
    attr_accessor :key_count

    # Reset the counter to zero.
    def start!(*args)
      self.key_count = 0
    end

    # Count one more record.
    def accumulate(*args)
      self.key_count += 1
    end

    # Yield the [key, count] pair.
    def finalize
      yield [ key, key_count ]
    end
  end

  #
  # ... easiest of all, though: this is common enough that it's already included
  #
  require 'wukong/streamer/count_keys'
  class Reducer3 < Wukong::Streamer::CountKeys
  end

end
|
|
90
|
+
|
|
91
|
+
# Execute the script
|
|
92
|
+
Wukong::Script.new(
|
|
93
|
+
WordCount::Mapper,
|
|
94
|
+
WordCount::Reducer1
|
|
95
|
+
).run
|
data/lib/wukong.rb
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
require 'wukong/boot'
|
|
2
|
+
require 'wukong/extensions'
|
|
3
|
+
require 'wukong/datatypes'
|
|
4
|
+
require 'wukong/logger'
|
|
5
|
+
require 'wukong/bad_record'
|
|
6
|
+
autoload :TypedStruct, 'wukong/typed_struct'
|
|
7
|
+
#
# Top-level namespace. The components listed here are registered with
# autoload rather than required up front, so each file is only read the
# first time its constant is referenced.
#
module Wukong
  [
    [:Dfs,      'wukong/dfs'],
    [:Script,   'wukong/script'],
    [:Streamer, 'wukong/streamer'],
  ].each { |const_name, path| autoload const_name, path }
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
require 'wukong/and_pig/pig_var'
|
|
2
|
+
require 'wukong/and_pig/as'
|
|
3
|
+
require 'wukong/and_pig/functions'
|
|
4
|
+
require 'wukong/and_pig/operators'
|
|
5
|
+
require 'wukong/and_pig/data_types'
|
|
6
|
+
require 'wukong/and_pig/pig_struct'
|
|
7
|
+
require 'wukong/and_pig/generate'
|
|
8
|
+
require 'wukong/and_pig/symbol'
|
|
9
|
+
require 'wukong/and_pig/utils'
|
|
10
|
+
|
|
11
|
+
module Wukong
|
|
12
|
+
#
|
|
13
|
+
# Wukong::AndPig lets you generate and run pig[http://hadoop.apache.org/pig]
|
|
14
|
+
# code from within ruby (and interactively, from the +irb+ console).
|
|
15
|
+
#
|
|
16
|
+
# It uses the same typed structures you've defined for Wukong to create
|
|
17
|
+
# pig-types aware commands. For example, the Wukong class
|
|
18
|
+
#
|
|
19
|
+
# class Customer < TypedStruct.new( [:id, Integer],
|
|
20
|
+
# [:name, String], [:postal_code, Integer], [:balance, Float] )
|
|
21
|
+
# end
|
|
22
|
+
#
|
|
23
|
+
# will generate a LOAD command for pig as
|
|
24
|
+
#
|
|
25
|
+
# Customer1.pig_load('q4_reports/customers.tsv').set!
|
|
26
|
+
# # => Q4ReportsCustomers2 = LOAD 'q4_reports/customers.tsv'
|
|
27
|
+
# AS (id: int, name: chararray, postal_code: int, balance: float) ;
|
|
28
|
+
#
|
|
29
|
+
# You can write anonymous chains
|
|
30
|
+
#
|
|
31
|
+
# q1 = Customer1.
|
|
32
|
+
# pig_load('q4_reports/customers.tsv').set!.
|
|
33
|
+
# distinct.set! ;
|
|
34
|
+
# q1.
|
|
35
|
+
# group(:by => :postal_code).set!.
|
|
36
|
+
# generate([:group, :postal_code], ["COUNT(#{q1.relation})", :customers_per_zip]).set!.
|
|
37
|
+
# store!
|
|
38
|
+
#
|
|
39
|
+
# Q4ReportsCustomers35 = LOAD 'q4_reports/customers.tsv' AS (id: int,name: chararray,postal_code: int,balance: float) ;
|
|
40
|
+
# Q4ReportsCustomers36 = DISTINCT Q4ReportsCustomers35 ;
|
|
41
|
+
# Q4ReportsCustomers37 = GROUP Q4ReportsCustomers36 BY postal_code ;
|
|
42
|
+
# Q4ReportsCustomers38 = FOREACH Q4ReportsCustomers37 GENERATE
|
|
43
|
+
# group AS postal_code,
|
|
44
|
+
# COUNT(Q4ReportsCustomers36) AS customers_per_zip ;
|
|
45
|
+
#
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
#
|
|
48
|
+
# Note on pig:
|
|
49
|
+
#
|
|
50
|
+
# 1) Reverse the order of your tables in your join statement. Pig always
|
|
51
|
+
# streams the keys of the last input, (materializing in memory the keys of
|
|
52
|
+
# the first), so if one of your inputs has fewer instances of a given key
|
|
53
|
+
# this may help.
|
|
54
|
+
#
|
|
55
|
+
# 2) Reduce the number of maps and reducers per machine and give it all the
|
|
56
|
+
# memory you can.
|
|
57
|
+
#
|
|
58
|
+
#
|
|
59
|
+
module AndPig
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Wukong::AndPig is a small library to more easily generate code for the
|
|
2
|
+
"Pig":http://hadoop.apache.org/pig data analysis language.
|
|
3
|
+
|
|
4
|
+
Wukong::AndPig lets you use the structs from your Wukong scripts to
|
|
5
|
+
generate Pig instructions that know their types and structure -- even through
|
|
6
|
+
multiple pig commands. For example, if you use +FOREACH ... GENERATE+ to select
|
|
7
|
+
only a few of those fields, Wukong::AndPig will know that the result has only
|
|
8
|
+
those fields.
|
|
9
|
+
|
|
10
|
+
We're still trying to figure out if this is a stupid and crazy idea, or just a
|
|
11
|
+
crazy idea: Yeah, we're using a functional/OO scripting language to generate code for an
|
|
12
|
+
imperative query language that generates Java code for ad-hoc map-reduce operations.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#
# One "<ref>::<expr> AS <name>:<type>" clause.
#
# expr    -- the expression being aliased
# name    -- alias given to the expression (optional)
# type    -- type of the aliased field (optional); rendered with +typify+
# ref     -- prefix joined to the expression with '::' (optional)
# options -- hash of flags; +to_s+ honors :skip_name and :skip_type
#
class AS
  attr_accessor :expr, :name, :type, :ref, :options

  #
  # Build a clause from its parts, or copy an existing AS passed as +expr+.
  # When copying, any non-nil +name+, +type+ or +ref+ overrides the copied
  # value, and each of +option_flags+ is switched on in addition to the
  # copied options.
  #
  def initialize(expr, name = nil, type = nil, ref = nil, *option_flags)
    case expr
    when AS
      self.expr    = expr.expr
      self.name    = expr.name
      self.type    = expr.type
      self.ref     = expr.ref
      # Duplicate rather than share the hash: flags set on the copy below
      # must not leak back into the clause it was copied from.
      self.options = (expr.options || {}).dup
    end
    self.expr    ||= expr
    self.name      = name if name
    self.type      = type if type
    self.ref       = ref  if ref
    self.options ||= { }
    option_flags.each{|option| self.options[option] = true }
  end

  #
  # Render the clause: the (optionally ref-qualified) expression
  # left-justified to 30 columns, followed by "AS name" and ":type" when a
  # name is present and the matching skip flag is not set.
  #
  def to_s
    clause = "%-30s \t" % [ref, expr].compact.join('::')
    if name
      clause << "AS #{name}"       unless options[:skip_name]
      # +typify+ is not defined in this file -- NOTE(review): presumably a
      # project extension on the type objects; confirm.
      clause << ":#{type.typify}"  unless ((!type) || options[:skip_type])
    end
    clause
  end

  # Shorthand constructor: AS[expr, name, ...] is AS.new(expr, name, ...).
  def self.[](*args)
    new(*args)
  end

  # Useful for feeding back into TypedStruct
  def name_type
    [name, type]
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# == SimpleDataTypes ==
|
|
2
|
+
# int
|
|
3
|
+
# long
|
|
4
|
+
# double
|
|
5
|
+
# arrays
|
|
6
|
+
# chararray
|
|
7
|
+
# bytearray
|
|
8
|
+
#
|
|
9
|
+
# == ComplexDataTypes ==
|
|
10
|
+
# tuple
|
|
11
|
+
# bag
|
|
12
|
+
# map
|
|
13
|
+
|
|
14
|
+
module Wukong
|
|
15
|
+
module AndPig
|
|
16
|
+
class PigVar
|
|
17
|
+
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
# class ScalarInteger < TypedStruct.new [
|
|
23
|
+
# [:count, Integer ],
|
|
24
|
+
# ]
|
|
25
|
+
# include Wukong::AndPig::PigEmitter
|
|
26
|
+
# def self.load_scalar path
|
|
27
|
+
# var = super path
|
|
28
|
+
# var.to_i
|
|
29
|
+
# end
|
|
30
|
+
# end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
|
|
2
|
+
# == Built-in Functions
|
|
3
|
+
# EvalFunctions
|
|
4
|
+
# AVG
|
|
5
|
+
# CONCAT
|
|
6
|
+
# COUNT
|
|
7
|
+
# DIFF
|
|
8
|
+
# MIN
|
|
9
|
+
# MAX
|
|
10
|
+
# SIZE
|
|
11
|
+
# SUM
|
|
12
|
+
# TOKENIZE
|
|
13
|
+
|
|
14
|
+
# == NullOperators
|
|
15
|
+
# isnull
|
|
16
|
+
# isnotnull
|
|
17
|
+
#
|
|
18
|
+
# == BooleanOperators
|
|
19
|
+
# and
|
|
20
|
+
# or
|
|
21
|
+
# not
|
|
22
|
+
#
|
|
23
|
+
# == DereferenceOperators
|
|
24
|
+
# tupledereference.
|
|
25
|
+
# mapdereference#
|
|
26
|
+
#
|
|
27
|
+
# == SignOperators
|
|
28
|
+
# positive+
|
|
29
|
+
# negative-
|
|
30
|
+
#
|
|
31
|
+
# == CastOperators
|
|
32
|
+
# (type)$0
|
|
33
|
+
# (type)alias
|
|
34
|
+
#
|
|
35
|
+
# == ArithmeticOperators
|
|
36
|
+
# addition+
|
|
37
|
+
# subtraction-
|
|
38
|
+
# multiplication*
|
|
39
|
+
# division/
|
|
40
|
+
# modulo%
|
|
41
|
+
# bincond?
|
|
42
|
+
#
|
|
43
|
+
# == ComparisonOperators
|
|
44
|
+
# Equal==
|
|
45
|
+
# notequal!=
|
|
46
|
+
# lessthan<
|
|
47
|
+
# greaterthan>
|
|
48
|
+
# lessthanorequalto<=
|
|
49
|
+
# greaterthanorequalto>=
|
|
50
|
+
# patternmatchingmatches
|