wukong 0.1.4 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL.textile +89 -0
- data/README.textile +41 -74
- data/docpages/INSTALL.textile +94 -0
- data/{doc → docpages}/LICENSE.textile +0 -0
- data/{doc → docpages}/README-wulign.textile +6 -0
- data/docpages/UsingWukong-part1-get_ready.textile +17 -0
- data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
- data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
- data/docpages/_config.yml +39 -0
- data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
- data/{doc → docpages}/code/api_response_example.txt +0 -0
- data/{doc → docpages}/code/parser_skeleton.rb +0 -0
- data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +16 -0
- data/docpages/hadoop-tips.textile +83 -0
- data/docpages/index.textile +90 -0
- data/docpages/intro.textile +8 -0
- data/docpages/moreinfo.textile +174 -0
- data/docpages/news.html +24 -0
- data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
- data/docpages/tutorial.textile +283 -0
- data/docpages/usage.textile +195 -0
- data/docpages/wutils.textile +263 -0
- data/wukong.gemspec +80 -50
- metadata +87 -54
- data/doc/INSTALL.textile +0 -41
- data/doc/README-tutorial.textile +0 -163
- data/doc/README-wutils.textile +0 -128
- data/doc/TODO.textile +0 -61
- data/doc/UsingWukong-part1-setup.textile +0 -2
- data/doc/UsingWukong-part2-scraping.textile +0 -2
- data/doc/hadoop-nfs.textile +0 -51
- data/doc/hadoop-setup.textile +0 -29
- data/doc/index.textile +0 -124
- data/doc/links.textile +0 -42
- data/doc/usage.textile +0 -102
- data/doc/utils.textile +0 -48
- data/examples/and_pig/sample_queries.rb +0 -128
- data/lib/wukong/and_pig.rb +0 -62
- data/lib/wukong/and_pig/README.textile +0 -12
- data/lib/wukong/and_pig/as.rb +0 -37
- data/lib/wukong/and_pig/data_types.rb +0 -30
- data/lib/wukong/and_pig/functions.rb +0 -50
- data/lib/wukong/and_pig/generate.rb +0 -85
- data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
- data/lib/wukong/and_pig/junk.rb +0 -51
- data/lib/wukong/and_pig/operators.rb +0 -8
- data/lib/wukong/and_pig/operators/compound.rb +0 -29
- data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
- data/lib/wukong/and_pig/operators/execution.rb +0 -15
- data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
- data/lib/wukong/and_pig/operators/foreach.rb +0 -98
- data/lib/wukong/and_pig/operators/groupies.rb +0 -212
- data/lib/wukong/and_pig/operators/load_store.rb +0 -65
- data/lib/wukong/and_pig/operators/meta.rb +0 -42
- data/lib/wukong/and_pig/operators/relational.rb +0 -129
- data/lib/wukong/and_pig/pig_struct.rb +0 -48
- data/lib/wukong/and_pig/pig_var.rb +0 -95
- data/lib/wukong/and_pig/symbol.rb +0 -29
- data/lib/wukong/and_pig/utils.rb +0 -0
data/lib/wukong/and_pig.rb
DELETED
@@ -1,62 +0,0 @@
|
|
1
|
-
require 'wukong/and_pig/pig_var'
|
2
|
-
require 'wukong/and_pig/as'
|
3
|
-
require 'wukong/and_pig/functions'
|
4
|
-
require 'wukong/and_pig/operators'
|
5
|
-
require 'wukong/and_pig/data_types'
|
6
|
-
require 'wukong/and_pig/pig_struct'
|
7
|
-
require 'wukong/and_pig/generate'
|
8
|
-
require 'wukong/and_pig/symbol'
|
9
|
-
require 'wukong/and_pig/utils'
|
10
|
-
|
11
|
-
module Wukong
|
12
|
-
#
|
13
|
-
# Wukong::AndPig lets you generate and run pig[http://hadoop.apache.org/pig]
|
14
|
-
# code from within ruby (and interactively, from the +irb+ console).
|
15
|
-
#
|
16
|
-
# It uses the same typed structures you've defined for Wukong to create
|
17
|
-
# pig-types aware commands. For example, the Wukong class
|
18
|
-
#
|
19
|
-
# class Customer < TypedStruct.new( [:id, Integer],
|
20
|
-
# [:name, String], [:postal_code, Integer], [:balance, Float] )
|
21
|
-
# end
|
22
|
-
#
|
23
|
-
# will generate a LOAD command for pig as
|
24
|
-
#
|
25
|
-
# Customer1.pig_load('q4_reports/customers.tsv').set!
|
26
|
-
# # => Q4ReportsCustomers2 = LOAD 'q4_reports/customers.tsv'
|
27
|
-
# AS (id: int, name: chararray, postal_code: int, balance: float) ;
|
28
|
-
#
|
29
|
-
# You can write anonymous chains
|
30
|
-
#
|
31
|
-
# q1 = Customer1.
|
32
|
-
# pig_load('q4_reports/customers.tsv').set!.
|
33
|
-
# distinct.set! ;
|
34
|
-
# q1.
|
35
|
-
# group(:by => :postal_code).set!.
|
36
|
-
# generate([:group, :postal_code], ["COUNT(#{q1.relation})", :customers_per_zip]).set!.
|
37
|
-
# store!
|
38
|
-
#
|
39
|
-
# Q4ReportsCustomers35 = LOAD 'q4_reports/customers.tsv' AS (id: int,name: chararray,postal_code: int,balance: float) ;
|
40
|
-
# Q4ReportsCustomers36 = DISTINCT Q4ReportsCustomers35 ;
|
41
|
-
# Q4ReportsCustomers37 = GROUP Q4ReportsCustomers36 BY postal_code ;
|
42
|
-
# Q4ReportsCustomers38 = FOREACH Q4ReportsCustomers37 GENERATE
|
43
|
-
# group AS postal_code,
|
44
|
-
# COUNT(Q4ReportsCustomers36) AS customers_per_zip ;
|
45
|
-
#
|
46
|
-
# ---------------------------------------------------------------------------
|
47
|
-
#
|
48
|
-
# Note on pig:
|
49
|
-
#
|
50
|
-
# 1) Reverse the order of your tables in your join statement. Pig always
|
51
|
-
# streams the keys of the last input, (materializing in memory the keys of
|
52
|
-
# the first), so if one of your inputs has fewer instances of a given key
|
53
|
-
# this may help.
|
54
|
-
#
|
55
|
-
# 2) Reduce the number of maps and reducers per machine and give it all the
|
56
|
-
# memory you can.
|
57
|
-
#
|
58
|
-
#
|
59
|
-
module AndPig
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
@@ -1,12 +0,0 @@
|
|
1
|
-
Wukong::AndPig is a small library to more easily generate code for the
|
2
|
-
"Pig":http://hadoop.apache.org/pig data analysis language.
|
3
|
-
|
4
|
-
Wukong::AndPig lets you use the structs from your Wukong scripts to
|
5
|
-
generate Pig instructions that know their types and structure -- even through
|
6
|
-
multiple pig commands. For example, if you use +FOREACH ... GENERATE+ to select
|
7
|
-
only a few of those fields, Wukong::AndPig will know that the result has only
|
8
|
-
those fields.
|
9
|
-
|
10
|
-
We're still trying to figure out if this is a stupid and crazy idea, or just a
|
11
|
-
crazy idea: Yeah, we're using a functional/OO scripting language to generate code for an
|
12
|
-
imperative query language that generates Java code for ad-hoc map-reduce operations.
|
data/lib/wukong/and_pig/as.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
class AS
|
2
|
-
attr_accessor :expr, :name, :type, :ref, :options
|
3
|
-
def initialize expr, name=nil, type=nil, ref=nil, *option_flags
|
4
|
-
case expr
|
5
|
-
when AS
|
6
|
-
self.expr = expr.expr
|
7
|
-
self.name = expr.name
|
8
|
-
self.type = expr.type
|
9
|
-
self.ref = expr.ref
|
10
|
-
self.options = expr.options
|
11
|
-
end
|
12
|
-
self.expr ||= expr
|
13
|
-
self.name = name if name
|
14
|
-
self.type = type if type
|
15
|
-
self.ref = ref if ref
|
16
|
-
self.options ||= { }
|
17
|
-
option_flags.each{|option| self.options[option] = true }
|
18
|
-
end
|
19
|
-
|
20
|
-
def to_s
|
21
|
-
clause = "%-30s \t" % [ref, expr].compact.join('::')
|
22
|
-
if name
|
23
|
-
clause << "AS #{name}" unless options[:skip_name]
|
24
|
-
clause << ":#{type.typify}" unless ((!type) || options[:skip_type])
|
25
|
-
end
|
26
|
-
clause
|
27
|
-
end
|
28
|
-
|
29
|
-
def self.[] *args
|
30
|
-
self.new *args
|
31
|
-
end
|
32
|
-
|
33
|
-
# Useful for feeding back into TypedStruct
|
34
|
-
def name_type
|
35
|
-
[name, type]
|
36
|
-
end
|
37
|
-
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
# == SimpleDataTypes ==
|
2
|
-
# int
|
3
|
-
# long
|
4
|
-
# double
|
5
|
-
# arrays
|
6
|
-
# chararray
|
7
|
-
# bytearray
|
8
|
-
#
|
9
|
-
# == ComplexDataTypes ==
|
10
|
-
# tuple
|
11
|
-
# bag
|
12
|
-
# map
|
13
|
-
|
14
|
-
module Wukong
|
15
|
-
module AndPig
|
16
|
-
class PigVar
|
17
|
-
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# class ScalarInteger < TypedStruct.new [
|
23
|
-
# [:count, Integer ],
|
24
|
-
# ]
|
25
|
-
# include Wukong::AndPig::PigEmitter
|
26
|
-
# def self.load_scalar path
|
27
|
-
# var = super path
|
28
|
-
# var.to_i
|
29
|
-
# end
|
30
|
-
# end
|
@@ -1,50 +0,0 @@
|
|
1
|
-
|
2
|
-
# == Built-in Functions
|
3
|
-
# EvalFunctions
|
4
|
-
# AVG
|
5
|
-
# CONCAT
|
6
|
-
# COUNT
|
7
|
-
# DIFF
|
8
|
-
# MIN
|
9
|
-
# MAX
|
10
|
-
# SIZE
|
11
|
-
# SUM
|
12
|
-
# TOKENIZE
|
13
|
-
|
14
|
-
# == NullOperators
|
15
|
-
# isnull
|
16
|
-
# isnotnull
|
17
|
-
#
|
18
|
-
# == BooleanOperators
|
19
|
-
# and
|
20
|
-
# or
|
21
|
-
# not
|
22
|
-
#
|
23
|
-
# == DereferenceOperators
|
24
|
-
# tupledereference.
|
25
|
-
# mapdereference#
|
26
|
-
#
|
27
|
-
# == SignOperators
|
28
|
-
# positive+
|
29
|
-
# negative-
|
30
|
-
#
|
31
|
-
# == CastOperators
|
32
|
-
# (type)$0
|
33
|
-
# (type)alias
|
34
|
-
#
|
35
|
-
# == ArithmeticOperators
|
36
|
-
# addition+
|
37
|
-
# subtraction-
|
38
|
-
# multiplication*
|
39
|
-
# division/
|
40
|
-
# modulo%
|
41
|
-
# bincond?
|
42
|
-
#
|
43
|
-
# == ComparisonOperators
|
44
|
-
# Equal==
|
45
|
-
# notequal!=
|
46
|
-
# lessthan<
|
47
|
-
# greaterthan>
|
48
|
-
# lessthanorequalto<=
|
49
|
-
# greaterthanorequalto>=
|
50
|
-
# patternmatchingmatches
|
@@ -1,85 +0,0 @@
|
|
1
|
-
require 'wukong/and_pig/generate/variable_inflections'
|
2
|
-
|
3
|
-
module Wukong
|
4
|
-
module AndPig
|
5
|
-
|
6
|
-
mattr_accessor :comments
|
7
|
-
self.comments = true
|
8
|
-
# send output to stdout or to captured pig instance
|
9
|
-
mattr_accessor :emit_dest
|
10
|
-
# full pathname to the pig executable
|
11
|
-
PIG_EXECUTABLE = '/usr/local/bin/pig'
|
12
|
-
|
13
|
-
def self.finish
|
14
|
-
PigVar.pig_in_poke.close if PigVar.pig_in_poke.respond_to?(:close)
|
15
|
-
end
|
16
|
-
|
17
|
-
#
|
18
|
-
# All the embarrassing magick to pretend ruby symbols are pig relations
|
19
|
-
#
|
20
|
-
class PigVar
|
21
|
-
|
22
|
-
# Output a command
|
23
|
-
def self.emit cmd, semicolon=true
|
24
|
-
cmd = cmd + ' ;' if semicolon
|
25
|
-
case Wukong::AndPig.emit_dest
|
26
|
-
when :captured
|
27
|
-
pig_in_poke.puts(cmd)
|
28
|
-
pig_in_poke.flush
|
29
|
-
puts pig_in_poke.gets
|
30
|
-
else
|
31
|
-
puts(cmd)
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
# generate the code
|
36
|
-
def self.emit_setter relation, rval
|
37
|
-
emit "%-23s\t= %s" % [relation, rval.cmd]
|
38
|
-
rval
|
39
|
-
end
|
40
|
-
|
41
|
-
# generate the code
|
42
|
-
def self.emit_imperative imperative, *rest
|
43
|
-
cmd_part = "%-14s \t" % imperative
|
44
|
-
arg_part = rest.map{|s| "%14s" % s.to_s }.join(" \t")
|
45
|
-
emit cmd_part+arg_part
|
46
|
-
rest.first
|
47
|
-
end
|
48
|
-
|
49
|
-
def self.pig_in_poke
|
50
|
-
return @pig_in_poke if @pig_in_poke
|
51
|
-
case Wukong::AndPig.emit_dest
|
52
|
-
when :captured
|
53
|
-
@pig_in_poke = IO.popen(PIG_EXECUTABLE, "w+")
|
54
|
-
@pig_in_poke.sync = true
|
55
|
-
@pig_in_poke
|
56
|
-
else @pig_in_poke = $stdout
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
#
|
61
|
-
# Reset the captured pig instance
|
62
|
-
#
|
63
|
-
def self.reset_pig_in_poke!
|
64
|
-
begin pig_in_poke.close ; rescue nil ; end
|
65
|
-
@pig_in_poke = nil
|
66
|
-
end
|
67
|
-
|
68
|
-
def set!
|
69
|
-
self.class.emit_setter(relation, self)
|
70
|
-
end
|
71
|
-
|
72
|
-
#
|
73
|
-
# Emit a comment
|
74
|
-
# skips if Wukong::AndPig.comments is false
|
75
|
-
#
|
76
|
-
def self.rem comment
|
77
|
-
return unless Wukong::AndPig.comments
|
78
|
-
PigVar.emit comment.gsub(/(^|\n)(#([\t ]|$))?/, "\n-- "), false
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
|
@@ -1,82 +0,0 @@
|
|
1
|
-
String.class_eval do
|
2
|
-
#
|
3
|
-
# Generate relation name from a handle
|
4
|
-
#
|
5
|
-
def relationize() camelize end
|
6
|
-
end
|
7
|
-
Symbol.class_eval do
|
8
|
-
#
|
9
|
-
# Generate relation name from a handle
|
10
|
-
#
|
11
|
-
def relationize
|
12
|
-
to_s.relationize
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
Object.class_eval do
|
17
|
-
def typify() self.class ; end
|
18
|
-
|
19
|
-
def symbolize
|
20
|
-
self.to_s.underscore.gsub(%r{.*/}, '').to_sym
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
class << Integer ; def typify() 'int' end ; end
|
25
|
-
class << Bignum ; def typify() 'long' end ; end
|
26
|
-
class << Float ; def typify() 'float' end ; end
|
27
|
-
class << String ; def typify() 'chararray' end ; end
|
28
|
-
class << Symbol ; def typify() self end ; end
|
29
|
-
class << Date ; def typify() 'long' end ; end
|
30
|
-
|
31
|
-
# Array.class_eval do
|
32
|
-
# def typify()
|
33
|
-
# "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
|
34
|
-
# end
|
35
|
-
# end
|
36
|
-
# class Tuple
|
37
|
-
# attr_accessor :contents
|
38
|
-
# def initialize *args
|
39
|
-
# self.contents = args
|
40
|
-
# end
|
41
|
-
# def typify
|
42
|
-
# "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
|
43
|
-
# end
|
44
|
-
# #
|
45
|
-
# # Sugar for creating a new bag. The following are equivalent:
|
46
|
-
# #
|
47
|
-
# # Bag[:foo]
|
48
|
-
# # Bag.new :foo
|
49
|
-
# #
|
50
|
-
# def self.[] *args
|
51
|
-
# new *args
|
52
|
-
# end
|
53
|
-
# end
|
54
|
-
|
55
|
-
module BagMethods
|
56
|
-
module ClassMethods
|
57
|
-
#
|
58
|
-
# Pig type string --
|
59
|
-
# the pig type strings for each sub-element.
|
60
|
-
#
|
61
|
-
def typify
|
62
|
-
vars_str = members.zip(mtypes).map do |attr, mtype|
|
63
|
-
"%s: %s" % [attr, mtype.typify]
|
64
|
-
end
|
65
|
-
"{ #{vars_str.join(', ')} }"
|
66
|
-
end
|
67
|
-
end
|
68
|
-
def self.included base
|
69
|
-
base.extend ClassMethods
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
class Bag < TypedStruct
|
74
|
-
def self.new *args
|
75
|
-
bag = super *args
|
76
|
-
bag.class_eval{ include BagMethods }
|
77
|
-
end
|
78
|
-
def self.[] *args
|
79
|
-
new *args
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
data/lib/wukong/and_pig/junk.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module Wukong
|
4
|
-
module AndPig
|
5
|
-
|
6
|
-
#
|
7
|
-
# Load the main class definitions
|
8
|
-
#
|
9
|
-
def self.init_load
|
10
|
-
puts File.open(PIG_DEFS_DIR+"/init_load.pig").read
|
11
|
-
end
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
#
|
17
|
-
# OK we're going to cheat here:
|
18
|
-
# just cat the file in, and treat it as a scalar
|
19
|
-
#
|
20
|
-
def load_scalar path
|
21
|
-
# var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
|
22
|
-
var = "636"
|
23
|
-
end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
def count_distinct dest_rel, attr, group_by
|
28
|
-
distincted =
|
29
|
-
generate(temp_rel(dest_rel), attr).
|
30
|
-
distinct(temp_rel(dest_rel), :parallel => 10)
|
31
|
-
distincted.
|
32
|
-
group( temp_rel(dest_rel), group_by).
|
33
|
-
foreach( dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
|
34
|
-
end
|
35
|
-
|
36
|
-
#
|
37
|
-
# Group a relation into bins, and return the counts for each bin
|
38
|
-
# * dest_rel - Relation to store
|
39
|
-
# {bin,
|
40
|
-
#
|
41
|
-
def histogram dest_rel, bin_attr, bin_expr=nil
|
42
|
-
bin_expr ||= bin_attr
|
43
|
-
bin_name = "#{bin_attr}_bin"
|
44
|
-
binned = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
|
45
|
-
binned. group( temp_rel(dest_rel), :by => bin_name).
|
46
|
-
foreach( dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
|
47
|
-
end
|
48
|
-
|
49
|
-
|
50
|
-
end
|
51
|
-
end
|
@@ -1,8 +0,0 @@
|
|
1
|
-
require 'wukong/and_pig/operators/evaluators'
|
2
|
-
require 'wukong/and_pig/operators/foreach'
|
3
|
-
require 'wukong/and_pig/operators/groupies'
|
4
|
-
require 'wukong/and_pig/operators/load_store'
|
5
|
-
require 'wukong/and_pig/operators/meta'
|
6
|
-
require 'wukong/and_pig/operators/relational'
|
7
|
-
require 'wukong/and_pig/operators/file_methods'
|
8
|
-
require 'wukong/and_pig/operators/compound'
|
@@ -1,29 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# The FOREACH relational operator
|
3
|
-
#
|
4
|
-
module Wukong
|
5
|
-
module AndPig
|
6
|
-
class PigVar
|
7
|
-
#
|
8
|
-
# Select all elements in the source relation that match on the selecting relation,
|
9
|
-
# creating a relation with the same type as the source relation.
|
10
|
-
#
|
11
|
-
# For example,
|
12
|
-
#
|
13
|
-
# PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
|
14
|
-
#
|
15
|
-
# returns a relation IsolatedCvals, whose type is identical to
|
16
|
-
# MyComplicatedValues' type, with only the elements having an id also
|
17
|
-
# present in MyIds.
|
18
|
-
#
|
19
|
-
#
|
20
|
-
def self.isolate lval, on, on_field, from, from_field, options={ }
|
21
|
-
joined = join anon(lval), on => on_field, from => from_field, :parallel => options.delete(:parallel)
|
22
|
-
isolated = joined.generate lval, { "'#{from}'" => :rsrc}, *PV[from].fields.map{|field| [from, field]}
|
23
|
-
isolated.klass = from.klass
|
24
|
-
isolated
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|