wukong 0.1.4 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL.textile +89 -0
- data/README.textile +41 -74
- data/docpages/INSTALL.textile +94 -0
- data/{doc → docpages}/LICENSE.textile +0 -0
- data/{doc → docpages}/README-wulign.textile +6 -0
- data/docpages/UsingWukong-part1-get_ready.textile +17 -0
- data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
- data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
- data/docpages/_config.yml +39 -0
- data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
- data/{doc → docpages}/code/api_response_example.txt +0 -0
- data/{doc → docpages}/code/parser_skeleton.rb +0 -0
- data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +16 -0
- data/docpages/hadoop-tips.textile +83 -0
- data/docpages/index.textile +90 -0
- data/docpages/intro.textile +8 -0
- data/docpages/moreinfo.textile +174 -0
- data/docpages/news.html +24 -0
- data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
- data/docpages/tutorial.textile +283 -0
- data/docpages/usage.textile +195 -0
- data/docpages/wutils.textile +263 -0
- data/wukong.gemspec +80 -50
- metadata +87 -54
- data/doc/INSTALL.textile +0 -41
- data/doc/README-tutorial.textile +0 -163
- data/doc/README-wutils.textile +0 -128
- data/doc/TODO.textile +0 -61
- data/doc/UsingWukong-part1-setup.textile +0 -2
- data/doc/UsingWukong-part2-scraping.textile +0 -2
- data/doc/hadoop-nfs.textile +0 -51
- data/doc/hadoop-setup.textile +0 -29
- data/doc/index.textile +0 -124
- data/doc/links.textile +0 -42
- data/doc/usage.textile +0 -102
- data/doc/utils.textile +0 -48
- data/examples/and_pig/sample_queries.rb +0 -128
- data/lib/wukong/and_pig.rb +0 -62
- data/lib/wukong/and_pig/README.textile +0 -12
- data/lib/wukong/and_pig/as.rb +0 -37
- data/lib/wukong/and_pig/data_types.rb +0 -30
- data/lib/wukong/and_pig/functions.rb +0 -50
- data/lib/wukong/and_pig/generate.rb +0 -85
- data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
- data/lib/wukong/and_pig/junk.rb +0 -51
- data/lib/wukong/and_pig/operators.rb +0 -8
- data/lib/wukong/and_pig/operators/compound.rb +0 -29
- data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
- data/lib/wukong/and_pig/operators/execution.rb +0 -15
- data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
- data/lib/wukong/and_pig/operators/foreach.rb +0 -98
- data/lib/wukong/and_pig/operators/groupies.rb +0 -212
- data/lib/wukong/and_pig/operators/load_store.rb +0 -65
- data/lib/wukong/and_pig/operators/meta.rb +0 -42
- data/lib/wukong/and_pig/operators/relational.rb +0 -129
- data/lib/wukong/and_pig/pig_struct.rb +0 -48
- data/lib/wukong/and_pig/pig_var.rb +0 -95
- data/lib/wukong/and_pig/symbol.rb +0 -29
- data/lib/wukong/and_pig/utils.rb +0 -0
data/lib/wukong/and_pig.rb
DELETED
@@ -1,62 +0,0 @@
|
|
1
|
-
require 'wukong/and_pig/pig_var'
|
2
|
-
require 'wukong/and_pig/as'
|
3
|
-
require 'wukong/and_pig/functions'
|
4
|
-
require 'wukong/and_pig/operators'
|
5
|
-
require 'wukong/and_pig/data_types'
|
6
|
-
require 'wukong/and_pig/pig_struct'
|
7
|
-
require 'wukong/and_pig/generate'
|
8
|
-
require 'wukong/and_pig/symbol'
|
9
|
-
require 'wukong/and_pig/utils'
|
10
|
-
|
11
|
-
module Wukong
  #
  # Wukong::AndPig lets you generate and run pig[http://hadoop.apache.org/pig]
  # code from within ruby (and interactively, from the +irb+ console).
  #
  # It uses the same typed structures you've defined for Wukong to create
  # pig-types aware commands.  For example, the Wukong class
  #
  #   class Customer < TypedStruct.new( [:id, Integer],
  #       [:name, String], [:postal_code, Integer], [:balance, Float] )
  #   end
  #
  # will generate a LOAD command for pig as
  #
  #   Customer1.pig_load('q4_reports/customers.tsv').set!
  #   # => Q4ReportsCustomers2 = LOAD 'q4_reports/customers.tsv'
  #   #      AS (id: int, name: chararray, postal_code: int, balance: float) ;
  #
  # You can write anonymous chains
  #
  #   q1 = Customer1.
  #     pig_load('q4_reports/customers.tsv').set!.
  #     distinct.set! ;
  #   q1.
  #     group(:by => :postal_code).set!.
  #     generate([:group, :postal_code], ["COUNT(#{q1.relation})", :customers_per_zip]).set!.
  #     store!
  #
  #   Q4ReportsCustomers35 = LOAD 'q4_reports/customers.tsv' AS (id: int,name: chararray,postal_code: int,balance: float) ;
  #   Q4ReportsCustomers36 = DISTINCT Q4ReportsCustomers35 ;
  #   Q4ReportsCustomers37 = GROUP Q4ReportsCustomers36 BY postal_code ;
  #   Q4ReportsCustomers38 = FOREACH Q4ReportsCustomers37 GENERATE
  #     group AS postal_code,
  #     COUNT(Q4ReportsCustomers36) AS customers_per_zip ;
  #
  # ---------------------------------------------------------------------------
  #
  # Notes on pig:
  #
  # 1) Reverse the order of your tables in your join statement.  Pig always
  #    streams the keys of the last input (materializing in memory the keys of
  #    the first), so if one of your inputs has fewer instances of a given key
  #    this may help.
  #
  # 2) Reduce the number of maps and reducers per machine and give it all the
  #    memory you can.
  #
  module AndPig
  end
end
|
62
|
-
|
@@ -1,12 +0,0 @@
|
|
1
|
-
Wukong::AndPig is a small library to more easily generate code for the
|
2
|
-
"Pig":http://hadoop.apache.org/pig data analysis language.
|
3
|
-
|
4
|
-
Wukong::AndPig lets you use the structs from your Wukong scripts to
|
5
|
-
generate Pig instructions that know their types and structure -- even through
|
6
|
-
multiple pig commands. For example, if you use +FOREACH ... GENERATE+ to select
|
7
|
-
only a few of those fields, Wukong::AndPig will know that the result has only
|
8
|
-
those fields.
|
9
|
-
|
10
|
-
We're still trying to figure out if this is a stupid and crazy idea, or just a
|
11
|
-
crazy idea: Yeah, we're using a functional/OO scripting language to generate code for an
|
12
|
-
imperative query language that generates Java code for ad-hoc map-reduce operations.
|
data/lib/wukong/and_pig/as.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
#
# One term of a pig "expr AS name:type" clause.
#
# Holds the expression text, an optional relation it is qualified by (+ref+),
# the name and type it is given, and a hash of rendering flags (+options+).
#
class AS
  attr_accessor :expr, :name, :type, :ref, :options

  #
  # expr::         the expression, or another AS whose attributes are copied
  # name::         overrides the (copied) name when given
  # type::         overrides the (copied) type when given
  # ref::          overrides the (copied) ref when given
  # option_flags:: each flag is stored as +true+ in +options+
  #                (+to_s+ honors :skip_name and :skip_type)
  #
  def initialize(expr, name = nil, type = nil, ref = nil, *option_flags)
    if expr.is_a?(AS)
      self.expr    = expr.expr
      self.name    = expr.name
      self.type    = expr.type
      self.ref     = expr.ref
      # Copy the hash: the flags added below must not leak into the source AS.
      self.options = expr.options.dup if expr.options
    end
    # Falls back to the argument itself (also when a copied AS had no expr).
    self.expr ||= expr
    self.name = name if name
    self.type = type if type
    self.ref  = ref  if ref
    self.options ||= {}
    option_flags.each { |flag| options[flag] = true }
  end

  #
  # Render as pig code: "ref::expr" left-justified to 30 columns, then, only
  # when a name is set, "AS name" and ":type" unless suppressed by the
  # :skip_name / :skip_type options.  The type is rendered via +type.typify+.
  #
  def to_s
    clause = "%-30s \t" % [ref, expr].compact.join('::')
    return clause unless name
    clause << "AS #{name}" unless options[:skip_name]
    clause << ":#{type.typify}" if type && !options[:skip_type]
    clause
  end

  # Sugar for AS.new: AS[expr, name, type] is the same as AS.new(expr, name, type)
  def self.[](*args)
    new(*args)
  end

  # Useful for feeding back into TypedStruct
  def name_type
    [name, type]
  end
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
# == SimpleDataTypes ==
|
2
|
-
# int
|
3
|
-
# long
|
4
|
-
# double
|
5
|
-
# arrays
|
6
|
-
# chararray
|
7
|
-
# bytearray
|
8
|
-
#
|
9
|
-
# == ComplexDataTypes ==
|
10
|
-
# tuple
|
11
|
-
# bag
|
12
|
-
# map
|
13
|
-
|
14
|
-
module Wukong
  module AndPig
    #
    # Placeholder: this file (re)opens PigVar but defines no data-type
    # behavior for it.
    #
    class PigVar
    end
  end
end
|
21
|
-
|
22
|
-
# class ScalarInteger < TypedStruct.new [
|
23
|
-
# [:count, Integer ],
|
24
|
-
# ]
|
25
|
-
# include Wukong::AndPig::PigEmitter
|
26
|
-
# def self.load_scalar path
|
27
|
-
# var = super path
|
28
|
-
# var.to_i
|
29
|
-
# end
|
30
|
-
# end
|
@@ -1,50 +0,0 @@
|
|
1
|
-
|
2
|
-
# == Built-in Functions
|
3
|
-
# EvalFunctions
|
4
|
-
# AVG
|
5
|
-
# CONCAT
|
6
|
-
# COUNT
|
7
|
-
# DIFF
|
8
|
-
# MIN
|
9
|
-
# MAX
|
10
|
-
# SIZE
|
11
|
-
# SUM
|
12
|
-
# TOKENIZE
|
13
|
-
|
14
|
-
# == NullOperators
|
15
|
-
# isnull
|
16
|
-
# isnotnull
|
17
|
-
#
|
18
|
-
# == BooleanOperators
|
19
|
-
# and
|
20
|
-
# or
|
21
|
-
# not
|
22
|
-
#
|
23
|
-
# == DereferenceOperators
|
24
|
-
# tupledereference.
|
25
|
-
# mapdereference#
|
26
|
-
#
|
27
|
-
# == SignOperators
|
28
|
-
# positive+
|
29
|
-
# negative-
|
30
|
-
#
|
31
|
-
# == CastOperators
|
32
|
-
# (type)$0
|
33
|
-
# (type)alias
|
34
|
-
#
|
35
|
-
# == ArithmeticOperators
|
36
|
-
# addition+
|
37
|
-
# subtraction-
|
38
|
-
# multiplication*
|
39
|
-
# division/
|
40
|
-
# modulo%
|
41
|
-
# bincond?
|
42
|
-
#
|
43
|
-
# == ComparisonOperators
|
44
|
-
# Equal==
|
45
|
-
# notequal!=
|
46
|
-
# lessthan<
|
47
|
-
# greaterthan>
|
48
|
-
# lessthanorequalto<=
|
49
|
-
# greaterthanorequalto>=
|
50
|
-
# patternmatchingmatches
|
@@ -1,85 +0,0 @@
|
|
1
|
-
require 'wukong/and_pig/generate/variable_inflections'
|
2
|
-
|
3
|
-
module Wukong
  module AndPig

    # NOTE(review): mattr_accessor is not core ruby -- presumably supplied by
    # ActiveSupport or a Wukong extension loaded before this file; confirm.

    # when false, PigVar.rem emits nothing
    mattr_accessor :comments
    self.comments = true
    # send output to stdout or to captured pig instance
    mattr_accessor :emit_dest
    # full pathname to the pig executable
    PIG_EXECUTABLE = '/usr/local/bin/pig'

    #
    # Shut down the captured pig process, if one was started.
    # Never closes $stdout and never starts pig just to close it.
    #
    def self.finish
      PigVar.reset_pig_in_poke!
    end

    #
    # All the embarrassing magick to pretend ruby symbols are pig relations
    #
    class PigVar

      #
      # Output a command, appending ' ;' unless +semicolon+ is false.
      #
      # When Wukong::AndPig.emit_dest is :captured the command is written to
      # the pig process and one line of its output is echoed; otherwise the
      # command is printed to stdout.
      #
      def self.emit(cmd, semicolon = true)
        cmd = cmd + ' ;' if semicolon
        if Wukong::AndPig.emit_dest == :captured
          pig_in_poke.puts(cmd)
          pig_in_poke.flush
          puts pig_in_poke.gets
        else
          puts(cmd)
        end
      end

      # Emit "relation = <rval.cmd>" and return rval so calls can be chained
      def self.emit_setter(relation, rval)
        emit "%-23s\t= %s" % [relation, rval.cmd]
        rval
      end

      # Emit "IMPERATIVE arg arg ..." and return the first argument
      def self.emit_imperative(imperative, *rest)
        cmd_part = "%-14s \t" % imperative
        arg_part = rest.map { |arg| "%14s" % arg.to_s }.join(" \t")
        emit cmd_part + arg_part
        rest.first
      end

      #
      # The IO that commands are sent to, memoized on first use: a pipe to the
      # pig executable when emit_dest is :captured, otherwise $stdout.
      #
      def self.pig_in_poke
        return @pig_in_poke if @pig_in_poke
        @pig_in_poke =
          if Wukong::AndPig.emit_dest == :captured
            IO.popen(PIG_EXECUTABLE, "w+").tap { |pipe| pipe.sync = true }
          else
            $stdout
          end
      end

      #
      # Reset the captured pig instance
      #
      # Forgets the memoized IO and closes it when it is a pig pipe.  $stdout
      # is left open, and errors raised while closing are ignored.
      #
      def self.reset_pig_in_poke!
        handle = @pig_in_poke
        @pig_in_poke = nil
        return nil if handle.nil? || handle.equal?($stdout) || handle.equal?(STDOUT)
        begin
          handle.close
        rescue IOError, SystemCallError
          # best effort: the pipe may already be closed or the process gone
        end
        nil
      end

      # Emit the assignment of this variable's command to its relation
      def set!
        self.class.emit_setter(relation, self)
      end

      #
      # Emit a comment
      # skips if Wukong::AndPig.comments is false
      #
      # Each line start (and any leading '# ') is replaced with a pig
      # comment marker on a new line.
      #
      def self.rem(comment)
        return unless Wukong::AndPig.comments
        PigVar.emit comment.gsub(/(^|\n)(#([\t ]|$))?/, "\n-- "), false
      end
    end

  end
end
|
84
|
-
|
85
|
-
|
@@ -1,82 +0,0 @@
|
|
1
|
-
String.class_eval do
  #
  # Generate relation name from a handle
  #
  # Delegates to +camelize+, which is not core ruby -- presumably
  # ActiveSupport's String#camelize; confirm it is loaded before use.
  #
  def relationize
    camelize
  end
end
|
7
|
-
Symbol.class_eval do
  #
  # Generate relation name from a handle
  #
  # Converts the symbol to a string and defers to String#relationize.
  #
  def relationize
    to_s.relationize
  end
end
|
15
|
-
|
16
|
-
Object.class_eval do
  # Default pig type of an arbitrary object: simply its class.
  def typify
    self.class
  end

  #
  # A symbol handle for this object: its string form, underscored, with any
  # leading "path/" segments removed.
  #
  # Relies on String#underscore, which is not core ruby -- presumably
  # ActiveSupport's; confirm it is loaded before use.
  #
  def symbolize
    to_s.underscore.gsub(%r{.*/}, '').to_sym
  end
end
|
23
|
-
|
24
|
-
# Date is referenced below; make sure the stdlib class is loaded.
require 'date'

#
# Pig type names for the ruby classes used as field types.
# These are class-level methods: Integer.typify, not 1.typify.
#
class << Integer ; def typify() 'int'       end ; end
class << Float   ; def typify() 'float'     end ; end
class << String  ; def typify() 'chararray' end ; end
class << Symbol  ; def typify() self        end ; end
class << Date    ; def typify() 'long'      end ; end

# Bignum only gets its own 'long' mapping on rubies where it is a class of its
# own.  Where it is merely an alias for Integer, defining the method would
# overwrite Integer's 'int'; where the constant is gone, referencing it raises
# NameError.
if Object.const_defined?(:Bignum) && !Object.const_get(:Bignum).equal?(Integer)
  class << Object.const_get(:Bignum) ; def typify() 'long' end ; end
end
|
30
|
-
|
31
|
-
# Array.class_eval do
|
32
|
-
# def typify()
|
33
|
-
# "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
|
34
|
-
# end
|
35
|
-
# end
|
36
|
-
# class Tuple
|
37
|
-
# attr_accessor :contents
|
38
|
-
# def initialize *args
|
39
|
-
# self.contents = args
|
40
|
-
# end
|
41
|
-
# def typify
|
42
|
-
# "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
|
43
|
-
# end
|
44
|
-
# #
|
45
|
-
# # Sugar for creating a new bag. The following are equivalent:
|
46
|
-
# #
|
47
|
-
# # Bag[:foo]
|
48
|
-
# # Bag.new :foo
|
49
|
-
# #
|
50
|
-
# def self.[] *args
|
51
|
-
# new *args
|
52
|
-
# end
|
53
|
-
# end
|
54
|
-
|
55
|
-
#
# Mixin giving a struct-like class a pig "bag" type string.
# Including it extends the class with ClassMethods.
#
module BagMethods
  module ClassMethods
    #
    # Pig type string --
    # the pig type strings for each sub-element.
    #
    # Pairs each entry of +members+ with the matching entry of +mtypes+ and
    # renders "{ name: type, name: type }" using each type's +typify+.
    #
    def typify
      fields = members.zip(mtypes).map do |attr, mtype|
        "%s: %s" % [attr, mtype.typify]
      end
      "{ #{fields.join(', ')} }"
    end
  end

  # Hook: add the class-level methods to whatever includes this module
  def self.included(base)
    base.extend ClassMethods
  end
end
|
72
|
-
|
73
|
-
#
# A TypedStruct whose generated class also knows its pig bag type string
# (see BagMethods).
#
class Bag < TypedStruct
  #
  # Build the struct class as TypedStruct does, mix BagMethods into it, and
  # return the class.
  #
  def self.new(*args)
    bag = super(*args)
    bag.class_eval { include BagMethods }
    # Return the class explicitly rather than relying on what include returns.
    bag
  end

  # Sugar for creating a new bag: Bag[...] is the same as Bag.new(...)
  def self.[](*args)
    new(*args)
  end
end
|
82
|
-
|
data/lib/wukong/and_pig/junk.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module Wukong
  module AndPig

    #
    # Load the main class definitions
    #
    # Prints the contents of init_load.pig from PIG_DEFS_DIR (a constant this
    # file does not define) to stdout.
    #
    def self.init_load
      # File.read closes the file; File.open(...).read left the handle open.
      puts File.read(PIG_DEFS_DIR+"/init_load.pig")
    end

    #
    # OK we're going to cheat here:
    # just cat the file in, and treat it as a scalar
    #
    # NOTE(review): currently a stub -- the hadoop call is commented out and a
    # fixed value is returned regardless of +path+.
    #
    def load_scalar(path)
      # var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
      "636"
    end

    #
    # Count the distinct values of +attr+ within each +group_by+ group,
    # storing the result in +dest_rel+ as n_<attr>.  Intermediate steps go to
    # temp_rel(dest_rel).
    #
    def count_distinct(dest_rel, attr, group_by)
      distincted = generate(temp_rel(dest_rel), attr).
        distinct(temp_rel(dest_rel), :parallel => 10)
      distincted.
        group(temp_rel(dest_rel), group_by).
        foreach(dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
    end

    #
    # Group a relation into bins, and return the counts for each bin
    # * dest_rel - Relation to store the result in
    # * bin_attr - attribute being binned; names the <bin_attr>_bin and
    #   <bin_attr>_count output fields
    # * bin_expr - pig expression computing the bin (defaults to bin_attr)
    #
    def histogram(dest_rel, bin_attr, bin_expr = nil)
      bin_expr ||= bin_attr
      bin_name = "#{bin_attr}_bin"
      binned = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
      binned.
        group(temp_rel(dest_rel), :by => bin_name).
        foreach(dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
    end

  end
end
|
@@ -1,8 +0,0 @@
|
|
1
|
-
require 'wukong/and_pig/operators/evaluators'
|
2
|
-
require 'wukong/and_pig/operators/foreach'
|
3
|
-
require 'wukong/and_pig/operators/groupies'
|
4
|
-
require 'wukong/and_pig/operators/load_store'
|
5
|
-
require 'wukong/and_pig/operators/meta'
|
6
|
-
require 'wukong/and_pig/operators/relational'
|
7
|
-
require 'wukong/and_pig/operators/file_methods'
|
8
|
-
require 'wukong/and_pig/operators/compound'
|
@@ -1,29 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# The FOREACH relational operator
|
3
|
-
#
|
4
|
-
module Wukong
  module AndPig
    class PigVar
      #
      # Select all elements in the source relation that match on the selecting relation,
      # creating a relation with the same type as the source relation.
      #
      # For example,
      #
      #   PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
      #
      # returns a relation IsolatedCvals, whose type is identical to
      # MyComplicatedValues' type, with only the elements having an id also
      # present in MyIds.
      #
      # lval::       handle for the resulting relation
      # on::         the selecting relation, joined on +on_field+
      # from::       the source relation, joined on +from_field+
      # options::    only :parallel is read; it is passed through to the join.
      #              The caller's hash is not modified.
      #
      def self.isolate(lval, on, on_field, from, from_field, options = {})
        joined = join anon(lval), on => on_field, from => from_field, :parallel => options[:parallel]
        source_fields = PV[from].fields.map { |field| [from, field] }
        isolated = joined.generate(lval, { "'#{from}'" => :rsrc }, *source_fields)
        # NOTE(review): fields are looked up via PV[from] but klass is taken
        # from +from+ itself -- confirm whether PV[from].klass was intended.
        isolated.klass = from.klass
        isolated
      end

    end
  end
end
|