wukong 0.1.4 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/INSTALL.textile +89 -0
  2. data/README.textile +41 -74
  3. data/docpages/INSTALL.textile +94 -0
  4. data/{doc → docpages}/LICENSE.textile +0 -0
  5. data/{doc → docpages}/README-wulign.textile +6 -0
  6. data/docpages/UsingWukong-part1-get_ready.textile +17 -0
  7. data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
  8. data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
  9. data/docpages/_config.yml +39 -0
  10. data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
  11. data/{doc → docpages}/code/api_response_example.txt +0 -0
  12. data/{doc → docpages}/code/parser_skeleton.rb +0 -0
  13. data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
  14. data/docpages/favicon.ico +0 -0
  15. data/docpages/gem.css +16 -0
  16. data/docpages/hadoop-tips.textile +83 -0
  17. data/docpages/index.textile +90 -0
  18. data/docpages/intro.textile +8 -0
  19. data/docpages/moreinfo.textile +174 -0
  20. data/docpages/news.html +24 -0
  21. data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
  22. data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
  23. data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
  24. data/docpages/tutorial.textile +283 -0
  25. data/docpages/usage.textile +195 -0
  26. data/docpages/wutils.textile +263 -0
  27. data/wukong.gemspec +80 -50
  28. metadata +87 -54
  29. data/doc/INSTALL.textile +0 -41
  30. data/doc/README-tutorial.textile +0 -163
  31. data/doc/README-wutils.textile +0 -128
  32. data/doc/TODO.textile +0 -61
  33. data/doc/UsingWukong-part1-setup.textile +0 -2
  34. data/doc/UsingWukong-part2-scraping.textile +0 -2
  35. data/doc/hadoop-nfs.textile +0 -51
  36. data/doc/hadoop-setup.textile +0 -29
  37. data/doc/index.textile +0 -124
  38. data/doc/links.textile +0 -42
  39. data/doc/usage.textile +0 -102
  40. data/doc/utils.textile +0 -48
  41. data/examples/and_pig/sample_queries.rb +0 -128
  42. data/lib/wukong/and_pig.rb +0 -62
  43. data/lib/wukong/and_pig/README.textile +0 -12
  44. data/lib/wukong/and_pig/as.rb +0 -37
  45. data/lib/wukong/and_pig/data_types.rb +0 -30
  46. data/lib/wukong/and_pig/functions.rb +0 -50
  47. data/lib/wukong/and_pig/generate.rb +0 -85
  48. data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
  49. data/lib/wukong/and_pig/junk.rb +0 -51
  50. data/lib/wukong/and_pig/operators.rb +0 -8
  51. data/lib/wukong/and_pig/operators/compound.rb +0 -29
  52. data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
  53. data/lib/wukong/and_pig/operators/execution.rb +0 -15
  54. data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
  55. data/lib/wukong/and_pig/operators/foreach.rb +0 -98
  56. data/lib/wukong/and_pig/operators/groupies.rb +0 -212
  57. data/lib/wukong/and_pig/operators/load_store.rb +0 -65
  58. data/lib/wukong/and_pig/operators/meta.rb +0 -42
  59. data/lib/wukong/and_pig/operators/relational.rb +0 -129
  60. data/lib/wukong/and_pig/pig_struct.rb +0 -48
  61. data/lib/wukong/and_pig/pig_var.rb +0 -95
  62. data/lib/wukong/and_pig/symbol.rb +0 -29
  63. data/lib/wukong/and_pig/utils.rb +0 -0
@@ -1,62 +0,0 @@
1
- require 'wukong/and_pig/pig_var'
2
- require 'wukong/and_pig/as'
3
- require 'wukong/and_pig/functions'
4
- require 'wukong/and_pig/operators'
5
- require 'wukong/and_pig/data_types'
6
- require 'wukong/and_pig/pig_struct'
7
- require 'wukong/and_pig/generate'
8
- require 'wukong/and_pig/symbol'
9
- require 'wukong/and_pig/utils'
10
-
11
- module Wukong
12
- #
13
- # Wukong::AndPig lets you generate and run pig[http://hadoop.apache.org/pig]
14
- # code from within ruby (and interactively, from the +irb+ console).
15
- #
16
- # It uses the same typed structures you've defined for Wukong to create
17
- # pig-types aware commands. For example, the Wukong class
18
- #
19
- # class Customer < TypedStruct.new( [:id, Integer],
20
- # [:name, String], [:postal_code, Integer], [:balance, Float] )
21
- # end
22
- #
23
- # will generate a LOAD command for pig as
24
- #
25
- # Customer1.pig_load('q4_reports/customers.tsv').set!
26
- # # => Q4ReportsCustomers2 = LOAD 'q4_reports/customers.tsv'
27
- # AS (id: int, name: chararray, postal_code: int, balance: float) ;
28
- #
29
- # You can write anonymous chains
30
- #
31
- # q1 = Customer1.
32
- # pig_load('q4_reports/customers.tsv').set!.
33
- # distinct.set! ;
34
- # q1.
35
- # group(:by => :postal_code).set!.
36
- # generate([:group, :postal_code], ["COUNT(#{q1.relation})", :customers_per_zip]).set!.
37
- # store!
38
- #
39
- # Q4ReportsCustomers35 = LOAD 'q4_reports/customers.tsv' AS (id: int,name: chararray,postal_code: int,balance: float) ;
40
- # Q4ReportsCustomers36 = DISTINCT Q4ReportsCustomers35 ;
41
- # Q4ReportsCustomers37 = GROUP Q4ReportsCustomers36 BY postal_code ;
42
- # Q4ReportsCustomers38 = FOREACH Q4ReportsCustomers37 GENERATE
43
- # group AS postal_code,
44
- # COUNT(Q4ReportsCustomers36) AS customers_per_zip ;
45
- #
46
- # ---------------------------------------------------------------------------
47
- #
48
- # Note on pig:
49
- #
50
- # 1) Reverse the order of your tables in your join statement. Pig always
51
- # streams the keys of the last input, (materializing in memory the keys of
52
- # the first), so if one of your inputs has fewer instances of a given key
53
- # this may help.
54
- #
55
- # 2) Reduce the number of maps and reducers per machine and give it all the
56
- # memory you can.
57
- #
58
- #
59
- module AndPig
60
- end
61
- end
62
-
@@ -1,12 +0,0 @@
1
- Wukong::AndPig is a small library to more easily generate code for the
2
- "Pig":http://hadoop.apache.org/pig data analysis language.
3
-
4
- Wukong::AndPig lets you use the structs from your Wukong scripts to
5
- generate Pig instructions that know their types and structure -- even through
6
- multiple pig commands. For example, if you use +FOREACH ... GENERATE+ to select
7
- only a few of those fields, Wukong::AndPig will know that the result has only
8
- those fields.
9
-
10
- We're still trying to figure out if this is a stupid and crazy idea, or just a
11
- crazy idea: Yeah, we're using a functional/OO scripting language to generate code for an
12
- imperative query language that generates Java code for ad-hoc map-reduce operations.
@@ -1,37 +0,0 @@
1
- class AS
2
- attr_accessor :expr, :name, :type, :ref, :options
3
- def initialize expr, name=nil, type=nil, ref=nil, *option_flags
4
- case expr
5
- when AS
6
- self.expr = expr.expr
7
- self.name = expr.name
8
- self.type = expr.type
9
- self.ref = expr.ref
10
- self.options = expr.options
11
- end
12
- self.expr ||= expr
13
- self.name = name if name
14
- self.type = type if type
15
- self.ref = ref if ref
16
- self.options ||= { }
17
- option_flags.each{|option| self.options[option] = true }
18
- end
19
-
20
- def to_s
21
- clause = "%-30s \t" % [ref, expr].compact.join('::')
22
- if name
23
- clause << "AS #{name}" unless options[:skip_name]
24
- clause << ":#{type.typify}" unless ((!type) || options[:skip_type])
25
- end
26
- clause
27
- end
28
-
29
- def self.[] *args
30
- self.new *args
31
- end
32
-
33
- # Useful for feeding back into TypedStruct
34
- def name_type
35
- [name, type]
36
- end
37
- end
@@ -1,30 +0,0 @@
1
- # == SimpleDataTypes ==
2
- # int
3
- # long
4
- # double
5
- # arrays
6
- # chararray
7
- # bytearray
8
- #
9
- # == ComplexDataTypes ==
10
- # tuple
11
- # bag
12
- # map
13
-
14
- module Wukong
15
- module AndPig
16
- class PigVar
17
-
18
- end
19
- end
20
- end
21
-
22
- # class ScalarInteger < TypedStruct.new [
23
- # [:count, Integer ],
24
- # ]
25
- # include Wukong::AndPig::PigEmitter
26
- # def self.load_scalar path
27
- # var = super path
28
- # var.to_i
29
- # end
30
- # end
@@ -1,50 +0,0 @@
1
-
2
- # == Built-in Functions
3
- # EvalFunctions
4
- # AVG
5
- # CONCAT
6
- # COUNT
7
- # DIFF
8
- # MIN
9
- # MAX
10
- # SIZE
11
- # SUM
12
- # TOKENIZE
13
-
14
- # == NullOperators
15
- # isnull
16
- # isnotnull
17
- #
18
- # == BooleanOperators
19
- # and
20
- # or
21
- # not
22
- #
23
- # == DereferenceOperators
24
- # tupledereference.
25
- # mapdereference#
26
- #
27
- # == SignOperators
28
- # positive+
29
- # negative-
30
- #
31
- # == CastOperators
32
- # (type)$0
33
- # (type)alias
34
- #
35
- # == ArithmeticOperators
36
- # addition+
37
- # subtraction-
38
- # multiplication*
39
- # division/
40
- # modulo%
41
- # bincond?
42
- #
43
- # == ComparisonOperators
44
- # Equal==
45
- # notequal!=
46
- # lessthan<
47
- # greaterthan>
48
- # lessthanorequalto<=
49
- # greaterthanorequalto>=
50
- # patternmatchingmatches
@@ -1,85 +0,0 @@
1
- require 'wukong/and_pig/generate/variable_inflections'
2
-
3
- module Wukong
4
- module AndPig
5
-
6
- mattr_accessor :comments
7
- self.comments = true
8
- # send output to stdout or to captured pig instance
9
- mattr_accessor :emit_dest
10
- # full pathname to the pig executable
11
- PIG_EXECUTABLE = '/usr/local/bin/pig'
12
-
13
- def self.finish
14
- PigVar.pig_in_poke.close if PigVar.pig_in_poke.respond_to?(:close)
15
- end
16
-
17
- #
18
- # All the embarrassing magick to pretend ruby symbols are pig relations
19
- #
20
- class PigVar
21
-
22
- # Output a command
23
- def self.emit cmd, semicolon=true
24
- cmd = cmd + ' ;' if semicolon
25
- case Wukong::AndPig.emit_dest
26
- when :captured
27
- pig_in_poke.puts(cmd)
28
- pig_in_poke.flush
29
- puts pig_in_poke.gets
30
- else
31
- puts(cmd)
32
- end
33
- end
34
-
35
- # generate the code
36
- def self.emit_setter relation, rval
37
- emit "%-23s\t= %s" % [relation, rval.cmd]
38
- rval
39
- end
40
-
41
- # generate the code
42
- def self.emit_imperative imperative, *rest
43
- cmd_part = "%-14s \t" % imperative
44
- arg_part = rest.map{|s| "%14s" % s.to_s }.join(" \t")
45
- emit cmd_part+arg_part
46
- rest.first
47
- end
48
-
49
- def self.pig_in_poke
50
- return @pig_in_poke if @pig_in_poke
51
- case Wukong::AndPig.emit_dest
52
- when :captured
53
- @pig_in_poke = IO.popen(PIG_EXECUTABLE, "w+")
54
- @pig_in_poke.sync = true
55
- @pig_in_poke
56
- else @pig_in_poke = $stdout
57
- end
58
- end
59
-
60
- #
61
- # Reset the captured pig instance
62
- #
63
- def self.reset_pig_in_poke!
64
- begin pig_in_poke.close ; rescue nil ; end
65
- @pig_in_poke = nil
66
- end
67
-
68
- def set!
69
- self.class.emit_setter(relation, self)
70
- end
71
-
72
- #
73
- # Emit a comment
74
- # skips if Wukong::AndPig.comments is false
75
- #
76
- def self.rem comment
77
- return unless Wukong::AndPig.comments
78
- PigVar.emit comment.gsub(/(^|\n)(#([\t ]|$))?/, "\n-- "), false
79
- end
80
- end
81
-
82
- end
83
- end
84
-
85
-
@@ -1,82 +0,0 @@
1
- String.class_eval do
2
- #
3
- # Generate relation name from a handle
4
- #
5
- def relationize() camelize end
6
- end
7
- Symbol.class_eval do
8
- #
9
- # Generate relation name from a handle
10
- #
11
- def relationize
12
- to_s.relationize
13
- end
14
- end
15
-
16
- Object.class_eval do
17
- def typify() self.class ; end
18
-
19
- def symbolize
20
- self.to_s.underscore.gsub(%r{.*/}, '').to_sym
21
- end
22
- end
23
-
24
- class << Integer ; def typify() 'int' end ; end
25
- class << Bignum ; def typify() 'long' end ; end
26
- class << Float ; def typify() 'float' end ; end
27
- class << String ; def typify() 'chararray' end ; end
28
- class << Symbol ; def typify() self end ; end
29
- class << Date ; def typify() 'long' end ; end
30
-
31
- # Array.class_eval do
32
- # def typify()
33
- # "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
34
- # end
35
- # end
36
- # class Tuple
37
- # attr_accessor :contents
38
- # def initialize *args
39
- # self.contents = args
40
- # end
41
- # def typify
42
- # "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
43
- # end
44
- # #
45
- # # Sugar for creating a new bag. The following are equivalent:
46
- # #
47
- # # Bag[:foo]
48
- # # Bag.new :foo
49
- # #
50
- # def self.[] *args
51
- # new *args
52
- # end
53
- # end
54
-
55
- module BagMethods
56
- module ClassMethods
57
- #
58
- # Pig type string --
59
- # the pig type strings for each sub-element.
60
- #
61
- def typify
62
- vars_str = members.zip(mtypes).map do |attr, mtype|
63
- "%s: %s" % [attr, mtype.typify]
64
- end
65
- "{ #{vars_str.join(', ')} }"
66
- end
67
- end
68
- def self.included base
69
- base.extend ClassMethods
70
- end
71
- end
72
-
73
- class Bag < TypedStruct
74
- def self.new *args
75
- bag = super *args
76
- bag.class_eval{ include BagMethods }
77
- end
78
- def self.[] *args
79
- new *args
80
- end
81
- end
82
-
@@ -1,51 +0,0 @@
1
-
2
-
3
- module Wukong
4
- module AndPig
5
-
6
- #
7
- # Load the main class definitions
8
- #
9
- def self.init_load
10
- puts File.open(PIG_DEFS_DIR+"/init_load.pig").read
11
- end
12
-
13
-
14
-
15
-
16
- #
17
- # OK we're going to cheat here:
18
- # just cat the file in, and treat it as a scalar
19
- #
20
- def load_scalar path
21
- # var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
22
- var = "636"
23
- end
24
-
25
-
26
-
27
- def count_distinct dest_rel, attr, group_by
28
- distincted =
29
- generate(temp_rel(dest_rel), attr).
30
- distinct(temp_rel(dest_rel), :parallel => 10)
31
- distincted.
32
- group( temp_rel(dest_rel), group_by).
33
- foreach( dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
34
- end
35
-
36
- #
37
- # Group a relation into bins, and return the counts for each bin
38
- # * dest_rel - Relation to store
39
- # {bin,
40
- #
41
- def histogram dest_rel, bin_attr, bin_expr=nil
42
- bin_expr ||= bin_attr
43
- bin_name = "#{bin_attr}_bin"
44
- binned = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
45
- binned. group( temp_rel(dest_rel), :by => bin_name).
46
- foreach( dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
47
- end
48
-
49
-
50
- end
51
- end
@@ -1,8 +0,0 @@
1
- require 'wukong/and_pig/operators/evaluators'
2
- require 'wukong/and_pig/operators/foreach'
3
- require 'wukong/and_pig/operators/groupies'
4
- require 'wukong/and_pig/operators/load_store'
5
- require 'wukong/and_pig/operators/meta'
6
- require 'wukong/and_pig/operators/relational'
7
- require 'wukong/and_pig/operators/file_methods'
8
- require 'wukong/and_pig/operators/compound'
@@ -1,29 +0,0 @@
1
- #
2
- # The FOREACH relational operator
3
- #
4
- module Wukong
5
- module AndPig
6
- class PigVar
7
- #
8
- # Select all elements in the source relation that match on the selecting relation,
9
- # creating a relation with the same type as the source relation.
10
- #
11
- # For example,
12
- #
13
- # PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
14
- #
15
- # returns a relation IsolatedCvals, whose type is identical to
16
- # MyComplicatedValues' type, with only the elements having an id also
17
- # present in MyIds.
18
- #
19
- #
20
- def self.isolate lval, on, on_field, from, from_field, options={ }
21
- joined = join anon(lval), on => on_field, from => from_field, :parallel => options.delete(:parallel)
22
- isolated = joined.generate lval, { "'#{from}'" => :rsrc}, *PV[from].fields.map{|field| [from, field]}
23
- isolated.klass = from.klass
24
- isolated
25
- end
26
-
27
- end
28
- end
29
- end