wukong 0.1.4 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/INSTALL.textile +89 -0
  2. data/README.textile +41 -74
  3. data/docpages/INSTALL.textile +94 -0
  4. data/{doc → docpages}/LICENSE.textile +0 -0
  5. data/{doc → docpages}/README-wulign.textile +6 -0
  6. data/docpages/UsingWukong-part1-get_ready.textile +17 -0
  7. data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
  8. data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
  9. data/docpages/_config.yml +39 -0
  10. data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
  11. data/{doc → docpages}/code/api_response_example.txt +0 -0
  12. data/{doc → docpages}/code/parser_skeleton.rb +0 -0
  13. data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
  14. data/docpages/favicon.ico +0 -0
  15. data/docpages/gem.css +16 -0
  16. data/docpages/hadoop-tips.textile +83 -0
  17. data/docpages/index.textile +90 -0
  18. data/docpages/intro.textile +8 -0
  19. data/docpages/moreinfo.textile +174 -0
  20. data/docpages/news.html +24 -0
  21. data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
  22. data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
  23. data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
  24. data/docpages/tutorial.textile +283 -0
  25. data/docpages/usage.textile +195 -0
  26. data/docpages/wutils.textile +263 -0
  27. data/wukong.gemspec +80 -50
  28. metadata +87 -54
  29. data/doc/INSTALL.textile +0 -41
  30. data/doc/README-tutorial.textile +0 -163
  31. data/doc/README-wutils.textile +0 -128
  32. data/doc/TODO.textile +0 -61
  33. data/doc/UsingWukong-part1-setup.textile +0 -2
  34. data/doc/UsingWukong-part2-scraping.textile +0 -2
  35. data/doc/hadoop-nfs.textile +0 -51
  36. data/doc/hadoop-setup.textile +0 -29
  37. data/doc/index.textile +0 -124
  38. data/doc/links.textile +0 -42
  39. data/doc/usage.textile +0 -102
  40. data/doc/utils.textile +0 -48
  41. data/examples/and_pig/sample_queries.rb +0 -128
  42. data/lib/wukong/and_pig.rb +0 -62
  43. data/lib/wukong/and_pig/README.textile +0 -12
  44. data/lib/wukong/and_pig/as.rb +0 -37
  45. data/lib/wukong/and_pig/data_types.rb +0 -30
  46. data/lib/wukong/and_pig/functions.rb +0 -50
  47. data/lib/wukong/and_pig/generate.rb +0 -85
  48. data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
  49. data/lib/wukong/and_pig/junk.rb +0 -51
  50. data/lib/wukong/and_pig/operators.rb +0 -8
  51. data/lib/wukong/and_pig/operators/compound.rb +0 -29
  52. data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
  53. data/lib/wukong/and_pig/operators/execution.rb +0 -15
  54. data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
  55. data/lib/wukong/and_pig/operators/foreach.rb +0 -98
  56. data/lib/wukong/and_pig/operators/groupies.rb +0 -212
  57. data/lib/wukong/and_pig/operators/load_store.rb +0 -65
  58. data/lib/wukong/and_pig/operators/meta.rb +0 -42
  59. data/lib/wukong/and_pig/operators/relational.rb +0 -129
  60. data/lib/wukong/and_pig/pig_struct.rb +0 -48
  61. data/lib/wukong/and_pig/pig_var.rb +0 -95
  62. data/lib/wukong/and_pig/symbol.rb +0 -29
  63. data/lib/wukong/and_pig/utils.rb +0 -0
@@ -1,62 +0,0 @@
1
- require 'wukong/and_pig/pig_var'
2
- require 'wukong/and_pig/as'
3
- require 'wukong/and_pig/functions'
4
- require 'wukong/and_pig/operators'
5
- require 'wukong/and_pig/data_types'
6
- require 'wukong/and_pig/pig_struct'
7
- require 'wukong/and_pig/generate'
8
- require 'wukong/and_pig/symbol'
9
- require 'wukong/and_pig/utils'
10
-
11
- module Wukong
12
- #
13
- # Wukong::AndPig lets you generate and run pig[http://hadoop.apache.org/pig]
14
- # code from within ruby (and interactively, from the +irb+ console).
15
- #
16
- # It uses the same typed structures you've defined for Wukong to create
17
- # pig-types aware commands. For example, the Wukong class
18
- #
19
- # class Customer < TypedStruct.new( [:id, Integer],
20
- # [:name, String], [:postal_code, Integer], [:balance, Float] )
21
- # end
22
- #
23
- # will generate a LOAD command for pig as
24
- #
25
- # Customer1.pig_load('q4_reports/customers.tsv').set!
26
- # # => Q4ReportsCustomers2 = LOAD 'q4_reports/customers.tsv'
27
- # AS (id: int, name: chararray, postal_code: int, balance: float) ;
28
- #
29
- # You can write anonymous chains
30
- #
31
- # q1 = Customer1.
32
- # pig_load('q4_reports/customers.tsv').set!.
33
- # distinct.set! ;
34
- # q1.
35
- # group(:by => :postal_code).set!.
36
- # generate([:group, :postal_code], ["COUNT(#{q1.relation})", :customers_per_zip]).set!.
37
- # store!
38
- #
39
- # Q4ReportsCustomers35 = LOAD 'q4_reports/customers.tsv' AS (id: int,name: chararray,postal_code: int,balance: float) ;
40
- # Q4ReportsCustomers36 = DISTINCT Q4ReportsCustomers35 ;
41
- # Q4ReportsCustomers37 = GROUP Q4ReportsCustomers36 BY postal_code ;
42
- # Q4ReportsCustomers38 = FOREACH Q4ReportsCustomers37 GENERATE
43
- # group AS postal_code,
44
- # COUNT(Q4ReportsCustomers36) AS customers_per_zip ;
45
- #
46
- # ---------------------------------------------------------------------------
47
- #
48
- # Note on pig:
49
- #
50
- # 1) Reverse the order of your tables in your join statement. Pig always
51
- # streams the keys of the last input, (materializing in memory the keys of
52
- # the first), so if one of your inputs has fewer instances of a given key
53
- # this may help.
54
- #
55
- # 2) Reduce the number of maps and reducers per machine and give it all the
56
- # memory you can.
57
- #
58
- #
59
- module AndPig
60
- end
61
- end
62
-
@@ -1,12 +0,0 @@
1
- Wukong::AndPig is a small library to more easily generate code for the
2
- "Pig":http://hadoop.apache.org/pig data analysis language.
3
-
4
- Wukong::AndPig lets you use the structs from your Wukong scripts to
5
- generate Pig instructions that know their types and structure -- even through
6
- multiple pig commands. For example, if you use +FOREACH ... GENERATE+ to select
7
- only a few of those fields, Wukong::AndPig will know that the result has only
8
- those fields.
9
-
10
- We're still trying to figure out if this is a stupid and crazy idea, or just a
11
- crazy idea: Yeah, we're using a functional/OO scripting language to generate code for an
12
- imperative query language that generates Java code for ad-hoc map-reduce operations.
@@ -1,37 +0,0 @@
1
- class AS
2
- attr_accessor :expr, :name, :type, :ref, :options
3
- def initialize expr, name=nil, type=nil, ref=nil, *option_flags
4
- case expr
5
- when AS
6
- self.expr = expr.expr
7
- self.name = expr.name
8
- self.type = expr.type
9
- self.ref = expr.ref
10
- self.options = expr.options
11
- end
12
- self.expr ||= expr
13
- self.name = name if name
14
- self.type = type if type
15
- self.ref = ref if ref
16
- self.options ||= { }
17
- option_flags.each{|option| self.options[option] = true }
18
- end
19
-
20
- def to_s
21
- clause = "%-30s \t" % [ref, expr].compact.join('::')
22
- if name
23
- clause << "AS #{name}" unless options[:skip_name]
24
- clause << ":#{type.typify}" unless ((!type) || options[:skip_type])
25
- end
26
- clause
27
- end
28
-
29
- def self.[] *args
30
- self.new *args
31
- end
32
-
33
- # Useful for feeding back into TypedStruct
34
- def name_type
35
- [name, type]
36
- end
37
- end
@@ -1,30 +0,0 @@
1
- # == SimpleDataTypes ==
2
- # int
3
- # long
4
- # double
5
- # arrays
6
- # chararray
7
- # bytearray
8
- #
9
- # == ComplexDataTypes ==
10
- # tuple
11
- # bag
12
- # map
13
-
14
- module Wukong
15
- module AndPig
16
- class PigVar
17
-
18
- end
19
- end
20
- end
21
-
22
- # class ScalarInteger < TypedStruct.new [
23
- # [:count, Integer ],
24
- # ]
25
- # include Wukong::AndPig::PigEmitter
26
- # def self.load_scalar path
27
- # var = super path
28
- # var.to_i
29
- # end
30
- # end
@@ -1,50 +0,0 @@
1
-
2
- # == Built-in Functions
3
- # EvalFunctions
4
- # AVG
5
- # CONCAT
6
- # COUNT
7
- # DIFF
8
- # MIN
9
- # MAX
10
- # SIZE
11
- # SUM
12
- # TOKENIZE
13
-
14
- # == NullOperators
15
- # isnull
16
- # isnotnull
17
- #
18
- # == BooleanOperators
19
- # and
20
- # or
21
- # not
22
- #
23
- # == DereferenceOperators
24
- # tupledereference.
25
- # mapdereference#
26
- #
27
- # == SignOperators
28
- # positive+
29
- # negative-
30
- #
31
- # == CastOperators
32
- # (type)$0
33
- # (type)alias
34
- #
35
- # == ArithmeticOperators
36
- # addition+
37
- # subtraction-
38
- # multiplication*
39
- # division/
40
- # modulo%
41
- # bincond?
42
- #
43
- # == ComparisonOperators
44
- # Equal==
45
- # notequal!=
46
- # lessthan<
47
- # greaterthan>
48
- # lessthanorequalto<=
49
- # greaterthanorequalto>=
50
- # patternmatchingmatches
@@ -1,85 +0,0 @@
1
- require 'wukong/and_pig/generate/variable_inflections'
2
-
3
- module Wukong
4
- module AndPig
5
-
6
- mattr_accessor :comments
7
- self.comments = true
8
- # send output to stdout or to captured pig instance
9
- mattr_accessor :emit_dest
10
- # full pathname to the pig executable
11
- PIG_EXECUTABLE = '/usr/local/bin/pig'
12
-
13
- def self.finish
14
- PigVar.pig_in_poke.close if PigVar.pig_in_poke.respond_to?(:close)
15
- end
16
-
17
- #
18
- # All the embarrassing magick to pretend ruby symbols are pig relations
19
- #
20
- class PigVar
21
-
22
- # Output a command
23
- def self.emit cmd, semicolon=true
24
- cmd = cmd + ' ;' if semicolon
25
- case Wukong::AndPig.emit_dest
26
- when :captured
27
- pig_in_poke.puts(cmd)
28
- pig_in_poke.flush
29
- puts pig_in_poke.gets
30
- else
31
- puts(cmd)
32
- end
33
- end
34
-
35
- # generate the code
36
- def self.emit_setter relation, rval
37
- emit "%-23s\t= %s" % [relation, rval.cmd]
38
- rval
39
- end
40
-
41
- # generate the code
42
- def self.emit_imperative imperative, *rest
43
- cmd_part = "%-14s \t" % imperative
44
- arg_part = rest.map{|s| "%14s" % s.to_s }.join(" \t")
45
- emit cmd_part+arg_part
46
- rest.first
47
- end
48
-
49
- def self.pig_in_poke
50
- return @pig_in_poke if @pig_in_poke
51
- case Wukong::AndPig.emit_dest
52
- when :captured
53
- @pig_in_poke = IO.popen(PIG_EXECUTABLE, "w+")
54
- @pig_in_poke.sync = true
55
- @pig_in_poke
56
- else @pig_in_poke = $stdout
57
- end
58
- end
59
-
60
- #
61
- # Reset the captured pig instance
62
- #
63
- def self.reset_pig_in_poke!
64
- begin pig_in_poke.close ; rescue nil ; end
65
- @pig_in_poke = nil
66
- end
67
-
68
- def set!
69
- self.class.emit_setter(relation, self)
70
- end
71
-
72
- #
73
- # Emit a comment
74
- # skips if Wukong::AndPig.comments is false
75
- #
76
- def self.rem comment
77
- return unless Wukong::AndPig.comments
78
- PigVar.emit comment.gsub(/(^|\n)(#([\t ]|$))?/, "\n-- "), false
79
- end
80
- end
81
-
82
- end
83
- end
84
-
85
-
@@ -1,82 +0,0 @@
1
- String.class_eval do
2
- #
3
- # Generate relation name from a handle
4
- #
5
- def relationize() camelize end
6
- end
7
- Symbol.class_eval do
8
- #
9
- # Generate relation name from a handle
10
- #
11
- def relationize
12
- to_s.relationize
13
- end
14
- end
15
-
16
- Object.class_eval do
17
- def typify() self.class ; end
18
-
19
- def symbolize
20
- self.to_s.underscore.gsub(%r{.*/}, '').to_sym
21
- end
22
- end
23
-
24
- class << Integer ; def typify() 'int' end ; end
25
- class << Bignum ; def typify() 'long' end ; end
26
- class << Float ; def typify() 'float' end ; end
27
- class << String ; def typify() 'chararray' end ; end
28
- class << Symbol ; def typify() self end ; end
29
- class << Date ; def typify() 'long' end ; end
30
-
31
- # Array.class_eval do
32
- # def typify()
33
- # "{ #{ map{|f,t| "#{f}: #{t.typify}"} } }"
34
- # end
35
- # end
36
- # class Tuple
37
- # attr_accessor :contents
38
- # def initialize *args
39
- # self.contents = args
40
- # end
41
- # def typify
42
- # "bag { #{ contents.map{|f,t| "#{f}: #{t.typify}"} } }"
43
- # end
44
- # #
45
- # # Sugar for creating a new bag. The following are equivalent:
46
- # #
47
- # # Bag[:foo]
48
- # # Bag.new :foo
49
- # #
50
- # def self.[] *args
51
- # new *args
52
- # end
53
- # end
54
-
55
- module BagMethods
56
- module ClassMethods
57
- #
58
- # Pig type string --
59
- # the pig type strings for each sub-element.
60
- #
61
- def typify
62
- vars_str = members.zip(mtypes).map do |attr, mtype|
63
- "%s: %s" % [attr, mtype.typify]
64
- end
65
- "{ #{vars_str.join(', ')} }"
66
- end
67
- end
68
- def self.included base
69
- base.extend ClassMethods
70
- end
71
- end
72
-
73
- class Bag < TypedStruct
74
- def self.new *args
75
- bag = super *args
76
- bag.class_eval{ include BagMethods }
77
- end
78
- def self.[] *args
79
- new *args
80
- end
81
- end
82
-
@@ -1,51 +0,0 @@
1
-
2
-
3
- module Wukong
4
- module AndPig
5
-
6
- #
7
- # Load the main class definitions
8
- #
9
- def self.init_load
10
- puts File.open(PIG_DEFS_DIR+"/init_load.pig").read
11
- end
12
-
13
-
14
-
15
-
16
- #
17
- # OK we're going to cheat here:
18
- # just cat the file in, and treat it as a scalar
19
- #
20
- def load_scalar path
21
- # var = `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
22
- var = "636"
23
- end
24
-
25
-
26
-
27
- def count_distinct dest_rel, attr, group_by
28
- distincted =
29
- generate(temp_rel(dest_rel), attr).
30
- distinct(temp_rel(dest_rel), :parallel => 10)
31
- distincted.
32
- group( temp_rel(dest_rel), group_by).
33
- foreach( dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
34
- end
35
-
36
- #
37
- # Group a relation into bins, and return the counts for each bin
38
- # * dest_rel - Relation to store
39
- # {bin,
40
- #
41
- def histogram dest_rel, bin_attr, bin_expr=nil
42
- bin_expr ||= bin_attr
43
- bin_name = "#{bin_attr}_bin"
44
- binned = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
45
- binned. group( temp_rel(dest_rel), :by => bin_name).
46
- foreach( dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
47
- end
48
-
49
-
50
- end
51
- end
@@ -1,8 +0,0 @@
1
- require 'wukong/and_pig/operators/evaluators'
2
- require 'wukong/and_pig/operators/foreach'
3
- require 'wukong/and_pig/operators/groupies'
4
- require 'wukong/and_pig/operators/load_store'
5
- require 'wukong/and_pig/operators/meta'
6
- require 'wukong/and_pig/operators/relational'
7
- require 'wukong/and_pig/operators/file_methods'
8
- require 'wukong/and_pig/operators/compound'
@@ -1,29 +0,0 @@
1
- #
2
- # The FOREACH relational operator
3
- #
4
- module Wukong
5
- module AndPig
6
- class PigVar
7
- #
8
- # Select all elements in the source relation that match on the selecting relation,
9
- # creating a relation with the same type as the source relation.
10
- #
11
- # For example,
12
- #
13
- # PV.isolate :isolated_cvals, :my_ids, :id, :my_complicated_values, :id
14
- #
15
- # returns a relation IsolatedCvals, whose type is identical to
16
- # MyComplicatedValues' type, with only the elements having an id also
17
- # present in MyIds.
18
- #
19
- #
20
- def self.isolate lval, on, on_field, from, from_field, options={ }
21
- joined = join anon(lval), on => on_field, from => from_field, :parallel => options.delete(:parallel)
22
- isolated = joined.generate lval, { "'#{from}'" => :rsrc}, *PV[from].fields.map{|field| [from, field]}
23
- isolated.klass = from.klass
24
- isolated
25
- end
26
-
27
- end
28
- end
29
- end