mrflip-wukong 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+
5
+ module Size
6
+ #
7
+ # Feed the entire dataset through wc and sum the results
8
+ #
9
class Script < Wukong::Script
  # There is no point writing a wukong mapper for something a stock unix
  # command already does faster. A Wukong::Script subclass can override
  # map_command (or reduce_command) and return the complete command line.
  def map_command
    '/usr/bin/wc'
  end

  # Request a single reduce task so that all records go to one reducer.
  def default_options
    single_reducer = { :reduce_tasks => 1 }
    super.merge(single_reducer)
  end
end
24
+
25
+ #
26
+ # Sums the numeric value of each column in its input
27
+ #
28
class Reducer < Wukong::Streamer::Base
  # Running per-column totals; nil until the first record is processed.
  attr_accessor :sums

  # The unix +wc+ command separates its columns with runs of whitespace
  # rather than tabs, so split each line on whitespace.
  def recordize line
    trimmed = line.strip
    trimmed.split(/\s+/)
  end

  # Add each column of this record to the matching running total.
  # Missing totals (first record) count as zero.
  def process *vals
    running = sums || []
    self.sums = vals.each_with_index.map do |val, idx|
      val.to_i + running[idx].to_i
    end
  end

  # Consume the whole reduction input, then output the totals once.
  def stream *args
    super(*args)
    emit sums
  end
end
54
+ end
55
+
56
+ # Execute the script
57
+ Size::Script.new(
58
+ nil,
59
+ Size::Reducer
60
+ ).run
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong'
4
+
5
+ module WordCount
6
+ class Mapper < Wukong::Streamer::LineStreamer
7
#
# Split a string into its constituent words.
#
# This is pretty simpleminded:
# * downcase the string
# * Split at any non-alphanumeric boundary, including '_'
# * However, preserve the special cases of 's or 't at the end of a
#   word.
#
#   tokenize("Jim's dawg won't hunt: dawg_hunt error #3007a4")
#   # => ["jim's", "dawg", "won't", "hunt", "dawg", "hunt", "error", "3007a4"]
#
# Returns an empty array when +str+ is nil (or false).
#
def tokenize str
  return [] unless str
  # Every run of characters that is not a letter, digit or apostrophe
  # (hyphens and underscores included) becomes a single word break.
  spaced = str.downcase.gsub(/[^a-zA-Z0-9\']+/, ' ')
  # Keep the apostrophe of a trailing 's or 't: hide it behind '!' (which the
  # previous step already removed from the text), turn every other apostrophe
  # into a break, then restore the hidden ones.
  spaced = spaced.gsub(/(\w)\'([st])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
  # Splitting a stripped string on whitespace never yields empty strings, so
  # no further filtering is needed.
  spaced.strip.split(/\s+/)
end
32
+
33
+ #
34
+ # Emit each word in each line.
35
+ #
36
# Yield a [word, 1] pair for every word found in the line.
def process line
  words = tokenize(line)
  words.each do |word|
    yield [word, 1]
  end
end
39
+ end
40
+
41
+ #
42
+ # Accumulate the sum record-by-record:
43
+ #
44
class Reducer0 < Wukong::Streamer::Base
  # Number of records seen so far for the word currently being counted.
  attr_accessor :key_count

  # Count one record for +word+. When the word differs from the previous
  # one, first yield the finished [word, count] pair, then start a new count.
  #
  # Each record counts as one occurrence; +count+ itself is not read.
  def process word, count
    @last_word ||= word
    if @last_word == word
      # key_count is nil before the first record; nil.to_i is 0.
      self.key_count = key_count.to_i + 1
    else
      yield [ @last_word, key_count ]
      @last_word = word
      # The record that triggered the switch is the new word's first one.
      self.key_count = 1
    end
  end

  # Run the normal streaming loop, then flush the last word's count, which
  # no later record would otherwise trigger. Nothing is emitted when there
  # was no input at all.
  def stream *args
    super(*args)
    emit [ @last_word, key_count ] if @last_word
  end
end
59
+
60
+ #
61
+ # You can stack up all the values in a list then sum them at once:
62
+ #
63
+ require 'active_support/core_ext/enumerable'
64
class Reducer1 < Wukong::Streamer::ListReducer
  # Yield [key, total], where total is the sum of the last field of every
  # record stacked up in +values+, each read as an integer.
  def finalize
    counts = values.map { |record| record.last.to_i }
    yield [ key, counts.sum ]
  end
end
69
+
70
+ #
71
+ # A bit kinder to your memory manager: accumulate the sum record-by-record:
72
+ #
73
class Reducer2 < Wukong::Streamer::AccumulatingReducer
  # Number of records accumulated for the current key.
  attr_accessor :key_count

  # Reset the count to zero.
  def start!(*args)
    self.key_count = 0
  end

  # Count one more record.
  def accumulate(*args)
    self.key_count += 1
  end

  # Yield the [key, count] pair.
  def finalize
    yield [ key, key_count ]
  end
end
81
+
82
+ #
83
+ # ... easiest of all, though: this is common enough that it's already included
84
+ #
85
+ require 'wukong/streamer/count_keys'
86
# Adds nothing of its own: all behavior is inherited from CountKeys.
class Reducer3 < Wukong::Streamer::CountKeys; end
88
+
89
+ end
90
+
91
+ # Execute the script
92
+ Wukong::Script.new(
93
+ WordCount::Mapper,
94
+ WordCount::Reducer1
95
+ ).run
@@ -0,0 +1,11 @@
1
+ require 'wukong/boot'
2
+ require 'wukong/extensions'
3
+ require 'wukong/datatypes'
4
+ require 'wukong/logger'
5
+ require 'wukong/bad_record'
6
+ autoload :TypedStruct, 'wukong/typed_struct'
7
# Registers the Wukong constants that are loaded on first reference rather
# than up front.
module Wukong
  {
    :Dfs      => 'wukong/dfs',
    :Script   => 'wukong/script',
    :Streamer => 'wukong/streamer'
  }.each do |const_name, path|
    autoload const_name, path
  end
end
@@ -0,0 +1,62 @@
1
+ require 'wukong/and_pig/pig_var'
2
+ require 'wukong/and_pig/as'
3
+ require 'wukong/and_pig/functions'
4
+ require 'wukong/and_pig/operators'
5
+ require 'wukong/and_pig/data_types'
6
+ require 'wukong/and_pig/pig_struct'
7
+ require 'wukong/and_pig/generate'
8
+ require 'wukong/and_pig/symbol'
9
+ require 'wukong/and_pig/utils'
10
+
11
+ module Wukong
12
+ #
13
+ # Wukong::AndPig lets you generate and run pig[http://hadoop.apache.org/pig]
14
+ # code from within ruby (and interactively, from the +irb+ console).
15
+ #
16
+ # It uses the same typed structures you've defined for Wukong to create
17
+ # pig-types aware commands. For example, the Wukong class
18
+ #
19
+ # class Customer < TypedStruct.new( [:id, Integer],
20
+ # [:name, String], [:postal_code, Integer], [:balance, Float] )
21
+ # end
22
+ #
23
+ # will generate a LOAD command for pig as
24
+ #
25
+ # Customer1.pig_load('q4_reports/customers.tsv').set!
26
+ # # => Q4ReportsCustomers2 = LOAD 'q4_reports/customers.tsv'
27
+ # AS (id: int, name: chararray, postal_code: int, balance: float) ;
28
+ #
29
+ # You can write anonymous chains
30
+ #
31
+ # q1 = Customer1.
32
+ # pig_load('q4_reports/customers.tsv').set!.
33
+ # distinct.set! ;
34
+ # q1.
35
+ # group(:by => :postal_code).set!.
36
+ # generate([:group, :postal_code], ["COUNT(#{q1.relation})", :customers_per_zip]).set!.
37
+ # store!
38
+ #
39
+ # Q4ReportsCustomers35 = LOAD 'q4_reports/customers.tsv' AS (id: int,name: chararray,postal_code: int,balance: float) ;
40
+ # Q4ReportsCustomers36 = DISTINCT Q4ReportsCustomers35 ;
41
+ # Q4ReportsCustomers37 = GROUP Q4ReportsCustomers36 BY postal_code ;
42
+ # Q4ReportsCustomers38 = FOREACH Q4ReportsCustomers37 GENERATE
43
+ # group AS postal_code,
44
+ # COUNT(Q4ReportsCustomers36) AS customers_per_zip ;
45
+ #
46
+ # ---------------------------------------------------------------------------
47
+ #
48
+ # Note on pig:
49
+ #
50
+ # 1) Reverse the order of your tables in your join statement. Pig always
51
+ # streams the keys of the last input, (materializing in memory the keys of
52
+ # the first), so if one of your inputs has fewer instances of a given key
53
+ # this may help.
54
+ #
55
+ # 2) Reduce the number of maps and reducers per machine and give it all the
56
+ # memory you can.
57
+ #
58
+ #
59
+ module AndPig
60
+ end
61
+ end
62
+
@@ -0,0 +1,12 @@
1
+ Wukong::AndPig is a small library to more easily generate code for the
2
+ "Pig":http://hadoop.apache.org/pig data analysis language.
3
+
4
+ Wukong::AndPig lets you use the structs from your Wukong scripts to
5
+ generate Pig instructions that know their types and structure -- even through
6
+ multiple pig commands. For example, if you use +FOREACH ... GENERATE+ to select
7
+ only a few of those fields, Wukong::AndPig will know that the result has only
8
+ those fields.
9
+
10
+ We're still trying to figure out if this is a stupid and crazy idea, or just a
11
+ crazy idea: Yeah, we're using a functional/OO scripting language to generate code for an
12
+ imperative query language that generates Java code for ad-hoc map-reduce operations.
@@ -0,0 +1,37 @@
1
# One term of a pig expression list: an expression with an optional alias
# (+name+), +type+ and relation reference (+ref+), plus a hash of option flags.
class AS
  attr_accessor :expr, :name, :type, :ref, :options

  # expr         - the expression text, or another AS to copy from
  # name         - alias for the expression (optional)
  # type         - type of the expression (optional)
  # ref          - relation the expression is qualified with (optional)
  # option_flags - each flag given is set to true in +options+
  #                (+to_s+ reads :skip_name and :skip_type)
  #
  # When +expr+ is itself an AS, its fields are copied first and any name,
  # type or ref given here overrides the copied value.
  def initialize expr, name=nil, type=nil, ref=nil, *option_flags
    if expr.is_a?(AS)
      self.expr = expr.expr
      self.name = expr.name
      self.type = expr.type
      self.ref  = expr.ref
      # Copy the hash: flags set on this object must not leak back into the
      # AS it was built from.
      self.options = expr.options && expr.options.dup
    end
    self.expr    ||= expr
    self.name      = name if name
    self.type      = type if type
    self.ref       = ref  if ref
    self.options ||= { }
    option_flags.each{|option| self.options[option] = true }
  end

  # Render as "ref::expr", left-justified to 30 columns, followed by
  # "AS name" and ":type" when a name is set and the matching skip flag is
  # not. The type is rendered via +type.typify+, which is defined outside
  # this class.
  def to_s
    clause = "%-30s \t" % [ref, expr].compact.join('::')
    if name
      clause << "AS #{name}"        unless options[:skip_name]
      clause << ":#{type.typify}"   unless ((!type) || options[:skip_type])
    end
    clause
  end

  # Shorthand constructor: AS[expr, name, ...] is AS.new(expr, name, ...)
  def self.[] *args
    self.new(*args)
  end

  # Useful for feeding back into TypedStruct
  def name_type
    [name, type]
  end
end
@@ -0,0 +1,30 @@
1
+ # == SimpleDataTypes ==
2
+ # int
3
+ # long
4
+ # double
5
+ # arrays
6
+ # chararray
7
+ # bytearray
8
+ #
9
+ # == ComplexDataTypes ==
10
+ # tuple
11
+ # bag
12
+ # map
13
+
14
module Wukong
  module AndPig
    # Opens PigVar without adding anything to it.
    class PigVar; end
  end
end
21
+
22
+ # class ScalarInteger < TypedStruct.new [
23
+ # [:count, Integer ],
24
+ # ]
25
+ # include Wukong::AndPig::PigEmitter
26
+ # def self.load_scalar path
27
+ # var = super path
28
+ # var.to_i
29
+ # end
30
+ # end
@@ -0,0 +1,50 @@
1
+
2
+ # == Built-in Functions
3
+ # EvalFunctions
4
+ # AVG
5
+ # CONCAT
6
+ # COUNT
7
+ # DIFF
8
+ # MIN
9
+ # MAX
10
+ # SIZE
11
+ # SUM
12
+ # TOKENIZE
13
+
14
+ # == NullOperators
15
+ # isnull
16
+ # isnotnull
17
+ #
18
+ # == BooleanOperators
19
+ # and
20
+ # or
21
+ # not
22
+ #
23
+ # == DereferenceOperators
24
+ # tupledereference.
25
+ # mapdereference#
26
+ #
27
+ # == SignOperators
28
+ # positive+
29
+ # negative-
30
+ #
31
+ # == CastOperators
32
+ # (type)$0
33
+ # (type)alias
34
+ #
35
+ # == ArithmeticOperators
36
+ # addition+
37
+ # subtraction-
38
+ # multiplication*
39
+ # division/
40
+ # modulo%
41
+ # bincond?
42
+ #
43
+ # == ComparisonOperators
44
+ # Equal==
45
+ # notequal!=
46
+ # lessthan<
47
+ # greaterthan>
48
+ # lessthanorequalto<=
49
+ # greaterthanorequalto>=
50
+ # patternmatchingmatches
@@ -0,0 +1,85 @@
1
+ require 'wukong/and_pig/generate/variable_inflections'
2
+
3
+ module Wukong
4
+ module AndPig
5
+
6
+ mattr_accessor :comments
7
+ self.comments = true
8
+ # send output to stdout or to captured pig instance
9
+ mattr_accessor :emit_dest
10
+ # full pathname to the pig executable
11
+ PIG_EXECUTABLE = '/usr/local/bin/pig'
12
+
13
# Close the stream pig commands are being written to.
#
# PigVar.pig_in_poke is $stdout when output is not being captured; that
# stream belongs to the whole program, so it is left open.
def self.finish
  io = PigVar.pig_in_poke
  return if io.equal?($stdout) || io.equal?(STDOUT)
  io.close if io.respond_to?(:close)
end
16
+
17
+ #
18
+ # All the embarrassing magick to pretend ruby symbols are pig relations
19
+ #
20
class PigVar

  # Output a command.
  #
  # cmd       - text of the pig statement
  # semicolon - when true (the default), ' ;' is appended to the statement
  #
  # When Wukong::AndPig.emit_dest is :captured, the statement is written to
  # the captured pig process and one line read back from it is printed.
  # Otherwise the statement is simply printed.
  def self.emit cmd, semicolon=true
    cmd = cmd + ' ;' if semicolon
    case Wukong::AndPig.emit_dest
    when :captured
      pig_in_poke.puts(cmd)
      pig_in_poke.flush
      puts pig_in_poke.gets
    else
      puts(cmd)
    end
  end

  # Emit "relation = <rval.cmd>" and return rval so calls can be chained.
  def self.emit_setter relation, rval
    emit "%-23s\t= %s" % [relation, rval.cmd]
    rval
  end

  # Emit an imperative statement followed by its arguments, each rendered
  # with to_s. Returns the first argument.
  def self.emit_imperative imperative, *rest
    cmd_part = "%-14s \t" % imperative
    arg_part = rest.map{|s| "%14s" % s.to_s }.join(" \t")
    emit cmd_part+arg_part
    rest.first
  end

  # The stream commands are sent to, created on first use and then reused:
  # a read/write pipe to PIG_EXECUTABLE when Wukong::AndPig.emit_dest is
  # :captured, $stdout otherwise.
  def self.pig_in_poke
    return @pig_in_poke if @pig_in_poke
    case Wukong::AndPig.emit_dest
    when :captured
      @pig_in_poke = IO.popen(PIG_EXECUTABLE, "w+")
      @pig_in_poke.sync = true
      @pig_in_poke
    else @pig_in_poke = $stdout
    end
  end

  #
  # Reset the captured pig instance
  #
  # Forgets the current stream so the next use opens a fresh one. A stream
  # that was actually opened is closed; $stdout is never closed, and nothing
  # is opened just to be closed again. Errors from closing are ignored.
  #
  def self.reset_pig_in_poke!
    io, @pig_in_poke = @pig_in_poke, nil
    return if io.nil? || io.equal?($stdout) || io.equal?(STDOUT)
    begin
      io.close
    rescue IOError, SystemCallError
      nil # best effort: the stream is being discarded anyway
    end
  end

  # Emit the assignment of this object's command to its relation.
  # Returns self.
  def set!
    self.class.emit_setter(relation, self)
  end

  #
  # Emit a comment
  # skips if Wukong::AndPig.comments is false
  #
  # Each line of +comment+ is prefixed with "-- ", replacing a leading
  # '#' marker (with its following space or tab) when present.
  #
  def self.rem comment
    return unless Wukong::AndPig.comments
    PigVar.emit comment.gsub(/(^|\n)(#([\t ]|$))?/, "\n-- "), false
  end
end
81
+
82
+ end
83
+ end
84
+
85
+