mrflip-wukong 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,65 @@
1
+ # == Load/StoreFunctions ==
2
+ # BinaryDeserializer
3
+ # BinarySerializer
4
+ # BinStorage
5
+ # PigStorage
6
+ # PigDump
7
+ # TextLoader
8
+
9
module Wukong
  module AndPig
    class PigVar
      #
      # The "LOAD" pig expression:
      #   MyRelation = LOAD 'my_relation.tsv' AS (attr_a: int, attr_b: chararray) ;
      #
      # Registers (through +set+) a new PigVar named +rel+ whose command loads
      # +options[:filename]+, or +default_filename(rel)+ when none is given.
      # The AS type spec is produced by +klass.typify+. When
      # +options[:has_rsrc]+ is truthy, the freshly registered relation is also
      # asked to +generate+ all of its +fields+.
      #
      # Returns +rel+.
      #
      def self.pig_load(rel, klass, options = {})
        source_path = options[:filename] || default_filename(rel)
        as_clause   = klass.typify(options[:has_rsrc])
        set(rel, new(klass, rel, "LOAD '#{source_path}' AS #{as_clause}"))
        if options[:has_rsrc]
          loaded = self[rel]
          loaded.generate(loaded, *loaded.fields)
        end
        rel
      end

      #
      # The "STORE" pig imperative:
      #   STORE Relation INTO 'filename'
      #
      # If no filename is given, the relation's default filename is used.
      # Returns self.
      #
      def store(filename = nil)
        target = filename || default_filename
        self.class.emit(format("STORE %-19s INTO '%s'", relation, target))
        self
      end

      #
      # Store the relation, removing the existing file first and making sure
      # the destination's parent directory is created (via +rmf!+ and +mkdir+).
      #
      def store!(filename = nil)
        target = filename || default_filename
        rmf!(target)
        mkdir(File.dirname(target))
        store(target)
      end

      #
      # Force a store to disk, then load the result back under the same name
      # (so all later calculations proceed from the stored copy).
      #
      # +options+ is forwarded to +pig_load+; :filename defaults to
      # +default_filename+.
      #
      def checkpoint!(options = {})
        options = options.reverse_merge(:filename => default_filename)
        store!(options[:filename])
        self.class.pig_load(name, klass, options)
      end

      # Default storage path for this relation: working_dir joined with its name.
      def default_filename
        self.class.default_filename(name)
      end

      # Default storage path for a relation called +name+.
      def self.default_filename(name)
        File.join(working_dir, name.to_s)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # == DiagnosticOperators
2
+ # describe
3
+ # dump
4
+ # explain
5
+ # illustrate
6
+ # == UDFStatements
7
+ # define
8
+ # register
9
+
10
module Wukong
  module AndPig
    class PigVar
      # DESCRIBE pig imperative, for this relation. Returns self.
      def describe
        self.class.describe(self)
      end

      # DESCRIBE pig imperative for +rel+.
      #
      # First emits a pig comment holding the type spec predicted from
      # rel.klass, then the DESCRIBE statement itself. Returns +rel+.
      def self.describe(rel)
        emit " -- PREDICTED: #{rel.klass.typify} "
        simple_declaration(:describe, rel.relationize)
        rel
      end

      # DUMP pig imperative
      def dump
        simple_operation(:dump)
      end

      # EXPLAIN pig imperative
      def explain
        simple_operation(:explain)
      end

      # ILLUSTRATE pig imperative
      def illustrate
        simple_operation(:illustrate)
      end

      # DEFINE pig statement: hands the alias and the collected extra
      # arguments (as one array) to emit_imperative.
      def self.define(pig_alias, *args)
        emit_imperative(:DEFINE, pig_alias, args)
      end

      # REGISTER pig statement for the jar at +path_to_jar+.
      def self.register(path_to_jar)
        emit_imperative(:REGISTER, path_to_jar)
      end
    end
  end
end
@@ -0,0 +1,129 @@
1
+ # -*- coding: utf-8 -*-
2
+ # == RelationalOperators
3
+ #
4
+ # GROUP, COGROUP, JOIN see groupies.rb
5
+ # CROSS see below (defined in this file)
6
+
7
+ # distinct
8
+ # filter
9
+ # limit
10
+ # order
11
+ # split
12
+ # union
13
+
14
+ #
15
+ # stream
16
+ # load
17
+ # store
18
+ #
19
module Wukong
  module AndPig
    class PigVar

      # ===========================================================================
      #
      # Options
      #

      # Appends " PARALLEL n" to +str+ (in place) when options[:parallel] is set.
      # Returns the mutated string, or nil when no :parallel option was given.
      def self.parallelize!(str, options)
        str << " PARALLEL #{options[:parallel]}" if options[:parallel]
      end

      # ===========================================================================
      #
      # DISTINCT
      #

      # lval = DISTINCT this_relation [PARALLEL n]
      def distinct(lval, options = {})
        self.class.distinct(lval, self, options)
      end

      # lval = DISTINCT rel [PARALLEL n]
      def self.distinct(lval, rel, options = {})
        # dup: parallelize! appends in place, and the string handed back by
        # relationize is not ours to modify.
        cmd_str = rel.relationize.dup
        parallelize!(cmd_str, options)
        simple_operation(lval, rel, :distinct, cmd_str)
      end

      # ===========================================================================
      #
      # FILTER
      #

      # FILTER this relation BY +by_str+, into a new anonymous relation.
      #
      # new_in_chain takes (lval, klass, cmd); the name for the new relation
      # comes from +anon+.
      def filter(by_str)
        new_in_chain(anon, klass, "FILTER #{relation} BY #{by_str}")
      end

      # lval = FILTER rel BY by_str
      def self.filter(lval, rel, by_str)
        simple_operation(lval, rel, "FILTER", "#{rel.relation} BY #{by_str}")
      end

      # ===========================================================================
      #
      # LIMIT
      #

      # LIMIT this relation to +n+ rows, into a new anonymous relation.
      def limit(n)
        new_in_chain(anon, klass, "LIMIT #{relation} #{n}")
      end

      # ===========================================================================
      #
      # ORDER
      #
      #   alias = ORDER alias BY { * [ASC|DESC] |
      #             field_alias [ASC|DESC] [, field_alias [ASC|DESC] …]
      #           } [PARALLEL n];
      #
      # +cmd_str+ is the text following BY; options[:parallel] adds PARALLEL n.
      #
      def order(cmd_str, options = {})
        order_cmd = "ORDER #{relation} BY #{cmd_str}"
        # parallelize! is a class-level helper. The PARALLEL clause is added
        # before the relation is created so the registered command is complete.
        self.class.parallelize!(order_cmd, options)
        new_in_chain(anon, klass, order_cmd)
      end

      # ===========================================================================
      #
      # SPLIT
      #
      #   SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression …];
      #
      # +relation_tests+ maps each output relation name to its IF expression.
      #
      def split(relation_tests = {})
        clauses = relation_tests.map { |out_rel, test| "#{out_rel} IF #{test}" }
        new_in_chain(anon, klass, "SPLIT #{relation} INTO #{clauses.join(', ')}")
      end

      # ===========================================================================
      #
      # CROSS
      #
      # Crosses this relation with one or more others. A trailing options hash
      # may carry :parallel. The result takes the klass of the first relation
      # passed in. Raises CrossArgumentError when no other relation is given.
      #
      def cross(*relations)
        options = relations.extract_options!
        raise CrossArgumentError unless relations.length >= 1
        relations_str = [self, *relations].map(&:relation).join(", ")
        cross_cmd = "CROSS #{relations_str}"
        self.class.parallelize!(cross_cmd, options)
        new_in_chain(anon, relations.first.klass, cross_cmd)
      end

      # ===========================================================================
      #
      # UNION
      #

      # lval = UNION this_relation, *relations
      def union(lval, *relations)
        # Splat, so the class method receives each relation as its own
        # argument rather than one nested array (which always failed its
        # "at least two relations" check).
        self.class.union(lval, self, *relations)
      end

      # lval = UNION relations...
      # Raises UnionArgumentError unless at least two relations are given.
      def self.union(lval, *relations)
        raise UnionArgumentError unless relations.length >= 2
        relations_str = relations.map(&:relation).join(", ")
        simple_operation(lval, relations.first, :union, relations_str)
      end

    end

    # Pre-built ArgumentError instances (not classes) raised by cross / union.
    CrossArgumentError = ArgumentError.new("CROSS requires at least two relations. Heh heh: relations.")
    UnionArgumentError = ArgumentError.new("UNION requires at least two relations. Heh heh: relations.")
  end
end
@@ -0,0 +1,48 @@
1
module Wukong
  module PigStructMethods
    module ClassMethods
      #
      # Pig type string: a parenthesized, comma-separated list of
      # "name: type" entries, one per member, where the type comes from calling
      # +typify+ on the matching element of +mtypes+.
      #
      # When +has_rsrc+ is truthy, a leading "rsrc: chararray" entry is added.
      #
      def typify(has_rsrc = nil)
        field_specs = []
        field_specs << "rsrc: chararray" if has_rsrc
        members.zip(mtypes).each do |attr, mtype|
          field_specs << "#{attr}: #{mtype.typify}"
        end
        "(#{field_specs.join(', ')})"
      end

      #
      # Sugar: LOAD relation +rel+ using this class as the relation's klass.
      # Remaining arguments are passed through to PigVar.pig_load.
      #
      def pig_load(rel, *args)
        Wukong::AndPig::PigVar.pig_load(rel, self, *args)
      end

      #
      # Returns type for a fieldspec.
      #
      # Only Symbol fieldspecs are handled (looked up in +members_types+);
      # anything else yields nil. Nested (Array) fieldspecs were sketched in
      # the original source but left disabled:
      #
      #   when Array
      #     if field.length > 1 then members_types[field.first].field_type(field[1..-1])
      #     else field_type field.first
      #     end
      #
      def field_type(field)
        members_types[field] if field.is_a?(Symbol)
      end
    end

    # Including this module also extends the includer with ClassMethods.
    def self.included(base)
      base.extend(ClassMethods)
    end
  end
end
42
+
43
# Give every Struct the pig helpers, plus a class-level +mtypes+ that simply
# returns the struct's member list.
Struct.class_eval do
  include Wukong::PigStructMethods

  class << self
    def mtypes
      members
    end
  end
end
@@ -0,0 +1,95 @@
1
module Wukong
  module AndPig

    #
    # Make a PigVar understand the struct it describes.
    #
    # A PigVar carries the struct class describing its fields (klass), the
    # relation's name, and the pig command (cmd) that produces it.
    #
    class PigVar
      attr_accessor :klass, :name, :cmd
      cattr_accessor :working_dir
      self.working_dir = '.'

      def initialize(klass, name, cmd)
        self.klass = klass
        self.name  = name
        self.cmd   = cmd
      end

      # Sugar for PigVar.set
      def self.[]=(name, *args)
        set(name, *args)
      end

      # Look up the PigVar registered under +name+ in PIG_SYMBOLS
      def self.[](name)
        PIG_SYMBOLS[name]
      end

      # Extract a field from an alias.
      # The resulting AS expression is named "<relation name>_<field>".
      def _(field)
        as_name = [name, field].join("_").to_sym
        AS["#{relationize}.(#{field})", as_name, Bag.new([field, field_type(field)]), nil, :skip_type]
      end

      # Register +rval+ under +name+ (renaming it to match), then emit its
      # setter statement.
      def self.set(name, rval)
        PIG_SYMBOLS[name] = rval
        rval.name = name
        emit_setter(rval.relation, rval)
      end

      # The relation's name as it is written in pig statements.
      def relation
        name.relationize
      end
      alias_method :relationize, :relation

      #
      # Create a name for a new anonymous relation: :anon_<slug>_<idx>_ ,
      # where idx comes from bumping the module-wide counter.
      #
      def self.anon(slug)
        idx = (Wukong::AndPig.anon_var_idx += 1)
        "anon_#{slug}_#{idx}_".to_sym
      end

      # Create a name building off this one: any existing "anon_" prefix and
      # "_<digits>_" suffix are stripped before a fresh name is generated.
      def anon
        slug = name.to_s.gsub(/^anon_/,'').gsub(/_\d+_$/,'')
        self.class.anon(slug)
      end

      # Build a new relation named +lval+ with class +l_klass+ and command
      # +l_cmd+, and register it.
      def new_in_chain(lval, l_klass, l_cmd)
        rval = self.class.new(l_klass, lval, l_cmd)
        self.class.set(lval, rval)
      end

      # Delegate to klass
      def field_type(*args)
        klass.field_type(*args)
      end

      # Fields in this relation, as symbols
      def fields
        klass.members.map { |member| member.to_sym }
      end

      #
      # Side-effect free operation: emits "<OP> <relation>" and returns self.
      #
      def simple_operation(op)
        self.class.emit("#{op.to_s.upcase} #{relation}")
        self
      end

      # Create and register relation +lval+ (with rel's klass) whose command
      # is "<OP> <r_str>", the operator left-justified to 8 columns.
      def self.simple_operation(lval, rel, op, r_str)
        cmd = format("%-8s %s", op.to_s.upcase, r_str)
        rval = new(rel.klass, lval, cmd)
        set(lval, rval)
      end

      # Emit the bare statement "<OP> <r_str>" without creating a relation.
      def self.simple_declaration(op, r_str)
        cmd = format("%-8s %s", op.to_s.upcase, r_str)
        emit(cmd)
      end

    end
  end
end
94
+
95
+
@@ -0,0 +1,29 @@
1
module Wukong
  module AndPig
    # Registry of relations, keyed by relation name.
    PIG_SYMBOLS = { }

    # Module-wide counter, starting at zero.
    mattr_accessor :anon_var_idx
    self.anon_var_idx = 0
  end
end
8
+
9
+
10
Symbol.class_eval do
  #
  # Pig-style assignment sugar:  :new_rel << relation
  #
  # +relation+ may be a PigVar, or a Symbol registered in PIG_SYMBOLS. Either
  # way the work is done by PigVar.new_relation(self, pig_var).
  #
  # Raises RuntimeError for anything else.
  #
  def <<(relation)
    if relation.is_a?(Wukong::AndPig::PigVar)
      Wukong::AndPig::PigVar.new_relation(self, relation)
    elsif relation.is_a?(Symbol) && (pig_var = Wukong::AndPig::PIG_SYMBOLS[relation])
      Wukong::AndPig::PigVar.new_relation(self, pig_var)
    else
      raise "Don't know how to pigify RHS #{relation.inspect}"
    end
  end

  #
  # A symbol registered in PIG_SYMBOLS forwards unknown messages to its
  # PigVar. The caller's block is forwarded too, so block-taking PigVar
  # methods work through the symbol.
  #
  def method_missing(method, *args, &block)
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    if pig_var && pig_var.respond_to?(method)
      pig_var.send(method, *args, &block)
    else
      super
    end
  end

  # Keep respond_to? truthful about what method_missing will answer.
  def respond_to_missing?(method, include_private = false)
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    (pig_var && pig_var.respond_to?(method)) || super
  end
end
File without changes
@@ -0,0 +1,18 @@
1
+ #
2
+ # Easily serialize bad records in-band, for later analysis or to discard if
3
+ # negligible.
4
+ #
5
+ # You can instantiate this as
6
+ # success = do_stuff_to record
7
+ # if ! success
8
+ # return BadRecord.new("do_stuff_to-failed", record)
9
+ # end
10
+ #
11
#
# A two-member struct: +errors+ (what went wrong) and +record+ (the offending
# record's fields).
#
class BadRecord < Struct.new(:errors, :record)
  # errors::        description of the failure (defaults to '')
  # record_fields:: every remaining argument; collected into a single array
  #                 and stored as +record+
  def initialize(errors = '', *record_fields)
    super(errors, record_fields)
  end
end
@@ -0,0 +1,47 @@
1
module Wukong

  # ---------------------------------------------------------------------------
  #
  # Default options for Wukong
  #   http://github.com/infochimps/wukong
  #
  # If you set an environment variable WUKONG_CONFIG, *or* if the file
  # $HOME/.wukong.rb exists, that file will be +require+'d as well.
  #
  # Important values to set:
  #
  # * Wukong::CONFIG[:hadoop_home] --
  #   Path to root of hadoop install. If your hadoop runner is
  #     /usr/local/share/hadoop/bin/hadoop
  #   then your hadoop_home is
  #     /usr/local/share/hadoop.
  #   When unset, it falls back to ENV['HADOOP_HOME'], then '/usr/lib/hadoop'.
  #
  # * Wukong::CONFIG[:default_run_mode] -- Whether to run using hadoop (and
  #   thus, requiring a working hadoop install), or to run in local mode
  #   (script --map | sort | script --reduce)
  #
  CONFIG = {
    # Run as local or as hadoop?
    :default_run_mode => 'hadoop',

    # The command to run when a nil mapper or reducer is given.
    :default_mapper   => '/bin/cat',
    :default_reducer  => '/bin/cat',

    # Anything in HADOOP_OPTIONS_MAP (see lib/wukong/script/hadoop_command.rb)
    :runner_defaults  => {
    },
  }

  #
  # Loads the site-specific config file (if one exists) and fills in
  # CONFIG[:hadoop_home] when nothing has set it.
  #
  # Returns the resulting hadoop_home.
  #
  def self.config_options
    # override with site-specific options
    site_config_filename = ENV['WUKONG_CONFIG'] || (ENV['HOME'].to_s+'/.wukong.rb')
    # File.exist? -- the File.exists? alias was removed in Ruby 3.2.
    if File.exist?(site_config_filename)
      # Expand first, so a relative WUKONG_CONFIG resolves against the same
      # directory the existence check used instead of $LOAD_PATH.
      require File.expand_path(site_config_filename).gsub(/\.rb$/,'')
    end

    # try to guess a hadoop_home if none given
    Wukong::CONFIG[:hadoop_home] ||= ENV['HADOOP_HOME'] || '/usr/lib/hadoop'
  end
  self.config_options
end
47
+