wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,47 @@
1
+ require 'wukong/extensions/class'
2
module Wukong

  #
  # Mixin that lets a plain class declare a list of named members and then be
  # read and written with hash-style +[]+ / +[]=+ access.
  #
  # Including the module extends the class with ClassMethods, mixes in
  # HashLike, and declares an inheritable +members+ class accessor (see
  # +self.included+ below).
  #
  module HashlikeClass
    module ClassMethods
      #
      # Declare one or more members (stored as strings) and create an
      # attr_accessor for each of them.
      #
      # Newly declared names are placed *before* the names already present in
      # +members+. Accessors are generated only for the names passed to this
      # call, so a custom reader/writer defined for an earlier member is not
      # overwritten by a later +has_members+ call.
      #
      # Returns the updated member list.
      #
      def has_members *members
        self.members ||= []
        added = members.map(&:to_s)
        self.members = added + self.members
        added.each do |member|
          attr_accessor member.to_sym
        end
        self.members
      end
      alias_method :has_member, :has_members

      # Hash-like alias: the keys of the class are its member names.
      def keys
        members
      end
    end

    # Read the attribute named +key+ by calling the method of that name.
    def [](key)
      self.send(key)
    end

    # Assign +val+ to the attribute named +key+ through its writer method.
    def []=(key, val)
      self.send("#{key}=", val)
    end

    # Member values, in +members+ order (+values_of+ and +members+ are
    # expected to come from the HashLike mixin / the including class).
    def to_a
      values_of(*members)
    end

    # Flattened array built from each member value's own +to_flat+.
    def to_flat
      to_a.map(&:to_flat).flatten
    end

    #
    # Hook run when the module is included: wires up the class-level API and
    # adds a +to_hash+ that tags the HashLike hash with the class name under
    # the 'type' key.
    #
    def self.included base
      base.class_eval do
        extend ClassMethods
        include HashLike
        class_inheritable_accessor :members

        def to_hash *args
          super(*args).merge 'type' => self.class.to_s
        end
      end
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'extlib/object'
2
+ require 'extlib/module'
@@ -0,0 +1,27 @@
1
+ require 'pathname'
2
class Pathname
  # Append a path segment and expand the result to an absolute path
  #
  #   file = Pathname(Dir.pwd) / "subdir1" / :subdir2 / "filename.ext"
  #
  # @param [Pathname, String, #to_s] path path segment to concatenate with receiver
  #
  # @return [Pathname]
  #   receiver with _path_ appended and expanded to an absolute path
  #
  # @api public
  def /(path)
    # Pathname#+ only accepts strings and pathnames; coerce so that symbols
    # (as in the example above) and other #to_s objects work too.
    (self + path.to_s).expand_path
  end

  # Build a Pathname by joining the given segments with File.join
  #
  #   Pathname['usr', 'local', 'bin']  # => #<Pathname:usr/local/bin>
  #
  # @return [Pathname] the joined (not expanded) path
  def self.[](*vals)
    new( File.join(vals) )
  end
end
21
+
22
#
# Pathname built relative to the directory that contains a given file.
#
#   Subdir['/a/b/c.rb', 'x', 'y']   # => #<Subdir:/a/b/x/y>
#
class Subdir < Pathname
  # The first argument names a file; the remaining arguments are joined onto
  # that file's directory.
  def self.[](*vals)
    anchor, *segments = vals
    new(File.join(File.dirname(anchor), *segments))
  end
end
@@ -0,0 +1,65 @@
1
+ #
2
+ # String Monkeypatched for processing with wukong: see wukong/extensions/string
3
+ #
4
String.class_eval do
  # By default, +camelize+ converts strings to UpperCamelCase. If the argument to +camelize+
  # is <tt>:lower</tt> (or +false+) then +camelize+ produces lowerCamelCase: the
  # first character is left exactly as written and the rest is camelized.
  #
  # +camelize+ will also convert '/' to '::' which is useful for converting paths to namespaces.
  #
  # Examples:
  #   "active_record".camelize                # => "ActiveRecord"
  #   "active_record".camelize(:lower)        # => "activeRecord"
  #   "active_record/errors".camelize         # => "ActiveRecord::Errors"
  #   "active_record/errors".camelize(:lower) # => "activeRecord::Errors"
  def camelize(first_letter_in_uppercase = true)
    upper_camel = self.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
    # :lower is truthy, so it has to be checked for explicitly.
    return upper_camel if first_letter_in_uppercase && first_letter_in_uppercase != :lower
    # Keep the receiver's own first character; the to_s calls make the empty
    # string come back as "" instead of raising on nil.
    self[0, 1].to_s + upper_camel[1..-1].to_s
  end

  #
  # The reverse of +camelize+. Makes an underscored, lowercase form from the expression in the string.
  #
  # Changes '::' to '/' to convert namespaces to paths.
  #
  # Examples:
  #   "ActiveRecord".underscore         # => "active_record"
  #   "ActiveRecord::Errors".underscore # => active_record/errors
  #
  # Stolen from active_support
  #
  def underscore
    gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
      gsub(/([a-z\d])([A-Z])/,'\1_\2').
      tr("-", "_").
      downcase
  end

  # Tries to find a constant with the name specified in the argument string:
  #
  #   "Module".constantize     # => Module
  #   "Test::Unit".constantize # => Test::Unit
  #
  # The name is assumed to be the one of a top-level constant, no matter whether
  # it starts with "::" or not. No lexical context is taken into account:
  #
  #   C = 'outside'
  #   module M
  #     C = 'inside'
  #     C               # => 'inside'
  #     "C".constantize # => 'outside', same as ::C
  #   end
  #
  # NameError is raised when the name is not in CamelCase or the constant is
  # unknown.
  def constantize
    # The anchored pattern is what makes the eval below safe: only a chain of
    # CamelCase identifiers separated by '::' ever reaches it.
    unless /\A(?:::)?([A-Z]\w*(?:::[A-Z]\w*)*)\z/ =~ self
      raise NameError, "#{self.inspect} is not a valid constant name!"
    end
    Object.module_eval("::#{$1}", __FILE__, __LINE__)
  end
end
@@ -0,0 +1,17 @@
1
+ require 'wukong/extensions/hash'
2
+ require 'wukong/extensions/hash_like'
3
+ require 'wukong/extensions/symbol'
4
+
5
+ #
6
+ # extensions/struct
7
+ #
8
+ # Add several methods to make a struct duck-type much more like a Hash
9
+ #
10
#
# Reopen Struct so that every struct mixes in Wukong::HashLike, and expose the
# member list under the hash-style name +keys+ at the class level.
#
class Struct
  include Wukong::HashLike

  class << self
    # Hash-like alias for +members+.
    def keys
      members
    end
  end
end
16
+
17
+
@@ -0,0 +1,11 @@
1
+ #
2
+ # h2. extensions/symbol.rb -- extensions to symbol class
3
+ #
4
class Symbol
  #
  # Turn the symbol into a simple proc (stolen from
  # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
  #
  # Only defined when the running Ruby does not already provide
  # Symbol#to_proc, so the interpreter's native implementation is never
  # replaced by this one.
  #
  unless method_defined?(:to_proc)
    def to_proc
      Proc.new { |*args| args.shift.__send__(self, *args) }
    end
  end
end
@@ -0,0 +1,53 @@
1
module Wukong
  # Common logger
  #
  # Set your own at any time with
  #   Wukong.logger = YourAwesomeLogger.new(...)
  # If you have log4r installed you can use
  #   Wukong.logger = Wukong.default_log4r_logger
  #
  # If Wukong.logger is too much typing for you,
  # use the Log constant
  #
  # Default format:
  #   I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
  #
  # The logger is created lazily on first use and memoized.
  #
  def self.logger
    @logger ||= default_ruby_logger
  end

  #
  # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
  # friendly) output lines on stderr.
  #
  # log4r is required lazily so it is only needed if this method is called.
  #
  def self.default_log4r_logger logger_handle='wukong'
    require 'log4r'
    require 'time' # Time#iso8601 is defined by the 'time' standard library
    lgr       = Log4r::Logger.new logger_handle
    outputter = Log4r::Outputter.stderr
    # Define the timestamp formatter method once; getutc returns a converted
    # copy instead of switching the receiver itself to UTC.
    unless ::Time.method_defined?(:utc_iso8601)
      ::Time.class_eval do def utc_iso8601() getutc.iso8601 ; end ; end
    end
    # 2009-07-25T00:12:05Z INFO PID\t
    outputter.formatter = Log4r::PatternFormatter.new(
      :pattern     => "%d %.4l #{Process.pid}\t%.2000m",
      :date_method => :utc_iso8601
      )
    lgr.outputters = outputter
    lgr
  end

  # Standard-library Logger writing to STDERR.
  def self.default_ruby_logger
    require 'logger'
    Logger.new STDERR
  end

  # Replace the shared logger.
  def self.logger= logger
    @logger = logger
  end
end

#
# A convenient logger.
#
# Define NO_WUKONG_LOG (or define Log yourself) to prevent its creation
#
Log = Wukong.logger unless (defined?(Log) || defined?(NO_WUKONG_LOG))
@@ -0,0 +1,27 @@
1
+
2
module Wukong
  module Models
    # A pair of Integer ids, +src+ and +dest+.
    class Edge < TypedStruct.new([:src, Integer], [:dest, Integer])
    end

    # An edge between +src+ and +dest+ carrying one Integer per relationship
    # kind and direction (a_*_b and b_*_a).
    # NOTE(review): the field names suggest twitter-style interactions
    # (follows, replies, atsigns, retweets, favorites) -- confirm with callers.
    class MultiEdge < TypedStruct.new(
        [:src,           Integer], [:dest,          Integer],
        [:a_follows_b,   Integer], [:b_follows_a,   Integer],
        [:a_replies_b,   Integer], [:b_replies_a,   Integer],
        [:a_atsigns_b,   Integer], [:b_atsigns_a,   Integer],
        [:a_retweets_b,  Integer], [:b_retweets_a,  Integer],
        [:a_favorites_b, Integer], [:b_favorites_a, Integer]
      )
    end

  end
end
data/lib/wukong/rdf.rb ADDED
@@ -0,0 +1,104 @@
1
module Wukong
  #
  # Dump wukong object as RDF triples:
  #
  #   <key>  <attr>  <val>  # <extra>
  #
  # The three elements are tab-separated and followed by a '.'; any extra
  # field -- reification info, for instance -- is appended as a comment.
  # The intent is that each element contains no tab, newline or carriage
  # return, so the result is both an RDF triple file and palatable to Wukong
  # for further processing.
  # NOTE(review): nothing in this module escapes or XML-encodes the values;
  # callers have to supply values that are already safe.
  #
  # The including class is expected to provide +members_with_types+,
  # +rdf_resource+, +scraped_at+ and +[]+.
  #
  module Rdf

    #
    # RDF-formatted date (relies on the DateTime.parse_safely extension)
    #
    def self.encode_datetime dt
      DateTime.parse_safely(dt).xmlschema
    end

    #
    # Emit a component (subject or object) with the right semantic encoding
    #
    # Use :boolskip if a false property should just be left out: it returns
    # nil for a false-ish value (nil, false, 0 or "0").
    #
    # Raises RuntimeError for an unknown +type+.
    #
    def rdf_component val, type
      case type
      when :tweet          then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
      when :user           then %Q{<http://twitter.com/users/show/#{val}.xml>}
      when :bool           then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
      when :boolskip       then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
      when :int            then %Q{"#{val.to_i}"^^<xsd:integer>}
      # encode_datetime is a module function of Wukong::Rdf
      when :date           then %Q{"#{Wukong::Rdf.encode_datetime(val)}"^^<xsd:dateTime>}
      when :str            then %Q{"#{val}"}
      else raise "Don't know how to encode #{type}"
      end
    end

    #
    # Express relationship (predicate) in RDF
    #
    # Every predicate currently maps onto the same URI pattern.
    #
    def rdf_pred pred
      %Q{<http://twitter.com/##{pred}>}
    end

    #
    # RDF Triple string for the given (subject, predicate, object)
    # http://www.w3.org/TR/rdf-testcases/#ntriples
    #
    # A non-blank +comment+ is appended after the closing '.' as "\t# comment".
    #
    def self.rdf_triple subj, pred, obj, comment=nil
      comment = "\t# " + comment.to_s unless comment.blank?
      %Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
    end

    # Whether +attr+ can change between scrapes; always false here, so
    # including classes override it where needed.
    def mutable?(attr)
      false
    end

    #
    # Extract [subject, predicate, object, (extra)] tuples.
    #
    # (extra) is set to +scraped at+ for #mutable? attributes, nil otherwise.
    # Blank attributes, and components that encode to nil, are skipped.
    #
    def to_rdf3_tuples
      members_with_types.map do |attr, type|
        next if self[attr].blank?
        subj    = rdf_resource
        pred    = rdf_pred(attr)
        obj     = rdf_component(self[attr], type) or next
        comment = scraped_at if mutable?(attr)
        [subj, pred, obj, comment]
      end.compact
    end

    #
    # Convert an object to rdf triples, one per line.
    #
    # Appends scraped at to #mutable? attributes
    #
    def to_rdf3
      to_rdf3_tuples.map do |tuple|
        # rdf_triple is a module function taking the tuple's elements as
        # separate arguments.
        Wukong::Rdf.rdf_triple(*tuple)
      end.join("\n")
    end

    # Placeholder: intentionally empty, returns nil.
    def to_rdf
    end

  end
end
@@ -0,0 +1,37 @@
1
module Wukong
  #
  # Export model's structure for other data frameworks:
  # SQL and Pig
  #
  # Meant to be used at the class level: the methods read +members+ and
  # +mtypes+ from the receiver and call +to_pig+ on each member type.
  #
  module Schema
    # Placeholder: intentionally empty, returns nil.
    def to_sql
    end


    # Export schema as Pig: "member: type" pairs joined with ', '
    def to_pig
      members.zip(mtypes).map do |member, type|
        member.to_s + ': ' + type.to_pig
      end.join(', ')
    end

    # Receiver's name with any leading namespace ("Foo::Bar::") removed
    def pig_klass
      self.to_s.gsub(/.*::/, '')
    end

    #
    # Pig LOAD statement for this class. When no +filename+ is given the
    # underscored, pluralized class name is used (String#underscore and
    # String#pluralize must be available for that default).
    #
    def pig_load filename=nil
      [
        "%-23s" % pig_klass,
        "= LOAD", filename || pig_klass.underscore.pluralize,
        "AS ( rsrc:chararray,", self.to_pig, ')',
      ].join(" ")
    end
  end
end
31
+
32
require 'date' # Date is referenced below and is not loaded by default

# Pig type name for each Ruby class used as a member type.
class << Integer ; def to_pig() 'int'       end ; end
# Bignum only exists as a class of its own on older Rubies. Where it is merely
# an alias of Integer (or gone altogether) it must be skipped, otherwise the
# Integer mapping above would be overwritten with 'long'.
if Object.const_defined?(:Bignum) && !Object.const_get(:Bignum).equal?(Integer)
  class << Object.const_get(:Bignum) ; def to_pig() 'long' end ; end
end
class << Float   ; def to_pig() 'float'     end ; end
class << String  ; def to_pig() 'chararray' end ; end
# NOTE(review): Symbol.to_pig returns the Symbol class itself rather than a
# pig type string -- confirm this is intended.
class << Symbol  ; def to_pig() self        end ; end
class << Date    ; def to_pig() 'long'      end ; end
@@ -0,0 +1,265 @@
1
+ require 'pathname'
2
+ require 'wukong/script/hadoop_command'
3
+ require 'wukong/script/local_command'
4
+ require 'rbconfig'
5
+ module Wukong
6
+
7
+ # == How to run a Wukong script
8
+ #
9
+ # your/script.rb --run path/to/input_files path/to/output_dir
10
+ #
11
+ # All of the file paths are HDFS paths ; your script path, of course, is on the local filesystem.
12
+ #
13
+ # == Command-line options
14
+ #
15
+ # If you'd like to listen for any command-line options, specify them at the
16
+ # command line:
17
+ #
18
+ # your/script.rb --my_bool_opt --my_val_taking_opt=val \
19
+ # --run path/to/input_files path/to/output_dir
20
+ #
21
+ # In this case the options hash for both Mapper and Reducer will contain
22
+ #
23
+ # :my_bool_opt => true,
24
+ # :my_val_taking_opt => 'val'
25
+ #
26
+ # == Complicated input paths
27
+ #
28
+ # To use more than one file as input, you can use normal * ? [] wildcards or
29
+ # give a comma-separated list -- see the hadoop documentation for syntax.
30
+ #
31
+ # == Run locally (--run=local)
32
+ #
33
+ # To run your script locally, use --run=local
34
+ #
35
+ # your/script.rb --run=local path/to/input_files path/to/output_dir
36
+ #
37
+ # This will pipe the contents of path/to/input_files through first your
38
+ # mapper, then sort, then the reducer, storing the results in the given output
39
+ # directory.
40
+ #
41
+ # All paths refer to the /local/ filesystem -- hadoop is never involved and in
42
+ # fact doesn't even have to be installed.
43
+ #
44
+ # == How to test your scripts
45
+ #
46
+ # You can supply the --map argument in place of --run to run the mapper on its
47
+ # own (and similarly, --reduce to run the reducer standalone):
48
+ #
49
+ # cat ./local/test/input.tsv | ./examples/word_count.rb --map | more
50
+ #
51
+ # or, if your test data lies on the HDFS,
52
+ #
53
+ # hdp-cat test/input.tsv | ./examples/word_count.rb --map | more
54
+ #
55
+ #
56
class Script
  include Wukong::HadoopCommand
  include Wukong::LocalCommand
  attr_accessor :mapper_klass, :reducer_klass, :options

  #
  # Instantiate the Script with the Mapper and the Reducer class (each a
  # Wukong::Streamer) it should call back.
  #
  # The options hash is built from default_options, then +extra_options+,
  # then the command line (see process_argv!).
  #
  # == Identity or External program as map or reduce
  #
  # To use the identity reducer ('cat'), instantiate your Script class with
  # +nil+ as the reducer class.  (And similarly to use an identity mapper,
  # supply +nil+ for the mapper class.)
  #
  # To use an external program as your reducer (mapper), subclass the
  # reduce_command (map_command) method to return the full command line
  # expression to call.
  #
  #   class MyMapper < Wukong::Streamer::Base
  #     # ... awesome stuff ...
  #   end
  #
  #   class MyScript < Wukong::Script
  #     # prefix each unique line with the count of its occurrences.
  #     def reduce_command
  #       '/usr/bin/uniq -c'
  #     end
  #   end
  #   MyScript.new(MyMapper, nil).run
  #
  def initialize mapper_klass, reducer_klass, extra_options={}
    self.options = default_options.merge(extra_options)
    process_argv!
    self.mapper_klass  = mapper_klass
    self.reducer_klass = reducer_klass
    # If no reducer_klass and no reduce_command, then skip the reduce phase
    options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
  end

  #
  # Gives default options.  Command line parameters take precedence
  #
  # MAKE SURE YOU CALL SUPER: write your script according to the pattern
  #
  #   super.merge :my_option => :val
  #
  def default_options
    Wukong::CONFIG[:runner_defaults] || {}
  end

  # Options that don't need to go in the :all_args hash
  def std_options
    @std_options ||= [:run, :map, :reduce, ] + HADOOP_OPTIONS_MAP.keys
  end

  #
  # Parse the command-line args into the options hash.
  #
  # I should not reinvent the wheel.
  # Yet: here we are.
  #
  # '--foo=foo_val' produces :foo => 'foo_val' in the options hash.
  # '--foo'         produces :foo => true
  # '--'            After seeing a non-'--' flag, or a '--' on its own, no further flags are parsed
  #
  # options[:all_args] is a space-joined string of every flag that is not in std_options
  # options[:rest]     contains all arguments following the first non-flag (or the '--')
  #
  def process_argv!
    options[:all_args] = []
    args = ARGV.dup
    # Stop once the arguments run out: an (empty) array is always truthy, so
    # looping on the array itself would shift a nil off the end and leave it
    # in options[:rest].
    until args.empty?
      arg = args.shift
      case
      when arg == '--'
        break
      when arg =~ /\A--(\w+)(?:=(.+))?\z/
        opt, val = [$1, $2]
        opt = opt.to_sym
        val ||= true
        self.options[opt] = val
        options[:all_args] << arg unless std_options.include?(opt)
      else
        # first non-flag: put it back and treat everything from here on as :rest
        args.unshift(arg) ; break
      end
    end
    options[:all_args] = options[:all_args].join(" ")
    options[:rest] = args
  end

  # Absolute, symlink-resolved path of the running script ($0)
  def this_script_filename
    Pathname.new($0).realpath
  end

  #
  # Absolute path of the ruby interpreter running this script, assembled from
  # the interpreter's build configuration.
  #
  def ruby_interpreter_path
    # RbConfig replaces the old top-level Config constant, which no longer
    # exists on current rubies.
    Pathname.new(
      File.join(RbConfig::CONFIG["bindir"],
        RbConfig::CONFIG["RUBY_INSTALL_NAME"]+
        RbConfig::CONFIG["EXEEXT"])
      ).realpath
  end

  #
  # Shell command for map phase
  # by default, call this script in --map mode
  #
  # Without a mapper_klass, falls back to options[:map_command] and then the
  # configured default mapper.
  #
  def map_command
    case
    when mapper_klass
      "#{ruby_interpreter_path} #{this_script_filename} --map " + options[:all_args]
    else options[:map_command] || Wukong::CONFIG[:default_mapper] end
  end

  #
  # Shell command for reduce phase
  # by default, call this script in --reduce mode
  #
  # Without a reducer_klass, falls back to options[:reduce_command] and then
  # the configured default reducer.
  #
  def reduce_command
    case
    when reducer_klass
      "#{ruby_interpreter_path} #{this_script_filename} --reduce " + options[:all_args]
    else options[:reduce_command] || Wukong::CONFIG[:default_reducer] end
  end

  #
  # Shell command to re-run in mapreduce mode using --map and --reduce
  #
  # Raises RuntimeError when the run mode is not local, hadoop or mapred.
  #
  def runner_command input_path, output_path
    # run as either local or hadoop
    case run_mode
    when 'local'
      $stderr.puts "  Reading STDIN / Writing STDOUT"
      local_command input_path, output_path
    when 'hadoop', 'mapred'
      $stderr.puts "  Launching hadoop as"
      hadoop_command input_path, output_path
    else
      raise "Need to use --run=local or --run=hadoop; or to use the :default_run_mode in config.yaml just say --run "
    end
  end

  #
  # Run mode as a string. A bare --run is replaced (in the options hash) by
  # the configured :default_run_mode.
  #
  def run_mode
    # if only --run is given, assume default run mode
    options[:run] = Wukong::CONFIG[:default_run_mode] if (options[:run] == true)
    options[:run].to_s
  end

  #
  # [input_path, output_path] taken from the first two non-flag arguments.
  # Raises unless both are present or the :fake option is set.
  #
  def input_output_paths
    # input / output paths
    input_path, output_path = options[:rest][0..1]
    raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:fake]) && (input_path.blank? || output_path.blank?)
    [input_path, output_path]
  end

  #
  # With --overwrite or --rm, remove the output path through hdp-rm before
  # running; never done in local mode.
  #
  def maybe_overwrite_output_paths! output_path
    if (options[:overwrite] || options[:rm]) && (run_mode != 'local')
      $stderr.puts "Removing output file #{output_path}"
      # NOTE(review): the path is interpolated into a shell command; a path
      # containing a single quote will break the quoting.
      `hdp-rm -r '#{output_path}'`
    end
  end

  #
  # Execute the runner phase: build the runner command, echo it to stderr and,
  # unless the :fake option is set, run it and print its output to stdout.
  #
  def exec_hadoop_streaming
    $stderr.puts "Streaming on self"
    input_path, output_path = input_output_paths
    maybe_overwrite_output_paths! output_path
    command = runner_command(input_path, output_path)
    $stderr.puts command
    if ! options[:fake]
      $stdout.puts `#{command}`
    end
  end

  #
  # If --map or --reduce, dispatch to the mapper or reducer.
  # If --run, launch the runner phase.
  # Otherwise, print the usage message.
  #
  def run
    case
    when options[:map]
      mapper_klass.new(self.options).stream
    when options[:reduce]
      reducer_klass.new(self.options).stream
    when options[:run]
      exec_hadoop_streaming
    else
      self.help # Norman Vincent Peale is proud of you
    end
  end

  #
  # Command line usage, written to stderr
  #
  def help
    $stderr.puts "#{self.class} script"
    $stderr.puts %Q{
      #{$0} --run=hadoop input_hdfs_path output_hdfs_dir # run the script with hadoop streaming
      #{$0} --run=local  input_hdfs_path output_hdfs_dir # run the script on local filesystem using unix pipes
      #{$0} --run        input_hdfs_path output_hdfs_dir # run the script with the mode given in config/wukong*.yaml
      #{$0} --map
      #{$0} --reduce                                     # dispatch to the mapper or reducer

    You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
    }
  end
end
264
+
265
+ end