wukong 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,47 @@
1
+ require 'wukong/extensions/class'
2
+ module Wukong
3
+
4
+ module HashlikeClass
5
+ module ClassMethods
6
+ def has_members *members
7
+ self.members ||= []
8
+ self.members = members.map(&:to_s) + self.members
9
+ self.members.each do |member|
10
+ attr_accessor member.to_sym
11
+ end
12
+ end
13
+ alias_method :has_member, :has_members
14
+ def keys
15
+ members
16
+ end
17
+ end
18
+
19
+ def [](key)
20
+ self.send(key)
21
+ end
22
+
23
+ def []=(key, val)
24
+ self.send("#{key}=", val)
25
+ end
26
+
27
+ def to_a
28
+ values_of(*members)
29
+ end
30
+
31
+ def to_flat
32
+ to_a.map(&:to_flat).flatten
33
+ end
34
+
35
+ def self.included base
36
+ base.class_eval do
37
+ extend ClassMethods
38
+ include HashLike
39
+ class_inheritable_accessor :members
40
+
41
+ def to_hash *args
42
+ super(*args).merge 'type' => self.class.to_s
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,2 @@
1
+ require 'extlib/object'
2
+ require 'extlib/module'
@@ -0,0 +1,27 @@
1
+ require 'pathname'
2
+ class Pathname
3
+ # Append path segments and expand to absolute path
4
+ #
5
+ # file = Pathname(Dir.pwd) / "subdir1" / :subdir2 / "filename.ext"
6
+ #
7
+ # @param [Pathname, String, #to_s] path path segment to concatenate with receiver
8
+ #
9
+ # @return [Pathname]
10
+ # receiver with _path_ appended and expanded to an absolute path
11
+ #
12
+ # @api public
13
+ def /(path)
14
+ (self + path).expand_path
15
+ end
16
+
17
+ def self.[](*vals)
18
+ new( File.join(vals) )
19
+ end
20
+ end
21
+
22
+ class Subdir < Pathname
23
+ def self.[](*vals)
24
+ dir = File.dirname(vals.shift)
25
+ new(File.join(dir, *vals))
26
+ end
27
+ end
@@ -0,0 +1,65 @@
1
+ #
2
+ # String Monkeypatched for processing with wukong: see wukong/extensions/string
3
+ #
4
+ String.class_eval do
5
+ # By default, +camelize+ converts strings to UpperCamelCase. If the argument to +camelize+
6
+ # is set to <tt>:lower</tt> then +camelize+ produces lowerCamelCase.
7
+ #
8
+ # +camelize+ will also convert '/' to '::' which is useful for converting paths to namespaces.
9
+ #
10
+ # Examples:
11
+ # "active_record".camelize # => "ActiveRecord"
12
+ # "active_record".camelize(:lower) # => "activeRecord"
13
+ # "active_record/errors".camelize # => "ActiveRecord::Errors"
14
+ # "active_record/errors".camelize(:lower) # => "activeRecord::Errors"
15
# Converts an underscored string to CamelCase, turning '/' into '::'.
#
# @param first_letter_in_uppercase [Boolean, Symbol] pass +:lower+ (or a
#   falsy value) to keep the string's own first character instead of the
#   upcased one, giving lowerCamelCase for lowercase input; anything else
#   yields UpperCamelCase.
# @return [String] the camelized copy; the receiver is not modified.
def camelize(first_letter_in_uppercase = true)
  camelized = to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
  # :lower is a truthy Symbol, so it has to be tested for explicitly;
  # a plain truthiness check would always take the UpperCamelCase path.
  wants_lower = (first_letter_in_uppercase == :lower) || !first_letter_in_uppercase
  return camelized unless wants_lower
  # Core-Ruby slicing (no String#first dependency); .to_s keeps the empty
  # string from raising on a nil slice.
  self[0, 1].to_s + camelized[1..-1].to_s
end
22
+
23
+ #
24
+ # The reverse of +camelize+. Makes an underscored, lowercase form from the expression in the string.
25
+ #
26
+ # Changes '::' to '/' to convert namespaces to paths.
27
+ #
28
+ # Examples:
29
+ # "ActiveRecord".underscore # => "active_record"
30
+ # "ActiveRecord::Errors".underscore # => active_record/errors
31
+ #
32
+ # Stolen from active_support
33
+ #
34
+ def underscore
35
+ gsub(/::/, '/').
36
+ gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
37
+ gsub(/([a-z\d])([A-Z])/,'\1_\2').
38
+ tr("-", "_").
39
+ downcase
40
+ end
41
+
42
+ # Tries to find a constant with the name specified in the argument string:
43
+ #
44
+ # "Module".constantize # => Module
45
+ # "Test::Unit".constantize # => Test::Unit
46
+ #
47
+ # The name is assumed to be the one of a top-level constant, no matter whether
48
+ # it starts with "::" or not. No lexical context is taken into account:
49
+ #
50
+ # C = 'outside'
51
+ # module M
52
+ # C = 'inside'
53
+ # C # => 'inside'
54
+ # "C".constantize # => 'outside', same as ::C
55
+ # end
56
+ #
57
+ # NameError is raised when the name is not in CamelCase or the constant is
58
+ # unknown.
59
+ def constantize
60
+ unless /\A(?:::)?([A-Z]\w*(?:::[A-Z]\w*)*)\z/ =~ self
61
+ raise NameError, "#{self.inspect} is not a valid constant name!"
62
+ end
63
+ Object.module_eval("::#{$1}", __FILE__, __LINE__)
64
+ end
65
+ end
@@ -0,0 +1,17 @@
1
+ require 'wukong/extensions/hash'
2
+ require 'wukong/extensions/hash_like'
3
+ require 'wukong/extensions/symbol'
4
+
5
+ #
6
+ # extensions/struct
7
+ #
8
+ # Add several methods to make a struct duck-type much more like a Hash
9
+ #
10
+ Struct.class_eval do
11
+ include Wukong::HashLike
12
+ def self.keys
13
+ members
14
+ end
15
+ end
16
+
17
+
@@ -0,0 +1,11 @@
1
+ #
2
+ # h2. extensions/symbol.rb -- extensions to symbol class
3
+ #
4
class Symbol
  #
  # Turn the symbol into a simple proc (stolen from
  # <tt>ActiveSupport::CoreExtensions::Symbol</tt>).
  #
  # Only defined when the running interpreter does not already provide
  # Symbol#to_proc, so a native implementation is never replaced by this
  # fallback.
  unless method_defined?(:to_proc)
    def to_proc
      Proc.new { |*args| args.shift.__send__(self, *args) }
    end
  end
end
@@ -0,0 +1,53 @@
1
+ module Wukong
2
+ # Common logger
3
+ #
4
+ # Set your own at any time with
5
+ # Wukong.logger = YourAwesomeLogger.new(...)
6
+ # If you have log4r installed you can use
7
+ # Wukong.logger = Wukong.default_log4r_logger
8
+ #
9
+ # If Wukong.logger is too much typing for you,
10
+ # use the Log constant
11
+ #
12
+ # Default format:
13
+ # I, [2009-07-26T19:58:46-05:00 #12332]: Up to 2000 char message
14
+ #
15
+ def self.logger
16
+ @logger ||= default_ruby_logger
17
+ end
18
+
19
+ #
20
+ # Log4r logger, set up to produce tab-delimited (and thus, wukong|hadoop
21
+ # friendly) output lines
22
+ #
23
+ def self.default_log4r_logger logger_handle='wukong'
24
+ require 'log4r'
25
+ lgr = Log4r::Logger.new logger_handle
26
+ outputter = Log4r::Outputter.stderr
27
+ # Define timestamp formatter method
28
+ ::Time.class_eval do def utc_iso8601() utc.iso8601 ; end ; end
29
+ # 2009-07-25T00:12:05Z INFO PID\t
30
+ outputter.formatter = Log4r::PatternFormatter.new(
31
+ :pattern => "%d %.4l #{Process.pid}\t%.2000m",
32
+ :date_method => :utc_iso8601
33
+ )
34
+ lgr.outputters = outputter
35
+ lgr
36
+ end
37
+
38
+ def self.default_ruby_logger
39
+ require 'logger'
40
+ Logger.new STDERR
41
+ end
42
+
43
+ def self.logger= logger
44
+ @logger = logger
45
+ end
46
+ end
47
+
48
+ #
49
+ # A convenient logger.
50
+ #
51
+ # Define NO_WUKONG_LOG (or define Log yourself) to prevent its creation
52
+ #
53
+ Log = Wukong.logger unless (defined?(Log) || defined?(NO_WUKONG_LOG))
@@ -0,0 +1,27 @@
1
+
2
+ module Wukong
3
+ module Models
4
+ class Edge < TypedStruct.new(
5
+ [:src, Integer],
6
+ [:dest, Integer]
7
+ )
8
+ end
9
+
10
+ class MultiEdge < TypedStruct.new(
11
+ [:src, Integer],
12
+ [:dest, Integer],
13
+ [:a_follows_b, Integer],
14
+ [:b_follows_a, Integer],
15
+ [:a_replies_b, Integer],
16
+ [:b_replies_a, Integer],
17
+ [:a_atsigns_b, Integer],
18
+ [:b_atsigns_a, Integer],
19
+ [:a_retweets_b, Integer],
20
+ [:b_retweets_a, Integer],
21
+ [:a_favorites_b, Integer],
22
+ [:b_favorites_a, Integer]
23
+ )
24
+ end
25
+
26
+ end
27
+ end
data/lib/wukong/rdf.rb ADDED
@@ -0,0 +1,104 @@
1
+ module Wukong
2
+ #
3
+ # Dump wukong object as RDF triples:
4
+ #
5
+ # <key attr val module Wukong
6
+ #
7
+ # Dump wukong object as RDF triples:
8
+ #
9
+ # <key> <attr> <val> # <extra>
10
+ #
11
+ # Each element of the triple is XML encoded such that it contains no tab,
12
+ # newline or carriage returns, and the three are tab-separated. Any extra
13
+ # fields -- reification info, for instance -- are appended as a comment.
14
+ #
15
+ # This makes the result not only a valid RDF triple file but perfectly
16
+ # palatable to Wukong for further processing.
17
+ #
18
+ module Rdf
19
+
20
+ #
21
+ # RDF-formatted date
22
+ #
23
+ def self.encode_datetime dt
24
+ DateTime.parse_safely(dt).xmlschema
25
+ end
26
+
27
+ #
28
+ # Emit a component (subject or object) with the right semantic encoding
29
+ #
30
+ # Use :boolskip if a false property should just be left out.
31
+ #
32
+ def rdf_component val, type
33
+ case type
34
+ when :tweet then %Q{<http://twitter.com/statuses/show/#{val}.xml>}
35
+ when :user then %Q{<http://twitter.com/users/show/#{val}.xml>}
36
+ when :bool then ((!val) || (val==0) || (val=="0")) ? '"false"^^<xsd:boolean>' : '"true"^^<xsd:boolean>'
37
+ when :boolskip then ((!val) || (val==0) || (val=="0")) ? nil : '"true"^^<xsd:boolean>'
38
+ when :int then %Q{"#{val.to_i}"^^<xsd:integer>}
39
+ when :date then %Q{"#{TwitterRdf.encode_datetime(val)}"^^<xsd:dateTime>}
40
+ when :str then %Q{"#{val}"}
41
+ else raise "Don't know how to encode #{type}"
42
+ end
43
+ end
44
+
45
+ #
46
+ # Express relationship (predicate) in RDF
47
+ #
48
+ def rdf_pred pred
49
+ case pred
50
+ when :created_at then %Q{<http://twitter.com/##{pred}>}
51
+ else %Q{<http://twitter.com/##{pred}>}
52
+ end
53
+ end
54
+
55
+ #
56
+ # RDF Triple string for the given (subject, object, predicate)
57
+ # http://www.w3.org/TR/rdf-testcases/#ntriples
58
+ #
59
+ def self.rdf_triple subj, pred, obj, comment=nil
60
+ comment = "\t# " + comment.to_s unless comment.blank?
61
+ %Q{%-55s\t%-39s\t%-23s\t.%s} % [subj, pred, obj, comment]
62
+ end
63
+
64
+ def mutable?(attr)
65
+ false
66
+ end
67
+
68
+ #
69
+ # Extract [subject, predicate, object, (extra)] tuples.
70
+ #
71
+ # (extra) is set to +scraped at+ for #mutable? attributes, blank otherwise.
72
+ #
73
+ def to_rdf3_tuples
74
+ members_with_types.map do |attr, type|
75
+ next if self[attr].blank?
76
+ subj = rdf_resource
77
+ pred = rdf_pred(attr)
78
+ obj = rdf_component(self[attr], type) or next
79
+ comment = scraped_at if mutable?(attr)
80
+ [subj, pred, obj, comment]
81
+ end.compact
82
+ end
83
+
84
+ #
85
+ # Convert an object to an rdf triple.
86
+ #
87
+ # Appends scraped at to #mutable? attributes
88
+ #
89
+ def to_rdf3
90
+ to_rdf3_tuples.map do |tuple|
91
+ self.class.rdf_triple tuple
92
+ end.join("\n")
93
+ end
94
+
95
+ end
96
+ end
97
+ >
98
+ #
99
+ #
100
+ module Rdf
101
+ def to_rdf
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,37 @@
1
+ module Wukong
2
+ #
3
+ # Export model's structure for other data frameworks:
4
+ # SQL and Pig
5
+ #
6
+ module Schema
7
+ def to_sql
8
+ end
9
+
10
+
11
+ # Export schema as Pig
12
+ def to_pig
13
+ members.zip(mtypes).map do |member, type|
14
+ member.to_s + ': ' + type.to_pig
15
+ end.join(', ')
16
+ end
17
+
18
+ def pig_klass
19
+ self.to_s.gsub(/.*::/, '')
20
+ end
21
+
22
+ def pig_load filename=nil
23
+ cmd = [
24
+ "%-23s" % pig_klass,
25
+ "= LOAD", filename || pig_klass.underscore.pluralize,
26
+ "AS ( rsrc:chararray,", self.to_pig, ')',
27
+ ].join(" ")
28
+ end
29
+ end
30
+ end
31
+
32
# Date is referenced below, so make sure the stdlib class is loaded.
require 'date'

# Pig type name for each Ruby class usable as a member type.
class << Integer ; def to_pig() 'int'       end ; end
# Bignum only gets its own mapping where it is a distinct class. On
# interpreters where it is merely an alias of Integer this would overwrite
# Integer.to_pig with 'long', and where the constant is gone entirely the
# bare reference would raise NameError.
if Object.const_defined?(:Bignum) && !Object.const_get(:Bignum).equal?(Integer)
  class << Object.const_get(:Bignum) ; def to_pig() 'long' end ; end
end
class << Float   ; def to_pig() 'float'     end ; end
class << String  ; def to_pig() 'chararray' end ; end
class << Symbol  ; def to_pig() self        end ; end
class << Date    ; def to_pig() 'long'      end ; end
@@ -0,0 +1,265 @@
1
+ require 'pathname'
2
+ require 'wukong/script/hadoop_command'
3
+ require 'wukong/script/local_command'
4
+ require 'rbconfig'
5
+ module Wukong
6
+
7
+ # == How to run a Wukong script
8
+ #
9
+ # your/script.rb --run path/to/input_files path/to/output_dir
10
+ #
11
+ # All of the file paths are HDFS paths ; your script path, of course, is on the local filesystem.
12
+ #
13
+ # == Command-line options
14
+ #
15
+ # If you'd like to listen for any command-line options, specify them at the
16
+ # command line:
17
+ #
18
+ # your/script.rb --my_bool_opt --my_val_taking_opt=val \
19
+ # --run path/to/input_files path/to/output_dir
20
+ #
21
+ # In this case the options hash for both Mapper and Reducer will contain
22
+ #
23
+ # :my_bool_opt => true,
24
+ # :my_val_taking_opt => 'val'
25
+ #
26
+ # == Complicated input paths
27
+ #
28
+ # To use more than one file as input, you can use normal * ? [] wildcards or
29
+ # give a comma-separated list -- see the hadoop documentation for syntax.
30
+ #
31
+ # == Run locally (--run=local)
32
+ #
33
+ # To run your script locally, use --run=local
34
+ #
35
+ # your/script.rb --run=local path/to/input_files path/to/output_dir
36
+ #
37
+ # This will pipe the contents of path/to/input_files through first your
38
+ # mapper, then sort, then the reducer, storing the results in the given output
39
+ # directory.
40
+ #
41
+ # All paths refer to the /local/ filesystem -- hadoop is never involved and in
42
+ # fact doesn't even have to be installed.
43
+ #
44
+ # == How to test your scripts
45
+ #
46
+ # You can supply the --map argument in place of --run to run the mapper on its
47
+ # own (and similarly, --reduce to run the reducer standalone):
48
+ #
49
+ # cat ./local/test/input.tsv | ./examples/word_count.rb --map | more
50
+ #
51
+ # or, if your test data lies on the HDFS,
52
+ #
53
+ # hdp-cat test/input.tsv | ./examples/word_count.rb --map | more
54
+ #
55
+ #
56
+ class Script
57
+ include Wukong::HadoopCommand
58
+ include Wukong::LocalCommand
59
+ attr_accessor :mapper_klass, :reducer_klass, :options
60
+
61
+ #
62
+ # Instantiate the Script with the Mapper and the Reducer class (each a
63
+ # Wukong::Streamer) it should call back.
64
+ #
65
+ #
66
+ # == Identity or External program as map or reduce
67
+ #
68
+ # To use the identity reducer ('cat'), instantiate your Script class with
69
+ # +nil+ as the reducer class. (And similarly to use an identity mapper,
70
+ # supply +nil+ for the mapper class.)
71
+ #
72
+ # To use an external program as your reducer (mapper), subclass the
73
+ # reduce_command (map_command) method to return the full command line
74
+ # expression to call.
75
+ #
76
+ # class MyMapper < Wukong::Streamer::Base
77
+ # # ... awesome stuff ...
78
+ # end
79
+ #
80
+ # class MyScript < Wukong::Script
81
+ # # prefix each unique line with the count of its occurrences.
82
+ # def reduce_command
83
+ # '/usr/bin/uniq -c'
84
+ # end
85
+ # end
86
+ # MyScript.new(MyMapper, nil).run
87
+ #
88
+ def initialize mapper_klass, reducer_klass, extra_options={}
89
+ self.options = default_options.merge(extra_options)
90
+ process_argv!
91
+ self.mapper_klass = mapper_klass
92
+ self.reducer_klass = reducer_klass
93
+ # If no reducer_klass and no reduce_command, then skip the reduce phase
94
+ options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
95
+ end
96
+
97
+ #
98
+ # Gives default options. Command line parameters take precedence
99
+ #
100
+ # MAKE SURE YOU CALL SUPER: write your script according to the pattern
101
+ #
102
+ # super.merge :my_option => :val
103
+ #
104
+ def default_options
105
+ Wukong::CONFIG[:runner_defaults] || {}
106
+ end
107
+
108
+ # Options that don't need to go in the :all_args hash
109
+ def std_options
110
+ @std_options ||= [:run, :map, :reduce, ] + HADOOP_OPTIONS_MAP.keys
111
+ end
112
+
113
+ #
114
+ # Parse the command-line args into the options hash.
115
+ #
116
+ # I should not reinvent the wheel.
117
+ # Yet: here we are.
118
+ #
119
+ # '--foo=foo_val' produces :foo => 'foo_val' in the options hash.
120
+ # '--' After seeing a non-'--' flag, or a '--' on its own, no further flags are parsed
121
+ #
122
+ # options[:all_args] contains all arguments that are not in std_options
123
+ # options[:rest] contains all arguments following the first non-flag (or the '--')
124
+ #
125
+ def process_argv!
126
+ options[:all_args] = []
127
+ args = ARGV.dup
128
+ while args do
129
+ arg = args.shift
130
+ case
131
+ when arg == '--'
132
+ break
133
+ when arg =~ /\A--(\w+)(?:=(.+))?\z/
134
+ opt, val = [$1, $2]
135
+ opt = opt.to_sym
136
+ val ||= true
137
+ self.options[opt] = val
138
+ options[:all_args] << arg unless std_options.include?(opt)
139
+ else
140
+ args.unshift(arg) ; break
141
+ end
142
+ end
143
+ options[:all_args] = options[:all_args].join(" ")
144
+ options[:rest] = args
145
+ end
146
+
147
+ def this_script_filename
148
+ Pathname.new($0).realpath
149
+ end
150
+
151
+ def ruby_interpreter_path
152
+ Pathname.new(
153
+ File.join(Config::CONFIG["bindir"],
154
+ Config::CONFIG["RUBY_INSTALL_NAME"]+
155
+ Config::CONFIG["EXEEXT"])
156
+ ).realpath
157
+ end
158
+
159
+ #
160
+ # by default, call this script in --map mode
161
+ #
162
+ def map_command
163
+ case
164
+ when mapper_klass
165
+ "#{ruby_interpreter_path} #{this_script_filename} --map " + options[:all_args]
166
+ else options[:map_command] || Wukong::CONFIG[:default_mapper] end
167
+ end
168
+
169
+ #
170
+ # Shell command for reduce phase
171
+ # by default, call this script in --reduce mode
172
+ #
173
+ def reduce_command
174
+ case
175
+ when reducer_klass
176
+ "#{ruby_interpreter_path} #{this_script_filename} --reduce " + options[:all_args]
177
+ else options[:reduce_command] || Wukong::CONFIG[:default_reducer] end
178
+ end
179
+
180
+ #
181
+ # Shell command to re-run in mapreduce mode using --map and --reduce
182
+ #
183
+ def runner_command input_path, output_path
184
+ # run as either local or hadoop
185
+ case run_mode
186
+ when 'local'
187
+ $stderr.puts " Reading STDIN / Writing STDOUT"
188
+ command = local_command input_path, output_path
189
+ when 'hadoop', 'mapred'
190
+ $stderr.puts " Launching hadoop as"
191
+ command = hadoop_command input_path, output_path
192
+ else
193
+ raise "Need to use --run=local or --run=hadoop; or to use the :default_run_mode in config.yaml just say --run "
194
+ end
195
+ end
196
+
197
+ def run_mode
198
+ # if only --run is given, assume default run mode
199
+ options[:run] = Wukong::CONFIG[:default_run_mode] if (options[:run] == true)
200
+ options[:run].to_s
201
+ end
202
+
203
+ def input_output_paths
204
+ # input / output paths
205
+ input_path, output_path = options[:rest][0..1]
206
+ raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:fake]) && (input_path.blank? || output_path.blank?)
207
+ [input_path, output_path]
208
+ end
209
+
210
+ def maybe_overwrite_output_paths! output_path
211
+ if (options[:overwrite] || options[:rm]) && (run_mode != 'local')
212
+ $stderr.puts "Removing output file #{output_path}"
213
+ `hdp-rm -r '#{output_path}'`
214
+ end
215
+ end
216
+
217
+ #
218
+ # Execute the runner phase
219
+ #
220
+ def exec_hadoop_streaming
221
+ $stderr.puts "Streaming on self"
222
+ input_path, output_path = input_output_paths
223
+ maybe_overwrite_output_paths! output_path
224
+ command = runner_command(input_path, output_path)
225
+ $stderr.puts command
226
+ if ! options[:fake]
227
+ $stdout.puts `#{command}`
228
+ end
229
+ end
230
+
231
+ #
232
+ # If --map or --reduce, dispatch to the mapper or reducer.
233
+ # Otherwise,
234
+ #
235
+ def run
236
+ case
237
+ when options[:map]
238
+ mapper_klass.new(self.options).stream
239
+ when options[:reduce]
240
+ reducer_klass.new(self.options).stream
241
+ when options[:run]
242
+ exec_hadoop_streaming
243
+ else
244
+ self.help # Normant Vincent Peale is proud of you
245
+ end
246
+ end
247
+
248
+ #
249
+ # Command line usage
250
+ #
251
+ def help
252
+ $stderr.puts "#{self.class} script"
253
+ $stderr.puts %Q{
254
+ #{$0} --run=hadoop input_hdfs_path output_hdfs_dir # run the script with hadoop streaming
255
+ #{$0} --run=local input_hdfs_path output_hdfs_dir # run the script on local filesystem using unix pipes
256
+ #{$0} --run input_hdfs_path output_hdfs_dir # run the script with the mode given in config/wukong*.yaml
257
+ #{$0} --map
258
+ #{$0} --reduce # dispatch to the mapper or reducer
259
+
260
+ You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
261
+ }
262
+ end
263
+ end
264
+
265
+ end