mrflip-wukong 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,24 @@
1
module Wukong
  # Cache of resource name (String) => resolved class, or nil for names that
  # could not be resolved. Keyed by the name exactly as handed to
  # class_from_resource (after to_s), so it may be pre-seeded.
  RESOURCE_CLASS_MAP = { }

  #
  # Find the class from its underscored name. Note the klass is non-modularized.
  # You can also pre-seed RESOURCE_CLASS_MAP
  #
  #   Wukong.class_from_resource(:twitter_user) # => TwitterUser
  #
  # @param rsrc [#to_s] underscored resource name
  # @return [Class, nil] the resolved constant, or nil (with a warning on
  #   stderr) if no such constant exists. Failures are cached too, so the
  #   warning is emitted only once per name.
  #
  def self.class_from_resource rsrc
    # This method has been profiled, so don't go making it more elegant unless you're doing same.
    rsrc_name = rsrc.to_s
    return RESOURCE_CLASS_MAP[rsrc_name] if RESOURCE_CLASS_MAP.include?(rsrc_name)
    # Camelize into a fresh string: rsrc.to_s may be the caller's own (possibly
    # frozen) String, and the cache must stay keyed by the name that was asked for.
    klass_name = rsrc_name.gsub(/(?:^|_)(.)/){ $1.upcase }
    begin
      # convert it to class name
      klass = Object.const_get(klass_name)
    rescue StandardError, LoadError => e
      # Best effort: an unknown name is reported and cached as nil. Interrupt,
      # SystemExit and friends are deliberately not rescued.
      warn "Bogus class name '#{klass_name}'? #{e}"
      klass = nil
    end
    RESOURCE_CLASS_MAP[rsrc_name] = klass
  end

end
@@ -0,0 +1,123 @@
1
module Wukong
  module Datatypes
    #
    # Infinity is bigger than any number
    #
    Infinity = 1.0/0

    #
    # A simple enumerated class
    #
    #   class MyEnum < Enum
    #     enumerates :firefox, :safari, :ie, :chrome, :other
    #   end
    #   MyEnum[1].to_s # => "safari"
    #
    class Enum
      # val is the index into the class' names
      attr_accessor :val
      # names is the list of string names, set by Enum.enumerates
      class_inheritable_accessor :names

      def initialize val
        self.val = val
      end

      # MyEnum[val] is sugar for MyEnum.new(val)
      def self.[] *args
        new(*args)
      end

      def to_i
        val
      end

      # The name for this value; nil if val is nil
      def to_s
        return nil if val.nil?
        self.class.names[val]
      end

      def inspect
        "<#{self.class.to_s} #{to_i} (#{to_s})>"
      end

      # returns the value corresponding to that string representation
      def index *args
        # delegate
        self.class.names.index(*args)
      end

      def to_flat
        to_s #to_i
      end

      #
      # Use enumerates to set the class' names
      #
      #   class MyEnum < Enum
      #     enumerates :firefox, :safari, :ie, :chrome, :other
      #   end
      #   MyEnum[1].to_s # => "safari"
      #
      def self.enumerates *names
        self.names = names.map(&:to_s)
      end

      def self.to_sql_str
        "ENUM('#{names.join("', '")}')"
      end

      def self.typify
        'chararray'
      end
    end


    #
    # An Enum whose value is the index of the bin a number falls into: the
    # first bin whose upper bound is >= the number. Numbers above every upper
    # bound get index bins.length.
    #
    class Binned < Enum
      class_inheritable_reader :bins, :empty_bin_name
      # NOTE(review): nothing visible here reads this class variable; it looks
      # like it was meant to be the default for empty_bin_name -- confirm.
      @@empty_bin_name = '(none)'

      # The upper bounds of each bin (the leading lower bound is not included)
      def bins
        self.class.bins
      end

      # FIXME -- doesn't respect a lower bound.
      def initialize val
        return super(val) if val.nil?
        bins.each_with_index do |bin_top, idx|
          return super(idx) if val <= bin_top
        end
        super(bins.length)
      end

      #
      # Give the lower bound followed by the upper bound of each bin, with an
      # optional trailing options hash (handed on to bin_name).
      #
      def self.enumerates *bins
        options = bins.extract_options!
        # Peel off the lower bound *before* storing: the stored bins must hold
        # only the upper bounds so that their indexes line up with the names.
        last_top = bins.shift
        write_inheritable_attribute :bins, bins
        # bins.unshift bins.first if last_top == -Infinity
        names = bins.map do |bin_top|
          name = bin_name last_top, bin_top, options
          last_top = (last_top.is_a?(Integer) ? bin_top + 1 : bin_top)
          name
        end
        super(*names)
      end

      #
      # Human-readable label for the bin running from lo_val to hi_val
      #
      def self.bin_name lo_val, hi_val, options = { }
        # case lo_val
        # when Integer then lo_val = [lo_val+1, hi_val].compact.min
        # end
        case
        when lo_val == -Infinity then "< #{hi_val}"
        when hi_val == Infinity  then "#{lo_val}+"
        when (lo_val == hi_val)  then lo_val
        else                          "#{lo_val} - #{hi_val}"
        end
      end

    end
  end
end
123
+
@@ -0,0 +1,80 @@
1
require 'shellwords'
require 'time' # ain't it always that way
2
module Wukong
  module Dfs
    # Type used for the HFile date field. Bignum is gone from recent rubies
    # (it was merged into Integer), so fall back to Integer when it is missing.
    BIG_INTEGER_TYPE = Object.const_defined?(:Bignum) ? Object.const_get(:Bignum) : Integer

    #
    # List the files at dfs_path by shelling out to `hadoop dfs -ls`.
    # Returns one HFile per line of the listing (the "Found N items" banner
    # is dropped).
    #
    def self.list_files dfs_path
      Wukong.logger.info{ "DFS: listing #{dfs_path}" }
      # the path is shell-escaped so spaces and metacharacters reach hadoop intact
      listing = `hadoop dfs -ls #{Shellwords.escape(dfs_path.to_s)}`.split("\n").reject{|ls_line| ls_line =~ /Found \d+ items/i}
      listing.map{|ls_line| HFile.new_from_ls(ls_line)}
    end

    #
    # Walk the (sorted) local src_files and yield, for each,
    #   :missing, src_file, nil        -- no dest file has the same basename
    #   :differ,  src_file, dest_file  -- the dest file is not kinda_equal
    #   :same,    src_file, dest_file  -- otherwise
    #
    # FIXME -- this will fail if multiple files in a listing have the
    # same basename. Sorry.
    #
    def self.compare_listings src_files, dest_files, &block
      # index the dest files by basename once; the first one listed wins
      dest_by_basename = { }
      dest_files.each{|df| dest_by_basename[df.basename] ||= df }
      src_files.sort.each do |src_file|
        dest_file = dest_by_basename[File.basename(src_file)]
        case
        when (! dest_file)                        then yield :missing, src_file, nil
        when (! dest_file.kinda_equal(src_file))  then yield :differ,  src_file, dest_file
        else                                           yield :same,    src_file, dest_file
        end
      end
    end

    class HFile < TypedStruct.new(
        [:mode_str, String],
        [:i_count,  String],
        [:owner,    String],
        [:group,    String],
        [:size,     Integer],
        [:date,     BIG_INTEGER_TYPE],
        [:path,     String]
        )
      #
      # Build an HFile from one line of `hadoop dfs -ls` output: eight
      # whitespace-separated fields, the last of which is the path.
      #
      def self.new_from_ls ls_line
        # limit the split so a path containing whitespace stays in one piece
        mode, ic, o, g, sz, dt, tm, path = ls_line.chomp.split(/\s+/, 8)
        date = Time.parse("#{dt} #{tm}").utc.to_flat
        new mode, ic.to_i, o, g, sz.to_i, date, path
      end

      def dirname
        @dirname ||= File.dirname(path)
      end

      def basename
        @basename ||= File.basename(path)
      end

      #
      # Two files are kinda_equal if they match in size and if
      # the hdfs version is later than the filesystem version.
      # (Only the size is checked at present; file is a local path.)
      #
      def kinda_equal file
        (self.size == File.size(file)) # && (self.date >= File.mtime(file).utc.to_flat)
      end

      def to_s
        to_a.join("\t")
      end

      #
      # These will be very slow.
      # If some kind soul will integrate JRuby callouts the bards shall
      # celebrate your name evermore.
      #

      # rename the file on the HDFS
      def mv new_filename
        self.class.run_dfs_command :mv, path, new_filename
      end

      def self.mkdir dirname
        run_dfs_command :mkdir, dirname
      end
      def self.mkdir_p(*args) self.mkdir(*args) ; end # HDFS is always -p

      #
      # Run `hadoop dfs -<subcommand> <operands...>`. The first argument is the
      # subcommand; the rest are its operands, each shell-escaped. The command's
      # output (stdout and stderr) is logged at info level.
      #
      def self.run_dfs_command *args
        subcommand, *operands = args.flatten.compact
        cmd = (["hadoop dfs -#{subcommand}"] + operands.map{|operand| Shellwords.escape(operand.to_s) }).join(" ")
        Wukong.logger.debug{ "DFS: Running #{cmd}" }
        # Run the command outside the logger block: a logger that evaluates its
        # block lazily would otherwise skip the command whenever info-level
        # logging is switched off.
        output = `#{cmd} 2>&1`
        Wukong.logger.info{ output.gsub(/[\r\n\t]+/, " ") }
      end

    end
  end
end
@@ -0,0 +1,111 @@
1
+ require 'htmlentities'
2
+ require 'addressable/uri'
3
+
4
module Wukong
  #
  # By default (or explicitly with the :xml strategy), convert string to
  # * XML-encoded ASCII,
  #
  # * with a guarantee that the characters " quote, ' apos \\ backslash,
  #   carriage-return \r newline \n and tab \t (as well as all other control
  #   characters) are encoded.
  #
  # * Any XML-encoding in the original text is encoded with no introspection:
  #     encode_str("&lt;a href=\"foo\"&gt;")
  #     # => "&amp;lt;a href=&quot;foo&quot;&amp;gt;"
  #
  # * Useful: http://rishida.net/scripts/uniview/conversion.php
  #
  # With the :url strategy,
  # * URL-encode the string
  # * This is as strict as possible: encodes all but alphanumeric and _ underscore.
  #   The resulting string is thus XML- and URL-safe.
  #   http://addressable.rubyforge.org/api/classes/Addressable/URI.html#M000010
  #
  # Wukong.decode_str(Wukong.encode_str(str)) returns the original str
  #
  # If the encoder raises an ArgumentError, the result is instead the marker
  # '!!bad_encoding!! ' followed by str stripped of everything but word
  # characters, whitespace and . - @ # %. The str argument itself is never
  # modified.
  #
  # Raises a RuntimeError for an unknown strategy.
  #
  def self.encode_str str, strategy=:xml
    case strategy
    when :xml then self.html_encoder.encode(str, :basic, :named, :decimal).gsub(/\\/, '&#x5C;')
    when :url then Addressable::URI.encode_component(str, /[^\w]/)
    else raise "Don't know how to encode with strategy #{strategy}"
    end
  rescue ArgumentError
    # scrub a copy: the caller's string may be frozen or still in use
    '!!bad_encoding!! ' + str.gsub(/[^\w\s\.\-@#%]+/, '')
  end

  # HTMLEntities coder instance, shared by encode_str and decode_str
  def self.html_encoder
    @html_encoder ||= HTMLEntities.new
  end

  #
  # Decode string from its encode_str representation. This can include
  # dangerous things such as tabs, newlines, backslashes and cryptofascist
  # propaganda.
  #
  # Raises a RuntimeError for an unknown strategy.
  #
  def self.decode_str str, strategy=:xml
    case strategy
    when :xml then self.html_encoder.decode(str)
    when :url then Addressable::URI.unencode_component(str)
    else raise "Don't know how to decode with strategy #{strategy}"
    end
  end

  #
  # Replace each given field in the hash with its
  # encoded value. Fields whose value is nil or false are left alone.
  #
  def self.encode_components hsh, *fields
    fields.each do |field|
      hsh[field] = encode_str(hsh[field].to_s) if hsh[field]
    end
  end
end
69
+
70
String.class_eval do

  #
  # Strip control characters that might harsh our buzz, TSV-wise
  # See Wukong.encode_str
  #
  # Returns a new, encoded string; the receiver is left alone.
  #
  def wukong_encode
    Wukong.encode_str(self)
  end

  # In-place version of wukong_encode
  def wukong_encode!
    replace wukong_encode
  end

  #
  # Decode string into original (and possibly unsafe) form
  # See Wukong.encode_str and Wukong.decode_str
  #
  # Returns a new, decoded string; the receiver is left alone.
  #
  def wukong_decode
    Wukong.decode_str(self)
  end

  # In-place version of wukong_decode
  def wukong_decode!
    replace wukong_decode
  end

  #
  # Takes an XML-encoded or plaintext string and forces it into canonical encoding
  # (decodes it, then encodes the result).
  #
  def wukong_recode
    Wukong.encode_str(Wukong.decode_str(self))
  end

  # In-place version of wukong_recode
  def wukong_recode!
    replace wukong_recode
  end
end
106
+
107
Struct.class_eval do
  #
  # Recode, in place, every member value that responds to wukong_recode!.
  # Nil/false members and values that don't respond to it are left alone.
  #
  def recode!
    each_pair{|_member, val| val.wukong_recode! if (val && val.respond_to?(:wukong_recode!)) }
  end
end
@@ -0,0 +1,15 @@
1
+ #
2
+ # These pull in the minimal functionality of the extlib|activesupport family of
3
+ # gems.
4
+ #
5
+ require 'wukong/extensions/blank'
6
+ require 'wukong/extensions/class'
7
+ require 'wukong/extensions/symbol'
8
+ require 'wukong/extensions/hash'
9
+ require 'wukong/extensions/hash_like'
10
+ require 'wukong/extensions/array'
11
+ require 'wukong/extensions/struct'
12
+ require 'wukong/extensions/module'
13
+ require 'wukong/extensions/string'
14
+ require 'wukong/extensions/date_time'
15
+ require 'wukong/extensions/emittable'
@@ -0,0 +1,18 @@
1
class Array
  #
  # The following is taken in whole from the extlib gem. Thanks y'all.
  #

  # Extracts options from a set of arguments. Removes and returns the last
  # element in the array if it's a hash, otherwise returns a blank hash.
  #
  #   def options(*args)
  #     args.extract_options!
  #   end
  #
  #   options(1, 2)           # => {}
  #   options(1, 2, :a => :b) # => {:a=>:b}
  def extract_options!
    last.is_a?(::Hash) ? pop : {}
  end
end
@@ -0,0 +1,93 @@
1
+ #
2
+ # This is taken in whole from the extlib gem. Thanks y'all.
3
+ #
4
+
5
class Object
  ##
  # Returns true if the object is nil or empty (if applicable)
  #
  #   [].blank?         #=>  true
  #   [1].blank?        #=>  false
  #   [nil].blank?      #=>  false
  #
  # @return [TrueClass, FalseClass]
  #
  # @api public
  def blank?
    nil? || (respond_to?(:empty?) && empty?)
  end
end # class Object
20
+
21
class Numeric
  ##
  # Numerics are never blank
  #
  #   0.blank?          #=>  false
  #   1.blank?          #=>  false
  #   6.54321.blank?    #=>  false
  #
  # @return [FalseClass]
  #
  # @api public
  def blank?
    false
  end
end # class Numeric
36
+
37
class NilClass
  ##
  # Nil is always blank
  #
  #   nil.blank?        #=>  true
  #
  # @return [TrueClass]
  #
  # @api public
  def blank?
    true
  end
end # class NilClass
50
+
51
class TrueClass
  ##
  # True is never blank.
  #
  #   true.blank?       #=>  false
  #
  # @return [FalseClass]
  #
  # @api public
  def blank?
    false
  end
end # class TrueClass
64
+
65
class FalseClass
  ##
  # False is always blank.
  #
  #   false.blank?      #=>  true
  #
  # @return [TrueClass]
  #
  # @api public
  def blank?
    true
  end
end # class FalseClass
78
+
79
class String
  ##
  # Strips out whitespace then tests if the string is empty.
  #
  #   "".blank?         #=>  true
  #   "     ".blank?    #=>  true
  #   " hey ho ".blank? #=>  false
  #
  # @return [TrueClass, FalseClass]
  #
  # @api public
  def blank?
    strip.empty?
  end
end # class String