wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,24 @@
1
module Wukong
  # Cache of resource name (String) => resolved class, or nil when the name
  # could not be resolved. Callers may pre-seed it to bypass the lookup.
  RESOURCE_CLASS_MAP = { }

  #
  # Find the class from its underscored name, e.g. :foo_bar => FooBar.
  # Note the klass is non-modularized.
  # You can also pre-seed RESOURCE_CLASS_MAP
  #
  # rsrc:: anything responding to #to_s (typically a Symbol or String)
  #
  # Returns the class, or nil (after a warning) if the name does not resolve.
  # Results -- including failures -- are memoized under the resource name as
  # given, so a repeated lookup of the same name hits the cache.
  #
  def self.class_from_resource rsrc
    # This method has been profiled, so don't go making it more elegant unless you're doing same.
    resource_name = rsrc.to_s
    return RESOURCE_CLASS_MAP[resource_name] if RESOURCE_CLASS_MAP.include?(resource_name)
    # camelize into a fresh string: String#to_s returns the receiver itself,
    # so an in-place gsub! would rewrite (or choke on a frozen) caller string
    # and would also change the key the result gets cached under.
    klass_name = resource_name.gsub(/(?:^|_)(.)/){ $1.upcase }
    begin
      # convert it to class name
      klass = klass_name.constantize
    rescue StandardError, LoadError => e
      # deliberately best-effort: an unknown name yields nil, not a crash.
      # (Exception is not rescued so that signals and exit still propagate.)
      warn "Bogus class name '#{klass_name}'? #{e}"
      klass = nil
    end
    RESOURCE_CLASS_MAP[resource_name] = klass
  end

end
@@ -0,0 +1,123 @@
1
+ module Wukong
2
+ module Datatypes
3
+ #
4
+ # Infinity is bigger than any number
5
+ #
6
+ #
7
+ Infinity = 1.0/0
8
+
9
+
10
+ #
11
+ # A simple enumerated class
12
+ #
13
+ # class MyEnum < Enum
14
+ # enumerates :firefox, :safari, :ie, :chrome, :other
15
+ # end
16
+ # MyEnum[1].to_s # => "safari"
17
+ #
18
+ #
19
class Enum
  # val is the position of this value within the class-level names list
  attr_accessor :val
  # names holds the string names, set per subclass via .enumerates
  class_inheritable_accessor :names

  def initialize(val)
    self.val = val
  end

  # MyEnum[val] is sugar for MyEnum.new(val)
  def self.[](*args)
    new(*args)
  end

  # the raw stored value
  def to_i
    val
  end

  # the name at position val; nil (not "") when no value is set
  def to_s
    val.nil? ? nil : self.class.names[val]
  end

  def inspect
    "<#{self.class.to_s} #{to_i} (#{to_s})>"
  end

  # returns the value corresponding to that string representation
  # (delegates to the class-level names list)
  def index(*args)
    self.class.names.index(*args)
  end

  # flat-file representation: the name rather than the number
  def to_flat
    to_s
  end

  #
  # Use enumerates to set the class' names
  #
  #   class MyEnum < Enum
  #     enumerates :firefox, :safari, :ie, :chrome, :other
  #   end
  #   MyEnum[1].to_s # => "safari"
  #
  def self.enumerates(*names)
    self.names = names.map{|name| name.to_s }
  end

  # SQL column type listing every name
  def self.to_sql_str
    quoted_names = names.join("', '")
    "ENUM('#{quoted_names}')"
  end

  # type name returned for this class
  def self.typify
    'chararray'
  end
end
69
+
70
+
71
+ #
72
+ # Note that bin 0 is
73
+ #
74
class Binned < Enum
  class_inheritable_reader :bins, :empty_bin_name
  # NOTE(review): nothing visible here reads this class variable; the
  # empty_bin_name reader above is generated by class_inheritable_reader and
  # may not look at it -- confirm before relying on either.
  @@empty_bin_name = '(none)'

  # the bin boundaries registered on the class by .enumerates
  def bins
    self.class.bins
  end

  # Stores the index of the first bin whose top is >= val.
  # A nil val is stored as-is; a val above every bin top is stored as
  # bins.length.
  #
  # FIXME -- doesn't respect a lower bound.
  def initialize(val)
    return super(val) if val.nil?
    bins.each_with_index do |bin_top, idx|
      return super(idx) if val <= bin_top
    end
    super(bins.length)
  end

  #
  # Register the bin boundaries: the first argument is the lower bound, each
  # following argument the top of one bin. A trailing options hash is split
  # off and passed through to bin_name. One name is generated per bin top.
  #
  def self.enumerates(*bins)
    options = bins.extract_options!
    write_inheritable_attribute :bins, bins
    # NOTE(review): if write_inheritable_attribute keeps this very array (not
    # a copy), the shift below also removes the lower bound from the stored
    # bins; the index arithmetic in #initialize looks like it relies on that
    # -- confirm before reordering these two lines.
    last_top = bins.shift
    # bins.unshift bins.first if last_top == -Infinity
    names = bins.map do |bin_top|
      name = bin_name last_top, bin_top, options
      # integer bins are closed ranges, so the next bin starts one above this top
      last_top = (last_top.is_a?(Integer) ? bin_top + 1 : bin_top)
      name
    end
    super(*names)
  end

  #
  # Label for the bin running from lo_val to hi_val.
  # options is accepted but not consulted here.
  #
  def self.bin_name(lo_val, hi_val, options = { })
    # case lo_val
    # when Integer then lo_val = [lo_val+1, hi_val].compact.min
    # end
    case
    when lo_val == -Infinity then "< #{hi_val}"
    when hi_val == Infinity  then "#{lo_val}+"
    when (lo_val == hi_val)  then lo_val
    else                          "#{lo_val} - #{hi_val}"
    end
  end

end
121
+ end
122
+ end
123
+
data/lib/wukong/dfs.rb ADDED
@@ -0,0 +1,80 @@
1
+ require 'time' # ain't it always that way
2
+ module Wukong
3
+ module Dfs
4
# List dfs_path by shelling out to `hadoop dfs -ls` and wrap every listing
# line (except the "Found N items" summary) in an HFile.
# NOTE(review): dfs_path is interpolated into the shell command unescaped.
def self.list_files(dfs_path)
  Log.info{ "DFS: listing #{dfs_path}" }
  ls_output = `hadoop dfs -ls #{dfs_path}`
  file_lines = ls_output.split("\n").reject do |ls_line|
    ls_line =~ /Found \d+ items/i
  end
  file_lines.map do |ls_line|
    HFile.new_from_ls(ls_line)
  end
end
9
+
10
+ #
11
+ # FIXME -- this will fail if multiple files in a listing have the
12
+ # same basename. Sorry.
13
+ #
14
# Walk src_files (paths) in sorted order and pair each with the first entry of
# dest_files that has the same basename. Yields (status, src_file, dest_file):
#   :missing -- no entry with that basename (dest_file is nil)
#   :differ  -- an entry exists but dest_file.kinda_equal(src_file) is false
#   :same    -- an entry exists and is kinda_equal
def self.compare_listings(src_files, dest_files, &block)
  src_files.sort.each do |src_file|
    dest_file = dest_files.find do |candidate|
      File.basename(src_file) == candidate.basename
    end
    if ! dest_file
      yield :missing, src_file, nil
    elsif ! dest_file.kinda_equal(src_file)
      yield :differ, src_file, dest_file
    else
      yield :same, src_file, dest_file
    end
  end
end
24
+
25
# One entry of a `hadoop dfs -ls` listing.
# NOTE(review): Bignum no longer exists on current Rubies (folded into
# Integer) -- confirm what TypedStruct does with the type before changing it.
class HFile < TypedStruct.new(
    [:mode_str, String],
    [:i_count,  String],
    [:owner,    String],
    [:group,    String],
    [:size,     Integer],
    [:date,     Bignum],
    [:path,     String]
  )

  # Build an HFile from one whitespace-separated line of ls output:
  # mode, count, owner, group, size, date, time, path.
  def self.new_from_ls(ls_line)
    mode, ic, o, g, sz, dt, tm, path = ls_line.chomp.split(/\s+/)
    timestamp = "#{dt} #{tm}"
    date = Time.parse(timestamp).utc.to_flat
    new(mode, ic.to_i, o, g, sz.to_i, date, path)
  end

  # directory part of path (memoized)
  def dirname
    @dirname ||= File.dirname(path)
  end

  # final component of path (memoized)
  def basename
    @basename ||= File.basename(path)
  end

  #
  # Two files are kinda_equal if they match in size. (The check that the
  # hdfs version is later than the filesystem version is currently disabled:
  #   && (self.date >= File.mtime(file).utc.to_flat) )
  #
  def kinda_equal(file)
    local_size = File.size(file)
    (self.size == local_size)
  end

  # tab-separated field values
  def to_s
    to_a.join("\t")
  end

  #
  # These will be very slow.
  # If some kind soul will integrate JRuby callouts the bards shall
  # celebrate your name evermore.
  #

  # rename the file on the HDFS
  def mv(new_filename)
    self.class.run_dfs_command(:mv, path, new_filename)
  end

  # create a directory on the HDFS
  def self.mkdir(dirname)
    run_dfs_command(:mkdir, dirname)
  end

  # HDFS is always -p
  def self.mkdir_p(*args)
    self.mkdir(*args)
  end

  # Run `hadoop dfs -<args joined by spaces>` (nested arrays flattened, nils
  # dropped) and log its combined stdout+stderr, whitespace-collapsed.
  # NOTE(review): arguments reach the shell unescaped.
  def self.run_dfs_command(*args)
    dfs_args = args.flatten.compact.join(" ")
    cmd = 'hadoop dfs -' + dfs_args
    Log.debug{ "DFS: Running #{cmd}" }
    Log.info{ `#{cmd} 2>&1`.gsub(/[\r\n\t]+/, " ") }
  end

end
79
+ end
80
+ end
@@ -0,0 +1,111 @@
1
+ require 'htmlentities'
2
+ require 'addressable/uri'
3
+
4
+ module Wukong
5
+ #
6
+ # By default (or explicitly with the :xml strategy), convert string to
7
+ # * XML-encoded ASCII,
8
+ #
9
+ # * with a guarantee that the characters " quote, ' apos \\ backslash,
10
+ # carriage-return \r newline \n and tab \t (as well as all other control
11
+ # characters) are encoded.
12
+ #
13
+ # * Any XML-encoding in the original text is encoded with no introspection:
14
+ # encode_str("&lt;a href=\"foo\"&gt;")
15
+ # # => "&amp;lt;a href=&quot;foo&quot;&amp;gt;"
16
+ #
17
+ # * Useful: http://rishida.net/scripts/uniview/conversion.php
18
+ #
19
+ # With the :url strategy,
20
+ # * URL-encode the string
21
+ # * This is as strict as possible: encodes all but alphanumeric and _ underscore.
22
+ # The resulting string is thus XML- and URL-safe.
23
+ # http://addressable.rubyforge.org/api/classes/Addressable/URI.html#M000010
24
+ #
25
+ # Wukong.decode_str(Wukong.encode_str(str)) returns the original str
26
+ #
27
+ #
28
+ #
29
# Encode str with the given strategy.
#
# str::      the string to encode; it is never modified
# strategy:: :xml (default) -- HTMLEntities encoding (:basic, :named,
#            :decimal) with backslashes additionally turned into '&#x5C;';
#            :url -- percent-encode everything matching /[^\w]/
#
# Returns the encoded string. If the encoder raises ArgumentError, returns
# '!!bad_encoding!! ' followed by a copy of str stripped of everything but
# word characters, whitespace and . - @ # %
#
# Raises RuntimeError for an unknown strategy.
def self.encode_str(str, strategy=:xml)
  case strategy
  when :xml then self.html_encoder.encode(str, :basic, :named, :decimal).gsub(/\\/, '&#x5C;')
  when :url then Addressable::URI.encode_component(str, /[^\w]/)
  else raise "Don't know how to encode with strategy #{strategy}"
  end
rescue ArgumentError
  # build the fallback from a copy: an in-place gsub! would rewrite the
  # caller's string and blow up outright on a frozen one
  '!!bad_encoding!! ' + str.gsub(/[^\w\s\.\-@#%]+/, '')
end
41
+ # HTMLEntities encoder instance
42
# Shared HTMLEntities instance, created on first use and reused afterwards.
def self.html_encoder
  @html_encoder || (@html_encoder = HTMLEntities.new)
end
45
+
46
+ #
47
+ # Decode string from its encode_str representation. This can include
48
+ # dangerous things such as tabs, newlines, backslashes and cryptofascist
49
+ # propaganda.
50
+ #
51
# Decode str from its encode_str representation.
#
# str::      the encoded string
# strategy:: :xml (default) -- decode entities with the shared HTMLEntities
#            instance (the same one encode_str encodes with);
#            :url -- Addressable::URI.unencode_component
#
# Returns the decoded string. Raises RuntimeError for an unknown strategy.
def self.decode_str(str, strategy=:xml)
  case strategy
  when :xml then self.html_encoder.decode(str)
  when :url then Addressable::URI.unencode_component(str)
  else raise "Don't know how to decode with strategy #{strategy}"
  end
end
58
+
59
+ #
60
+ # Replace each given field in the hash with its
61
+ # encoded value
62
+ #
63
# For each listed field holding a truthy value, overwrite hsh[field] with the
# wukong_encode'd string form of that value. Fields that are absent, nil or
# false are left alone. Returns the list of fields.
def self.encode_components(hsh, *fields)
  fields.each do |field|
    current = hsh[field]
    next unless current
    hsh[field] = current.to_s.wukong_encode
  end
end
68
+ end
69
+
70
String.class_eval do

  #
  # Strip control characters that might harsh our buzz, TSV-wise
  # See Wukong.encode_str
  #
  # In-place form: replaces the receiver's contents with its encoding.
  #
  def wukong_encode!(*args)
    replace wukong_encode(*args)
  end

  # Encoded copy of the string; args (the strategy) go to Wukong.encode_str
  def wukong_encode(*args)
    Wukong.encode_str(self, *args)
  end

  #
  # Decode string into original (and possibly unsafe) form
  # See Wukong.encode_str and Wukong.decode_str
  #
  # In-place form: replaces the receiver's contents with its decoding.
  #
  def wukong_decode!(*args)
    replace wukong_decode(*args)
  end

  # Decoded copy of the string; args (the strategy) go to Wukong.decode_str
  def wukong_decode(*args)
    Wukong.decode_str(self, *args)
  end

  #
  # Takes an XML-encoded or plaintext string and forces it into canonical encoding
  #
  # In-place form: replaces the receiver's contents with the recoded string.
  #
  def wukong_recode!(*args)
    replace wukong_recode(*args)
  end

  # Decode-then-encode copy of the string, same strategy args for both steps
  def wukong_recode(*args)
    Wukong.encode_str(Wukong.decode_str(self, *args), *args)
  end
end
106
+
107
Struct.class_eval do
  # Recode, in place, every member value that is truthy and responds to
  # wukong_recode!; all other members are skipped. args are handed through.
  def recode!(*args)
    each_pair do |_member, value|
      next unless value && value.respond_to?(:wukong_recode!)
      value.wukong_recode!(*args)
    end
  end
end
@@ -0,0 +1,15 @@
1
+ #
2
+ # These pull in the minimal functionality of the extlib|activesupport family of
3
+ # gems.
4
+ #
5
+ require 'wukong/extensions/blank'
6
+ require 'wukong/extensions/class'
7
+ require 'wukong/extensions/symbol'
8
+ require 'wukong/extensions/hash'
9
+ require 'wukong/extensions/hash_like'
10
+ require 'wukong/extensions/array'
11
+ require 'wukong/extensions/struct'
12
+ require 'wukong/extensions/module'
13
+ require 'wukong/extensions/string'
14
+ require 'wukong/extensions/date_time'
15
+ require 'wukong/extensions/emittable'
@@ -0,0 +1,18 @@
1
class Array
  #
  # The following is taken in whole from the extlib gem. Thanks y'all.
  #

  # Pops and returns the trailing element when it is a Hash (the "options");
  # otherwise leaves the array untouched and returns a new empty hash.
  #
  #   def options(*args)
  #     args.extract_options!
  #   end
  #
  #   options(1, 2)           # => {}
  #   options(1, 2, :a => :b) # => {:a=>:b}
  def extract_options!
    return {} unless last.is_a?(::Hash)
    pop
  end
end
@@ -0,0 +1,93 @@
1
+ #
2
+ # This is taken in whole from the extlib gem. Thanks y'all.
3
+ #
4
+
5
class Object
  ##
  # True when the object is nil, or when it responds to empty? and is empty.
  #
  #   [].blank?     #=> true
  #   [1].blank?    #=> false
  #   [nil].blank?  #=> false
  #
  # @return [TrueClass, FalseClass]
  #
  # @api public
  def blank?
    return true if nil?
    respond_to?(:empty?) && empty?
  end
end # class Object
20
+
21
class Numeric
  ##
  # A number is never blank -- not even zero.
  #
  #   0.blank?        #=> false
  #   1.blank?        #=> false
  #   6.54321.blank?  #=> false
  #
  # @return [FalseClass]
  #
  # @api public
  def blank?
    false
  end
end # class Numeric
36
+
37
class NilClass
  ##
  # nil counts as blank, always.
  #
  #   nil.blank?  #=> true
  #
  # @return [TrueClass]
  #
  # @api public
  def blank?
    true
  end
end # class NilClass
50
+
51
class TrueClass
  ##
  # true does not count as blank.
  #
  #   true.blank?  #=> false
  #
  # @return [FalseClass]
  #
  # @api public
  def blank?
    false
  end
end # class TrueClass
64
+
65
class FalseClass
  ##
  # false counts as blank, always.
  #
  #   false.blank?  #=> true
  #
  # @return [TrueClass]
  #
  # @api public
  def blank?
    true
  end
end # class FalseClass
78
+
79
class String
  ##
  # A string is blank when nothing is left after stripping it.
  #
  #   "".blank?          #=> true
  #   "     ".blank?     #=> true
  #   " hey ho ".blank?  #=> false
  #
  # @return [TrueClass, FalseClass]
  #
  # @api public
  def blank?
    stripped = strip
    stripped.empty?
  end
end # class String