wukong 1.4.0 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48)
  1. data/README.textile +34 -7
  2. data/bin/cutc +1 -1
  3. data/bin/cuttab +1 -1
  4. data/bin/greptrue +1 -3
  5. data/bin/hdp-cat +1 -1
  6. data/bin/hdp-catd +1 -1
  7. data/bin/hdp-du +11 -6
  8. data/bin/hdp-get +1 -1
  9. data/bin/hdp-kill +1 -1
  10. data/bin/hdp-ls +1 -1
  11. data/bin/hdp-mkdir +1 -1
  12. data/bin/hdp-mv +1 -1
  13. data/bin/hdp-ps +1 -1
  14. data/bin/hdp-put +1 -1
  15. data/bin/hdp-rm +1 -1
  16. data/bin/hdp-sort +39 -19
  17. data/bin/hdp-stream +39 -19
  18. data/bin/hdp-stream-flat +9 -5
  19. data/bin/hdp-stream2 +39 -0
  20. data/bin/tabchar +1 -1
  21. data/bin/wu-date +13 -0
  22. data/bin/wu-datetime +13 -0
  23. data/bin/wu-plus +9 -0
  24. data/docpages/INSTALL.textile +0 -2
  25. data/docpages/index.textile +4 -2
  26. data/examples/apache_log_parser.rb +26 -14
  27. data/examples/graph/gen_symmetric_links.rb +10 -0
  28. data/examples/sample_records.rb +6 -8
  29. data/lib/wukong/datatypes/enum.rb +2 -2
  30. data/lib/wukong/dfs.rb +10 -9
  31. data/lib/wukong/encoding.rb +22 -4
  32. data/lib/wukong/extensions/emittable.rb +1 -1
  33. data/lib/wukong/extensions/hash_keys.rb +16 -0
  34. data/lib/wukong/extensions/hash_like.rb +17 -0
  35. data/lib/wukong/models/graph.rb +18 -20
  36. data/lib/wukong/schema.rb +13 -11
  37. data/lib/wukong/script.rb +26 -8
  38. data/lib/wukong/script/hadoop_command.rb +108 -2
  39. data/lib/wukong/streamer.rb +2 -0
  40. data/lib/wukong/streamer/base.rb +1 -0
  41. data/lib/wukong/streamer/record_streamer.rb +14 -0
  42. data/lib/wukong/streamer/struct_streamer.rb +2 -2
  43. data/spec/data/a_atsigns_b.tsv +64 -0
  44. data/spec/data/a_follows_b.tsv +53 -0
  45. data/spec/data/tweet.tsv +167 -0
  46. data/spec/data/twitter_user.tsv +55 -0
  47. data/wukong.gemspec +13 -3
  48. metadata +13 -23
@@ -27,8 +27,6 @@ pre. $ sudo gem install {{ site.gemname }} --source=http://gemcutter.org
27
27
 
28
28
  You can instead download this project in either "zip":http://github.com/mrflip/{{ site.gemname }}/zipball/master or "tar":http://github.com/mrflip/{{ site.gemname }}/tarball/master formats.
29
29
 
30
- <notextile></div><div class="toggle"></notextile>
31
-
32
30
  h3. Get the Dependencies
33
31
 
34
32
  * Hadoop
@@ -71,7 +71,7 @@ h2. Documentation index
71
71
 
72
72
  h2. Credits
73
73
 
74
- Wukong was written by "Philip (flip) Kromer":http://mrflip.com (flip@infochimps.org) for the "infochimps project":http://infochimps.org
74
+ Wukong was written by "Philip (flip) Kromer":http://mrflip.com (flip@infochimps.org / "@mrflip":http://twitter.com/mrflip) for the "infochimps project":http://infochimps.org
75
75
 
76
76
  Patches submitted by:
77
77
  * gemified by Ben Woosley (ben.woosley@gmail.com)
@@ -81,10 +81,12 @@ Thanks to:
81
81
  * "Brad Heintz":http://www.bradheintz.com/no1thing/talks/ for his early feedback
82
82
  * "Phil Ripperger":http://blog.pdatasolutions.com for his "wukong in the Amazon AWS cloud":http://blog.pdatasolutions.com/post/191978092/ruby-on-hadoop-quickstart tutorial.
83
83
 
84
- <notextile><div class="toggle"></notextile>
84
+ <notextile></div><div class="toggle"></notextile>
85
85
 
86
86
  h2. Help!
87
87
 
88
88
  Send Wukong questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
89
89
 
90
90
  <notextile></div></notextile>
91
+
92
+ {% include news.html %}
@@ -5,21 +5,21 @@ require 'wukong'
5
5
  module ApacheLogParser
6
6
  class Mapper < Wukong::Streamer::LineStreamer
7
7
 
8
+ # regular expression for apache-style log lines
9
+ # note that we strip out the google analytics listener.
10
+ LOG_RE = %r{\A
11
+ (\d+\.\d+\.\d+\.\d+) # IP addr
12
+ \s([^\s]+)\s # -
13
+ \s([^\s]+) # -
14
+ \s\[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] # [07/Jun/2008:20:37:11 +0000]
15
+ \s(\d+) # 400
16
+ \s"([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" # "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1"
17
+ \s(\d+) # 173
18
+ \s"([^\"]*)" "([^\"]*)" "([^\"]*)" # "-" "-" "-"
19
+ \z}x
8
20
 
9
- def parse_request req
10
- m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req)
11
- if m
12
- [''] + m.captures
13
- else
14
- [req, '', '', '']
15
- end
16
- end
17
-
18
-
19
- # regular expression to match on apache-style log lines
20
- # IP addr - - [07/Jun/2008:20:37:11 +0000] 400 "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1" 173 "-" "-" "-"
21
- LOG_RE = %r{\A(\d+\.\d+\.\d+\.\d+) ([^\s]+) ([^\s]+) \[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] (\d+) "([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" (\d+) "([^\"]*)" "([^\"]*)" "([^\"]*)"\z}
22
-
21
+ # Use the regex to break line into fields
22
+ # Emit each record as flat line
23
23
  def process line
24
24
  line.chomp
25
25
  m = LOG_RE.match(line)
@@ -32,8 +32,20 @@ module ApacheLogParser
32
32
  yield [:unparseable, line]
33
33
  end
34
34
  end
35
+
36
+
37
+ def parse_request req
38
+ m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req)
39
+ if m
40
+ [''] + m.captures
41
+ else
42
+ [req, '', '', '']
43
+ end
44
+ end
45
+
35
46
  end
36
47
 
48
+
37
49
  class Reducer < Wukong::Streamer::LineStreamer
38
50
  end
39
51
 
@@ -17,6 +17,16 @@ module Wukong::Streamer
17
17
  end
18
18
  end
19
19
 
20
+ #
21
+ # Find symmetric links
22
+ #
23
+ # Takes adjacency list for a directed graph and emits only edges where
24
+ # A->B and B->A
25
+ #
26
+ # The output will list each such symmetric edge exactly once as
27
+ # a_symmetric_b node1 node2
28
+ # where node1 is lexicographically less than node2.
29
+ #
20
30
  module FindSymmetricLinks
21
31
 
22
32
  class Mapper < Wukong::Streamer::EdgeStreamer
@@ -8,7 +8,7 @@ require 'wukong'
8
8
  # Set the sampling fraction at the command line using the
9
9
  # --sampling_fraction=
10
10
  # option: for example, to take a random 1/1000th of the lines in huge_files,
11
- # ./examples/sample_records.rb --sampling_fraction=0.001 --go huge_files sampled_files
11
+ # ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
12
12
  #
13
13
  class Mapper < Wukong::Streamer::LineStreamer
14
14
  include Wukong::Streamer::Filter
@@ -32,13 +32,11 @@ class Mapper < Wukong::Streamer::LineStreamer
32
32
  end
33
33
  end
34
34
 
35
- class Script < Wukong::Script
36
- def default_options
37
- super.merge :reduce_tasks => 0
38
- end
39
- end
40
-
41
35
  #
42
36
  # Executes the script
43
37
  #
44
- Script.new( Mapper, nil ).run
38
+ Wukong::Script.new( Mapper,
39
+ nil,
40
+ :reduce_tasks => 0,
41
+ :reuse_jvms => true
42
+ ).run
@@ -76,7 +76,7 @@ module Wukong
76
76
  # Note that bin 0 is
77
77
  #
78
78
  class Binned < Enum
79
- class_inheritable_reader :bins, :empty_bin_name
79
+ class_inheritable_accessor :bins, :empty_bin_name
80
80
  @@empty_bin_name = '(none)'
81
81
 
82
82
  def bins
@@ -95,7 +95,7 @@ module Wukong
95
95
 
96
96
  def self.enumerates *bins
97
97
  options = bins.extract_options!
98
- write_inheritable_attribute :bins, bins
98
+ self.bins = bins
99
99
  last_top = bins.shift
100
100
  # bins.unshift bins.first if last_top == -Infinity
101
101
  names = bins.map do |bin_top|
@@ -22,15 +22,16 @@ module Wukong
22
22
  end
23
23
  end
24
24
 
25
- class HFile < TypedStruct.new(
26
- [:mode_str, String],
27
- [:i_count, String],
28
- [:owner, String],
29
- [:group, String],
30
- [:size, Integer],
31
- [:date, Bignum],
32
- [:path, String]
33
- )
25
+ HFile = TypedStruct.new(
26
+ [:mode_str, String],
27
+ [:i_count, String],
28
+ [:owner, String],
29
+ [:group, String],
30
+ [:size, Integer],
31
+ [:date, Bignum],
32
+ [:path, String]
33
+ )
34
+ HFile.class_eval do
34
35
  def self.new_from_ls ls_line
35
36
  mode, ic, o, g, sz, dt, tm, path = ls_line.chomp.split(/\s+/)
36
37
  date = Time.parse("#{dt} #{tm}").utc.to_flat
@@ -1,6 +1,23 @@
1
1
  require 'htmlentities'
2
2
  require 'addressable/uri'
3
3
 
4
+ # Fix a bug (?) in the HTMLEntities encoder class with $KCODE='NONE'
5
+ HTMLEntities::Encoder.class_eval do
6
+ private
7
+ def extended_entity_regexp
8
+ @extended_entity_regexp ||= (
9
+ if encoding_aware?
10
+ regexp = '[^\u{20}-\u{7E}]'
11
+ else
12
+ # regexp = '[^\x20-\x7E]'
13
+ regexp = '[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+'
14
+ end
15
+ regexp += "|'" if @flavor == 'html4'
16
+ Regexp.new(regexp)
17
+ )
18
+ end
19
+ end
20
+
4
21
  module Wukong
5
22
  #
6
23
  # By default (or explicitly with the :xml strategy), convert string to
@@ -24,7 +41,9 @@ module Wukong
24
41
  #
25
42
  # Wukong.decode_str(Wukong.encode_str(str)) returns the original str
26
43
  #
27
- #
44
+ # If you're seeing bad_encoding errors, try
45
+ # $KCODE='u' unless "1.9".respond_to?(:encoding)
46
+ # at the start of your script.
28
47
  #
29
48
  def self.encode_str str, strategy=:xml
30
49
  begin
@@ -34,8 +53,7 @@ module Wukong
34
53
  else raise "Don't know how to encode with strategy #{strategy}"
35
54
  end
36
55
  rescue ArgumentError => e
37
- str.gsub!(/[^\w\s\.\-@#%]+/, '')
38
- '!!bad_encoding!! ' + str
56
+ '!bad_encoding!! ' + str.gsub(/[^\w\s\.\-@#%]+/, '')
39
57
  end
40
58
  end
41
59
  # HTMLEntities encoder instance
@@ -89,7 +107,7 @@ String.class_eval do
89
107
  replace self.wukong_decode(*args)
90
108
  end
91
109
 
92
- def wukong_decode
110
+ def wukong_decode(*args)
93
111
  Wukong.decode_str(self, *args)
94
112
  end
95
113
 
@@ -18,7 +18,7 @@ Struct.class_eval do
18
18
  #
19
19
  # Flatten for packing as resource name followed by all fields
20
20
  #
21
- def to_flat include_key=true
21
+ def to_flat include_key=false
22
22
  if include_key.is_a? Proc
23
23
  sort_key = include_key.call(self)
24
24
  elsif (! include_key.blank?) && respond_to?(:key)
@@ -0,0 +1,16 @@
1
+ class Hash
2
+
3
+ # Return a new hash with all keys converted to symbols.
4
+ def symbolize_keys
5
+ inject({}) do |options, (key, value)|
6
+ options[(key.to_sym rescue key) || key] = value
7
+ options
8
+ end
9
+ end
10
+
11
+ # Destructively convert all keys to symbols.
12
+ def symbolize_keys!
13
+ self.replace(self.symbolize_keys)
14
+ end
15
+
16
+ end
@@ -114,6 +114,23 @@ module Wukong
114
114
  extend ClassMethods
115
115
  end
116
116
  end
117
+
118
+ def coerce_attr attr, coerce_blank_to_nil=false, &block
119
+ orig_val = self.send(attr)
120
+ new_val = (coerce_blank_to_nil && orig_val.blank?) ? nil : block.call(orig_val)
121
+ self.send("#{attr}=", new_val)
122
+ end
123
+
124
+ def coerce_to_int! attr, *args
125
+ coerce_attr(attr, *args) do |val|
126
+ val.to_i
127
+ end
128
+ end
129
+
130
+ def coerce_to_date! attr, *args
131
+ coerce_attr(attr, *args){|val| val.is_a?(DateTime) ? val : DateTime.parse(val) rescue nil }
132
+ end
133
+
117
134
  end
118
135
 
119
136
  end
@@ -1,27 +1,25 @@
1
1
 
2
2
  module Wukong
3
3
  module Models
4
- class Edge < TypedStruct.new(
5
- [:src, Integer],
6
- [:dest, Integer]
7
- )
8
- end
4
+ Edge = TypedStruct.new(
5
+ [:src, Integer],
6
+ [:dest, Integer]
7
+ )
9
8
 
10
- class MultiEdge < TypedStruct.new(
11
- [:src, Integer],
12
- [:dest, Integer],
13
- [:a_follows_b, Integer],
14
- [:b_follows_a, Integer],
15
- [:a_replies_b, Integer],
16
- [:b_replies_a, Integer],
17
- [:a_atsigns_b, Integer],
18
- [:b_atsigns_a, Integer],
19
- [:a_retweets_b, Integer],
20
- [:b_retweets_a, Integer],
21
- [:a_favorites_b, Integer],
22
- [:b_favorites_a, Integer]
23
- )
24
- end
9
+ MultiEdge = TypedStruct.new(
10
+ [:src, Integer],
11
+ [:dest, Integer],
12
+ [:a_follows_b, Integer],
13
+ [:b_follows_a, Integer],
14
+ [:a_replies_b, Integer],
15
+ [:b_replies_a, Integer],
16
+ [:a_atsigns_b, Integer],
17
+ [:b_atsigns_a, Integer],
18
+ [:a_retweets_b, Integer],
19
+ [:b_retweets_a, Integer],
20
+ [:a_favorites_b, Integer],
21
+ [:b_favorites_a, Integer]
22
+ )
25
23
 
26
24
  end
27
25
  end
@@ -106,11 +106,11 @@ module Wukong
106
106
  # should, follow with an immediate GENERATE to ditch that field.)
107
107
  #
108
108
  def pig_load filename=nil
109
- filename ||= table_name+'.tsv'
109
+ filename ||= resource_name.to_s+'.tsv'
110
110
  cmd = [
111
- "%-23s" % resource_name,
112
- "= LOAD", filename,
113
- "AS ( rsrc:chararray,", self.to_pig, ')',
111
+ "%-23s" % self.to_s.gsub(/^.*\W/, ""),
112
+ "= LOAD '#{filename}'",
113
+ "AS ( rsrc:chararray,", self.to_pig, ') ;',
114
114
  ].join(" ")
115
115
  end
116
116
 
@@ -125,7 +125,7 @@ module Wukong
125
125
  sql_str = []
126
126
  members.zip(mtypes).each do |attr, type|
127
127
  type_str = type.respond_to?(:to_sql) ? type.to_sql : type.to_s.upcase
128
- sql_str << " %-21s\t%s" %["`#{attr}`", type_str]
128
+ sql_str << " %-29s\t%s" %["`#{attr}`", type_str]
129
129
  end
130
130
  sql_str.join(",\n")
131
131
  end
@@ -184,13 +184,15 @@ module Wukong
184
184
  # different objects jumbled together, you can just dump in the whole file,
185
185
  # landing each object in its correct table.
186
186
  #
187
- def sql_load_mysql
187
+ def sql_load_mysql(filename=nil)
188
+ filename ||= ":resource_name.tsv"
189
+ filename.gsub!(/:resource_name/, self.table_name)
188
190
  str = []
189
191
  # disable indexing during bulk load
190
- str << %Q{ALTER TABLE `#{self.resource_name}` DISABLE KEYS; }
192
+ str << %Q{ALTER TABLE `#{self.table_name}` DISABLE KEYS; }
191
193
  # Bulk load the tab-separated-values file.
192
- str << %Q{LOAD DATA LOCAL INFILE '#{self.resource_name}.tsv'}
193
- str << %Q{ REPLACE INTO TABLE `#{self.resource_name}` }
194
+ str << %Q{LOAD DATA LOCAL INFILE '#{filename}'}
195
+ str << %Q{ REPLACE INTO TABLE `#{self.table_name}` }
194
196
  str << %Q{ COLUMNS }
195
197
  str << %Q{ TERMINATED BY '\\t' }
196
198
  str << %Q{ OPTIONALLY ENCLOSED BY '' }
@@ -200,9 +202,9 @@ module Wukong
200
202
  str << ' '+self.sql_members
201
203
  str << %Q{\n ); }
202
204
  # Re-enable indexing
203
- str << %Q{ALTER TABLE `#{self.resource_name}` ENABLE KEYS ; }
205
+ str << %Q{ALTER TABLE `#{self.table_name}` ENABLE KEYS ; }
204
206
  # Show it loaded correctly
205
- str << %Q{SELECT '#{self.resource_name}', NOW(), COUNT(*) FROM `#{self.resource_name}`; }
207
+ str << %Q{SELECT NOW(), COUNT(*), '#{self.table_name}' FROM `#{self.table_name}`; }
206
208
  str.join("\n")
207
209
  end
208
210
 
@@ -124,12 +124,13 @@ module Wukong
124
124
  #
125
125
  def process_argv!
126
126
  options[:all_args] = []
127
- args = ARGV.dup
128
- while args do
127
+ options[:rest] = []
128
+ args = ARGV.dup
129
+ while (! args.blank?) do
129
130
  arg = args.shift
130
131
  case
131
132
  when arg == '--'
132
- break
133
+ options[:rest] += args
133
134
  when arg =~ /\A--(\w+)(?:=(.+))?\z/
134
135
  opt, val = [$1, $2]
135
136
  opt = opt.to_sym
@@ -137,11 +138,12 @@ module Wukong
137
138
  self.options[opt] = val
138
139
  options[:all_args] << arg unless std_options.include?(opt)
139
140
  else
140
- args.unshift(arg) ; break
141
+ options[:all_args] << arg
142
+ options[:rest] << arg
141
143
  end
144
+ # p [options, arg, args]
142
145
  end
143
146
  options[:all_args] = options[:all_args].join(" ")
144
- options[:rest] = args
145
147
  end
146
148
 
147
149
  def this_script_filename
@@ -203,7 +205,7 @@ module Wukong
203
205
  def input_output_paths
204
206
  # input / output paths
205
207
  input_path, output_path = options[:rest][0..1]
206
- raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:fake]) && (input_path.blank? || output_path.blank?)
208
+ raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_path.blank? || output_path.blank?)
207
209
  [input_path, output_path]
208
210
  end
209
211
 
@@ -223,7 +225,7 @@ module Wukong
223
225
  maybe_overwrite_output_paths! output_path
224
226
  command = runner_command(input_path, output_path)
225
227
  $stderr.puts command
226
- if ! options[:fake]
228
+ unless options[:dry_run]
227
229
  $stdout.puts `#{command}`
228
230
  end
229
231
  end
@@ -255,8 +257,24 @@ module Wukong
255
257
  #{$0} --run=local input_hdfs_path output_hdfs_dir # run the script on local filesystem using unix pipes
256
258
  #{$0} --run input_hdfs_path output_hdfs_dir # run the script with the mode given in config/wukong*.yaml
257
259
  #{$0} --map
258
- #{$0} --reduce # dispatch to the mapper or reducer
260
+ #{$0} --reduce # dispatch to the mapper or reducer
259
261
 
262
+ All flags must precede the input and output paths.
263
+ Additional flags:
264
+ --dry_run
265
+ Hadoop Options (see hadoop documentation)
266
+ --max_node_map_tasks => 'mapred.tasktracker.map.tasks.maximum',
267
+ --max_node_reduce_tasks => 'mapred.tasktracker.reduce.tasks.maximum',
268
+ --map_tasks => 'mapred.map.tasks',
269
+ --reduce_tasks => 'mapred.reduce.tasks',
270
+ --sort_fields => 'stream.num.map.output.key.fields',
271
+ --key_field_separator => 'map.output.key.field.separator',
272
+ --partition_fields => 'num.key.fields.for.partition',
273
+ --output_field_separator => 'stream.map.output.field.separator',
274
+ --map_speculative => 'mapred.map.tasks.speculative.execution',
275
+ --timeout => 'mapred.task.timeout',
276
+ --reuse_jvms => 'mapred.job.reuse.jvm.num.tasks',
277
+ --ignore_exit_status => 'stream.non.zero.exit.status.is.failure',
260
278
  You can specify as well arbitrary script-specific command line flags; they are added to your options[] hash.
261
279
  }
262
280
  end