wukong 1.4.7 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/CHANGELOG.textile +9 -0
  2. data/README.textile +1 -1
  3. data/bin/hdp-bzip +28 -0
  4. data/bin/hdp-mkdir +1 -1
  5. data/bin/hdp-stream-flat +3 -2
  6. data/bin/wu-lign +32 -18
  7. data/docpages/pig/cookbook.html +481 -0
  8. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  9. data/docpages/pig/images/instruction_arrow.png +0 -0
  10. data/docpages/pig/images/pig-logo.gif +0 -0
  11. data/docpages/pig/piglatin_ref1.html +1103 -0
  12. data/docpages/pig/piglatin_ref2.html +14340 -0
  13. data/docpages/pig/setup.html +505 -0
  14. data/docpages/pig/skin/basic.css +166 -0
  15. data/docpages/pig/skin/breadcrumbs.js +237 -0
  16. data/docpages/pig/skin/fontsize.js +166 -0
  17. data/docpages/pig/skin/getBlank.js +40 -0
  18. data/docpages/pig/skin/getMenu.js +45 -0
  19. data/docpages/pig/skin/images/chapter.gif +0 -0
  20. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  21. data/docpages/pig/skin/images/current.gif +0 -0
  22. data/docpages/pig/skin/images/external-link.gif +0 -0
  23. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  24. data/docpages/pig/skin/images/page.gif +0 -0
  25. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  26. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  27. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  28. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  29. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  30. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  31. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  32. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  33. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  34. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  35. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  36. data/docpages/pig/skin/print.css +54 -0
  37. data/docpages/pig/skin/profile.css +181 -0
  38. data/docpages/pig/skin/screen.css +587 -0
  39. data/docpages/pig/tutorial.html +1059 -0
  40. data/docpages/pig/udf.html +1509 -0
  41. data/examples/keystore/conditional_outputter_example.rb +70 -0
  42. data/examples/{graph → network_graph}/adjacency_list.rb +0 -0
  43. data/examples/{graph → network_graph}/breadth_first_search.rb +0 -0
  44. data/examples/{graph → network_graph}/gen_2paths.rb +0 -0
  45. data/examples/{graph → network_graph}/gen_multi_edge.rb +0 -0
  46. data/examples/{graph → network_graph}/gen_symmetric_links.rb +0 -0
  47. data/examples/pagerank/run_pagerank.sh +10 -8
  48. data/examples/{apache_log_parser.rb → server_logs/apache_log_parser.rb} +0 -0
  49. data/examples/stupidly_simple_filter.rb +43 -0
  50. data/lib/wukong/extensions/hash.rb +13 -0
  51. data/lib/wukong/extensions/hash_like.rb +7 -0
  52. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +122 -0
  53. data/lib/wukong/script.rb +27 -22
  54. data/lib/wukong/script/hadoop_command.rb +5 -3
  55. data/lib/wukong/streamer/accumulating_reducer.rb +2 -1
  56. data/wukong.gemspec +64 -26
  57. metadata +89 -31
  58. data/docpages/pig/PigLatinReferenceManual.html +0 -19134
  59. data/examples/foo.rb +0 -9
  60. data/examples/package-local.rb +0 -100
  61. data/examples/package.rb +0 -96
  62. data/examples/run_all.sh +0 -47
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'cassandra'
4
+ require 'wukong'
5
+ require 'wukong/encoding'
6
+ require 'wukong/keystore/cassandra_conditional_outputter'
7
+
8
+ #
9
+ # Usage:
10
+ # echo -e "bob has boobs ha ha ha" | ./examples/keystore/conditional_outputter_example.rb --map
11
+ #
12
+
13
+ CASSANDRA_KEYSPACE = 'CorpusAnalysis'
14
+
15
+ #
16
+ # This demonstrates the CassandraConditionalOutputter module.
17
+ #
18
+ # CassandraConditionalOutputter uses a cassandra key-value store to
19
+ # track unique IDs and prevent output of any record already present in the
20
+ # database.
21
+ #
22
+ # For this example, it takes an input stream, generates all letter pairs for
23
+ # each line, and emits each pair only the first time it is seen.
24
+ #
25
+ #
26
+ class LetterPairMapper < Wukong::Streamer::LineStreamer
27
+ include CassandraConditionalOutputter
28
+
29
+ #
30
+ # A unique key for the given record. If an object with
31
+ # that key has been seen, it won't be re-emitted.
32
+ #
33
+ # In this example, we'll just encode the letter pair
34
+ #
35
+ def conditional_output_key record
36
+ record.to_s.wukong_encode(:url)
37
+ end
38
+
39
+ #
40
+ # Emit each letter pair in the line.
41
+ # the CassandraConditionalOutputter will swallow all duplicate lines.
42
+ #
43
+ def process line, &block
44
+ letter_pairs(line).each do |pair|
45
+ yield(pair)
46
+ end
47
+ end
48
+
49
+ # turn a string into the pairs of adjacent letters
50
+ #
51
+ # @example
52
+ # letter_pairs('abracadabra')
53
+ # # => ['ab', 'br', 'ra', 'ac', 'ca', 'ad', 'da', 'ab', 'br', 'ra']
54
+ def letter_pairs str, &block
55
+ chars = str.chars.to_a
56
+ chars[0..-2].zip(chars[1..-1]).map(&:join)
57
+ end
58
+
59
+ # Clear the entire cached keys column at the end of the run.
60
+ #
61
+ # You almost certainly don't want to do this in a real script.
62
+ #
63
+ def after_stream
64
+ $stderr.puts 'Clearing conditional_output_key cache...'
65
+ @key_cache.clear_column_family!(conditional_output_key_column)
66
+ end
67
+ end
68
+
69
+ # Execute the script
70
+ Wukong::Script.new( LetterPairMapper, nil ).run
File without changes
File without changes
File without changes
@@ -1,19 +1,21 @@
1
1
  #!/usr/bin/env bash
2
2
 
3
3
  # Directory to pagerank on.
4
- work_dir=$1 ; shift
5
- if [ "$work_dir" == '' ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank" ; exit ; fi
4
+ work_dir=$1 ; shift
5
+ if [ "$work_dir" == '' ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank: $0 initial_dir [number_of_iterations] [start_iteration]" ; exit ; fi
6
+ # How many rounds to run: default 10
7
+ n_iters=${1-10} ; shift
8
+ # the iteration to start with: default 0
9
+ start_i=${1-0} ; shift
6
10
 
7
-
8
- # How many rounds to run
9
- max_iter=10
10
11
  # this directory
11
12
  script_dir="`dirname $0`"
12
13
 
13
- for (( curr=0 , next=1 ; "$curr" < "$max_iter" ; curr++ , next++ )) ; do
14
- curr_str=`printf "%03d" ${curr}`
15
- next_str=`printf "%03d" ${next}`
14
+ for (( iter=0 ; "$iter" < "$n_iters" ; iter++ )) ; do
15
+ curr_str=`printf "%03d" $(( $start_i + $iter ))`
16
+ next_str=`printf "%03d" $(( $start_i + $iter + 1 ))`
16
17
  curr_dir=$work_dir/pagerank_graph_${curr_str}
17
18
  next_dir=$work_dir/pagerank_graph_${next_str}
19
+ echo -e "Iteration $(( $iter + 1 )) / $n_iters:\t `basename $curr_dir` => `basename $next_dir`"
18
20
  $script_dir/pagerank.rb --rm --run $curr_dir $next_dir
19
21
  done
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong'
4
+
5
+ # Run as (local mode)
6
+ #
7
+ # ./examples/stupidly_simple_filter.rb --run=local input.tsv output.tsv
8
+ #
9
+ # for hadoop mode,
10
+ #
11
+ # ./examples/stupidly_simple_filter.rb --run=hadoop input.tsv output.tsv
12
+ #
13
+ # For debugging, run
14
+ #
15
+ # cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
16
+ #
17
+
18
+ #
19
+ # A very simple mapper -- looks for a regex match in one field,
20
+ # and emits the whole record if the field matches
21
+ #
22
+ class GrepMapper < Wukong::Streamer::RecordStreamer
23
+
24
+ MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
25
+
26
+ #
27
+ # Given a series of records like:
28
+ #
29
+ # tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
30
+ # tweet 123456789 20100102030405 @jerry, I'm having your baby
31
+ #
32
+ # emits only the lines matching that regex
33
+ #
34
+ def process rsrc, id, timestamp, text, *rest
35
+ yield [rsrc, id, timestamp, text, *rest] if line =~ MATCHER
36
+ end
37
+ end
38
+
39
+ # Execute the script
40
+ Wukong::Script.new(
41
+ GrepMapper,
42
+ nil
43
+ ).run
@@ -141,6 +141,19 @@ class Hash
141
141
  replace(compact)
142
142
  end
143
143
 
144
+ #
145
+ # remove all key-value pairs where the value is blank
146
+ #
147
+ def compact_blank
148
+ reject{|key,val| val.blank? }
149
+ end
150
+ #
151
+ # Replace the hash with its compact_blank'ed self
152
+ #
153
+ def compact_blank!
154
+ replace(compact_blank)
155
+ end
156
+
144
157
  # Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
145
158
  def reverse_merge(other_hash)
146
159
  other_hash.merge(self)
@@ -88,6 +88,13 @@ module Wukong
88
88
  merge hsh2, &Hash::DEEP_MERGER
89
89
  end
90
90
 
91
+ #
92
+ # remove all key-value pairs where the value is blank
93
+ #
94
+ def compact_blank
95
+ to_hash.compact_blank!
96
+ end
97
+
91
98
  module ClassMethods
92
99
  #
93
100
  # Instantiate an instance of the struct from a hash
@@ -0,0 +1,122 @@
1
+
2
+ #
3
+ # For a stream process that sees a significant number of duplicated heavyweight
4
+ # objects, it may be better to deduplicate them midflight (rather than, say,
5
+ # using a reducer to effectively `cat | sort | uniq` the data).
6
+ #
7
+ # This uses a cassandra key-value store to track unique IDs and prevent output
8
+ # of any record already present in the database. (Why cassandra? Because we use
9
+ # it in production. Might be nice to rewrite this example against redis or
10
+ # TokyoTyrant or something less demanding.)
11
+ #
12
+ # Things you have to do:
13
+ #
14
+ # * Override the conditional_output_key method to distinguish identical records
15
+ # * Define a constant CASSANDRA_KEYSPACE giving the Cassandra keyspace you're working in
16
+ # * (Optionally) override conditional_output_key_column
17
+ #
18
+ # * In your cassandra storage-conf.xml, add a column family to your keyspace:
19
+ #
20
+ # <Keyspace Name="CorpusAnalysis">
21
+ # <KeysCachedFraction>0.01</KeysCachedFraction>
22
+ #
23
+ # <!-- Added for CassandraConditionalOutputter -->
24
+ # <ColumnFamily CompareWith="UTF8Type" Name="LetterPairMapperKeys" />
25
+ #
26
+ # <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
27
+ # <ReplicationFactor>1</ReplicationFactor>
28
+ # <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
29
+ # </Keyspace>
30
+ #
31
+ # In this example, the CASSANDRA_KEYSPACE is 'CorpusAnalysis' and the
32
+ # conditional_output_key_column is 'LetterPairMapperKeys'
33
+ #
34
+ # @example
35
+ # Given
36
+ # tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
37
+ # tweet 24601 20100104136526 @jerry, I'm having your baby
38
+ # tweet 8675309 20100102030405 I find pastrami to be the most sensual of the salted, cured meats.
39
+ # tweet 24601 20100104136526 @jerry, I'm having your baby
40
+ # tweet 1137 20100119234532 These pretzels are making me thirsty
41
+ # ....
42
+ # will emit:
43
+ # tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
44
+ # tweet 24601 20100104136526 @jerry, I'm having your baby
45
+ # tweet 8675309 20100102030405 I find pastrami to be the most sensual of the salted, cured meats.
46
+ # (the repeated tweet 24601 record is swallowed as a duplicate, not re-emitted)
47
+ # tweet 1137 20100119234532 These pretzels are making me thirsty
48
+ # ....
49
+ #
50
+ module CassandraConditionalOutputter
51
+
52
+ #
53
+ # A unique key for the given record. If an object with
54
+ # that key has been seen, it won't be re-emitted.
55
+ #
56
+ # You will almost certainly want to override this method in your subclass. Be
57
+ # sure that the key is a string, and is encoded properly (Cassandra likes to
58
+ # strip whitespace from keys, for instance).
59
+ #
60
+ def conditional_output_key record
61
+ record.to_s
62
+ end
63
+
64
+ #
65
+ # Checks each record against the key cache
66
+ # Swallows records already there; registers and emits the rest.
67
+ #
68
+ #
69
+ def emit record, &block
70
+ key = conditional_output_key(record)
71
+ if should_emit?(record)
72
+ set_key(key, {'t' => record.timestamp})
73
+ super record
74
+ end
75
+ end
76
+
77
+ # Default. Emit record if its key is not already contained
78
+ # in the key-value store. Override this as necessary.
79
+ def should_emit? record
80
+ key = conditional_output_key(record)
81
+ !has_key?(key)
82
+ end
83
+
84
+ # Check for presence of key in the cache
85
+ def has_key? key
86
+ not key_cache.get(conditional_output_key_column, key).blank?
87
+ end
88
+
89
+ # register key in the key_cache
90
+ def set_key key, data={'t' => '0'}
91
+ key_cache.insert(conditional_output_key_column, key, data)
92
+ end
93
+
94
+ # nuke key from the key_cache
95
+ def remove_key key
96
+ key_cache.remove(conditional_output_key_column, key)
97
+ end
98
+
99
+ #
100
+ # Key cache implementation in Cassandra
101
+ #
102
+
103
+ # The cache
104
+ def key_cache
105
+ @key_cache ||= Cassandra.new(CASSANDRA_KEYSPACE)
106
+ end
107
+
108
+ # The column to use for the key cache. By default, the class name plus 'Keys',
109
+ # but feel free to override.
110
+ #
111
+ # @example
112
+ #
113
+ # class FooMapper < Wukong::Streamer::RecordStreamer
114
+ # include CassandraConditionalOutputter
115
+ # end
116
+ # FooMapper.new.conditional_output_key_column
117
+ # # => 'FooMapperKeys'
118
+ #
119
+ def conditional_output_key_column
120
+ self.class.to_s+'Keys'
121
+ end
122
+ end
data/lib/wukong/script.rb CHANGED
@@ -82,6 +82,7 @@ module Wukong
82
82
  Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
83
83
  Settings.define :default_mapper, :default => '/bin/cat', :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
84
84
  Settings.define :default_reducer, :default => '/bin/cat', :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
85
+ Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
85
86
  Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
86
87
  Settings.define :hadoop_runner, :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
87
88
  Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
@@ -118,11 +119,11 @@ module Wukong
118
119
  # end
119
120
  # MyScript.new(MyMapper, nil).run
120
121
  #
121
- def initialize mapper_klass, reducer_klass, extra_options={}
122
+ def initialize mapper_klass, reducer_klass=nil, extra_options={}
122
123
  self.options = Settings.dup
123
- options.resolve!
124
- options.merge! self.default_options
125
- options.merge! extra_options
124
+ self.options.resolve!
125
+ self.options.merge! self.default_options
126
+ self.options.merge! extra_options
126
127
  self.mapper_klass = mapper_klass
127
128
  self.reducer_klass = reducer_klass
128
129
  # If no reducer_klass and no reduce_command, then skip the reduce phase
@@ -141,24 +142,29 @@ module Wukong
141
142
  end
142
143
 
143
144
  #
144
- # by default, call this script in --map mode
145
+ # Shell command for map phase. By default, calls the script in --map mode
146
+ # In hadoop mode, this is given to the hadoop streaming command.
147
+ # In local mode, it's given to the system() call
145
148
  #
146
149
  def map_command
147
- case
148
- when mapper_klass
149
- "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
150
- else options[:map_command] || options[:default_mapper] end
150
+ if mapper_klass
151
+ "#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
152
+ else
153
+ options[:map_command] || options[:default_mapper]
154
+ end
151
155
  end
152
156
 
153
157
  #
154
- # Shell command for reduce phase
155
- # by default, call this script in --reduce mode
158
+ # Shell command for reduce phase. By default, calls the script in --reduce mode
159
+ # In hadoop mode, this is given to the hadoop streaming command.
160
+ # In local mode, it's given to the system() call
156
161
  #
157
162
  def reduce_command
158
- case
159
- when reducer_klass
160
- "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
161
- else options[:reduce_command] || options[:default_reducer] end
163
+ if reducer_klass
164
+ "#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
165
+ else
166
+ options[:reduce_command]
167
+ end
162
168
  end
163
169
 
164
170
  #
@@ -187,10 +193,10 @@ module Wukong
187
193
  end
188
194
 
189
195
  def input_output_paths
190
- # input / output paths
191
- input_path, output_path = options.rest[0..1]
192
- raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_path.blank? || output_path.blank?)
193
- [input_path, output_path]
196
+ output_path = options.rest.pop
197
+ input_paths = options.rest.reject(&:blank?)
198
+ raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
199
+ [input_paths, output_path]
194
200
  end
195
201
 
196
202
  def maybe_overwrite_output_paths! output_path
@@ -218,8 +224,7 @@ module Wukong
218
224
  def ruby_interpreter_path
219
225
  Pathname.new(
220
226
  File.join(Config::CONFIG["bindir"],
221
- Config::CONFIG["RUBY_INSTALL_NAME"]+
222
- Config::CONFIG["EXEEXT"])
227
+ Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])
223
228
  ).realpath
224
229
  end
225
230
 
@@ -229,10 +234,10 @@ module Wukong
229
234
  def exec_hadoop_streaming
230
235
  $stderr.puts "Streaming on self"
231
236
  input_path, output_path = input_output_paths
232
- maybe_overwrite_output_paths! output_path
233
237
  command = runner_command(input_path, output_path)
234
238
  $stderr.puts command
235
239
  unless options[:dry_run]
240
+ maybe_overwrite_output_paths! output_path
236
241
  $stdout.puts `#{command}`
237
242
  end
238
243
  end
@@ -28,6 +28,7 @@ module Wukong
28
28
  Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
29
29
  Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
30
30
  Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
31
+ Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
31
32
  # mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
32
33
 
33
34
  # emit a -jobconf hadoop option if the simplified command line arg is present
@@ -67,12 +68,13 @@ module Wukong
67
68
  ]
68
69
  end
69
70
 
70
- def hadoop_other_args
71
+ def hadoop_other_args input_path, output_path
71
72
  extra_str_args = [ options[:extra_args] ]
72
73
  extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
73
74
  options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
74
75
  options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
75
- extra_hsh_args = [:map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
76
+ options[:job_name] ||= "#{File.basename(this_script_filename)}---#{input_path}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
77
+ extra_hsh_args = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
76
78
  extra_str_args + extra_hsh_args
77
79
  end
78
80
 
@@ -105,7 +107,7 @@ module Wukong
105
107
  "-input '#{input_path}'",
106
108
  "-output '#{output_path}'",
107
109
  hadoop_recycle_env,
108
- hadoop_other_args,
110
+ hadoop_other_args(input_path, output_path),
109
111
  ].flatten.compact.join(" \t\\\n ")
110
112
  end
111
113