wukong 1.5.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'wukong'
5
+
6
+ require 'bloomfilter-rb'
7
+
8
+ SIZE = 2**24
9
+
10
#
# Keeps approximate per-value counts in a fixed number of Redis counters.
#
# Each value is mapped to one of +size+ integer keys (its #hash modulo +size+)
# and that key is incremented or decremented. Values that land on the same
# key share a counter, so a lookup reports the combined count of that bucket.
#
class BucketCounter
  #
  # opts[:size]   -- number of buckets (default 100)
  # opts[:server] -- hash handed to Redis.new (default {})
  #
  def initialize(opts = {})
    @opts = {
      :size   => 100,
      :server => {}
    }.merge(opts)
    @db = ::Redis.new(@opts[:server])
    # Read the size from the merged options so the default applies; reading
    # the raw argument left @size nil whenever :size was not passed.
    @size = @opts[:size]
  end

  # Bucket number for +val+, in the range 0...size.
  def key_for(val)
    val.hash % @size
  end

  # Increment the counter of the bucket +val+ falls in.
  def insert(val)
    @db.incr(key_for(val))
  end
  alias :<< :insert

  # Decrement the bucket's counter, and drop the key once it reaches zero
  # (or below) so empty buckets do not linger in the store.
  def delete(val)
    bucket = key_for(val)
    @db.del(bucket) if @db.decr(bucket).to_i <= 0
  end

  # Current count of the bucket +val+ falls in (0 when the key is absent).
  def [](val)
    @db.get(key_for(val)).to_i
  end

  # Issues flushdb on the connection -- this is not limited to keys written
  # by this object.
  def clear
    @db.flushdb
  end
end
43
+
44
+ bf = BucketCounter.new(:size => 1_000, :server => {:host => 'localhost'})
45
+ bf.clear
46
+ counts = Hash.new{|h,k| h[k] = 0 }
47
+
48
+ doc = File.read(__FILE__)
49
+ doc.split(/\W+/).each do |word|
50
+ counts[word] += 1
51
+ bf << word
52
+ end
53
+
54
+ counts.keys.sort.each do |word|
55
+ puts [ bf[word] - counts[word], bf[word], counts[word], word.hash % SIZE, word ].join("\t")
56
+ end
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'backports'
4
+ require 'backports/1.8.8'
5
+ require 'extlib'
6
+
7
#
# Reads raw lines from $stdin and hands them to a block as records.
#
class Source
  # include Enumerable
  attr_reader :streamer

  # Turn a raw line into a record: a one-element array holding the first six
  # characters of the line.
  def recordize(line)
    # line.strip.split("\t")
    [line[0, 6]]
  end

  # Yield the fields of every record read from $stdin; any arguments are
  # passed through to $stdin.each. Blank records are skipped, and reading
  # stops right after a raw line matching /end/ has been yielded.
  def each(*args)
    $stdin.each(*args) do |raw_record|
      fields = recordize(raw_record)
      unless fields.blank?
        yield(*fields)
        break if raw_record =~ /end/
      end
    end
  end
end
25
+
26
+ # def process_group group
27
+ # end
28
+ #
29
+
30
#
# Splits $stdin into groups of raw lines; a line matching /end/ closes the
# current group.
#
class Streamer

  # Turn a raw line into a record: a one-element array holding the first six
  # characters of the line.
  def recordize(line)
    [line[0..5]]
  end

  #
  # Yield one group (an Array of raw lines, terminator included) at a time
  # until $stdin is exhausted.
  #
  # Each group is read eagerly before it is yielded. Building a lazy
  # Enumerator and never handing it to the caller consumed no input, so the
  # eof? loop could not make progress and the block was never called.
  #
  def each_group
    until $stdin.eof?
      group = []
      $stdin.each do |line|
        group << line
        break if line =~ /end/
      end
      yield group
    end
  end
end
48
+
49
foo = Streamer.new

# Print a marker for every group, then each of its lines reversed.
foo.each_group do |group|
  puts "hi"
  # Give the block to #each itself. Written as `p group.each do ... end`, the
  # do-block attaches to `p`, so #each ran without a block and the lines were
  # never visited.
  group.each do |line|
    p line.reverse
  end
  # .map do |record|
  #   1
  # end
end
60
+
61
+
62
+ # i = 0
63
+ # # s = source.new(Streamer.new)
64
+ # $stdin.each do
65
+ # process_group do |output|
66
+ # puts output
67
+ # end
68
+ # $stderr.puts [Time.now, i] if (i += 1) % 10 == 0
69
+ # end
70
+
71
+
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << '/home/flip/ics/wukong/lib' # ENV['WUKONG_PATH']
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong/script'
4
4
 
5
5
  #
6
6
  # Given an adjacency pairs (from \t to) representation of a directed graph:
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << ENV['WUKONG_PATH']
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong/script'
4
4
 
5
5
  #
6
6
  # Use this script to do a Breadth-First Search (BFS) of a graph.
@@ -9,19 +9,18 @@ require 'wukong'
9
9
  # ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
10
10
  #
11
11
  # For example, given an edge list in the file '1path.tsv' that looks like
12
- # 1path n1 n2
13
- # 1path n1 n3
12
+ # 1path n1 n2
13
+ # 1path n1 n3
14
14
  # ... and so forth ...
15
15
  # you can run
16
16
  # for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
17
17
  # to do a 9-deep breadth-first search.
18
18
  #
19
19
  module Gen1HoodEdges
20
- class Mapper < Wukong::Streamer::Base
21
- attr_accessor :head, :tail
22
- def initialize options
23
- self.head = options[:head]
24
- self.tail = options[:tail]
20
+ class Mapper < Wukong::Streamer::RecordStreamer
21
+ def initialize
22
+ @head = Settings[:head]
23
+ @tail = Settings[:tail]
25
24
  end
26
25
  def process rsrc, *nodes
27
26
  yield [ nodes.last, 'i', nodes[0..-2] ] if (rsrc == self.head)
@@ -37,8 +36,8 @@ module Gen1HoodEdges
37
36
  #
38
37
  class Reducer < Wukong::Streamer::AccumulatingReducer
39
38
  attr_accessor :paths_in, :out_rsrc
40
- def initialize options
41
- self.out_rsrc = options[:out_rsrc]
39
+ def initialize
40
+ self.out_rsrc = Settings[:out_rsrc]
42
41
  end
43
42
  # clear the list of incoming paths
44
43
  def start! *args
@@ -63,17 +62,11 @@ module Gen1HoodEdges
63
62
  mid
64
63
  end
65
64
  end
66
-
67
- class Script < Wukong::Script
68
- def default_options
69
- super.merge :sort_fields => 2, :partition_fields => 1
70
- end
71
- end
72
-
73
65
  end
74
66
 
75
67
  # Execute the script
76
- Gen1HoodEdges::Script.new(
68
+ Wukong.run(
77
69
  Gen1HoodEdges::Mapper,
78
- Gen1HoodEdges::Reducer
79
- ).run
70
+ Gen1HoodEdges::Reducer,
71
+ :sort_fields => 2, :partition_fields => 1
72
+ )
@@ -2,7 +2,6 @@
2
2
  require 'rubygems'
3
3
  $: << File.dirname(__FILE__)+'/../../lib'
4
4
  require 'wukong'
5
- require 'wukong/models/graph'; include Wukong::Models
6
5
 
7
6
  #
8
7
  # Takes any number of flavors of directed edge with the form
@@ -88,17 +87,27 @@ module GenMultiEdge
88
87
  yield self.multi_edge
89
88
  end
90
89
  end
90
+ end
91
91
 
92
- #
93
- # Sort on the first two keys: each @[src, dest]@ pair winds up at the same
94
- # reducer.
95
- #
96
- class Script < Wukong::Script
97
- def default_options
98
- super.merge :sort_fields => 2
99
- end
100
- end
92
+ Edge = TypedStruct.new(
93
+ [:src, Integer],
94
+ [:dest, Integer]
95
+ )
101
96
 
102
- # Execute the script
103
- Script.new(Mapper, Reducer).run
104
- end
97
+ MultiEdge = TypedStruct.new(
98
+ [:src, Integer],
99
+ [:dest, Integer],
100
+ [:a_follows_b, Integer],
101
+ [:b_follows_a, Integer],
102
+ [:a_replies_b, Integer],
103
+ [:b_replies_a, Integer],
104
+ [:a_atsigns_b, Integer],
105
+ [:b_atsigns_a, Integer],
106
+ [:a_retweets_b, Integer],
107
+ [:b_retweets_a, Integer],
108
+ [:a_favorites_b, Integer],
109
+ [:b_favorites_a, Integer]
110
+ )
111
+
112
+ # Execute the script
113
+ Script.new(Mapper, Reducer, :sort_fields => 2).run
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  $: << File.dirname(__FILE__)+'/../../lib'
3
- require 'wukong'
3
+ require 'wukong/script'
4
4
 
5
5
  #
6
6
  #
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  $: << File.dirname(__FILE__)+'/../../lib'
3
- require 'wukong'
4
- require 'wukong/streamer/set_reducer'
3
+ require 'wukong/script'
4
+ require 'wukong/streamer/list_reducer'
5
5
 
6
6
  module PageRank
7
7
  class Script < Wukong::Script
@@ -15,10 +15,6 @@ module PageRank
15
15
  def map_command
16
16
  %Q{/usr/bin/cut -d"\t" -f2,3}
17
17
  end
18
-
19
- def default_options
20
- super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
21
- end
22
18
  end
23
19
 
24
20
  #
@@ -28,18 +24,18 @@ module PageRank
28
24
  #
29
25
  class Reducer < Wukong::Streamer::ListReducer
30
26
  def accumulate src, dest
31
- self.values << dest
27
+ @values << dest
32
28
  end
33
29
 
34
30
  # Emit src, initial pagerank, and flattened dests list
35
31
  def finalize
36
- self.values = ['dummy'] if self.values.blank?
37
- yield [key, 1.0, self.values.to_a.join(",")]
32
+ @values = ['dummy'] if @values.blank?
33
+ yield [key, 1.0, @values.to_a.join(",")]
38
34
  end
39
35
  end
40
36
 
41
37
  # Execute the script
42
- Script.new(nil, PageRank::Reducer).run
38
+ Script.new(nil, PageRank::Reducer, :io_sort_record_percent => 0.25).run
43
39
  end
44
40
 
45
41
 
@@ -1,7 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
2
  require 'rubygems'
4
- require 'wukong'
3
+ require 'wukong/script'
4
+
5
+ Settings.define :sampling_fraction, :type => Float, :required => true, :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
5
6
 
6
7
  #
7
8
  # Probabilistically emit some fraction of record/lines
@@ -14,30 +15,19 @@ require 'wukong'
14
15
  class Mapper < Wukong::Streamer::LineStreamer
15
16
  include Wukong::Streamer::Filter
16
17
 
17
- #
18
- # floating-point number between 0 and 1 giving the fraction of lines to emit:
19
- # at sampling_fraction=1 all records are emitted, at 0 none are.
20
- #
21
- # Takes its value from a mandatory command-line option
22
- #
23
- def sampling_fraction
24
- @sampling_fraction ||= ( options[:sampling_fraction] && options[:sampling_fraction].to_f ) or
25
- raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1"
26
- end
27
-
28
18
  #
29
19
  # randomly decide to emit +sampling_fraction+ fraction of lines
30
20
  #
31
21
  def emit? line
32
- rand < self.sampling_fraction
22
+ rand < Settings.sampling_fraction
33
23
  end
34
24
  end
35
25
 
36
26
  #
37
27
  # Executes the script
38
28
  #
39
- Wukong::Script.new( Mapper,
29
+ Wukong.run( Mapper,
40
30
  nil,
41
31
  :reduce_tasks => 0,
42
32
  :reuse_jvms => true
43
- ).run
33
+ )
@@ -1,22 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
2
  require 'rubygems'
4
- require 'wukong'
3
+ require 'wukong/script'
5
4
 
6
- MONTHS = {
7
- 'Jan' => '01',
8
- 'Feb' => '02',
9
- 'Mar' => '03',
10
- 'Apr' => '04',
11
- 'May' => '05',
12
- 'Jun' => '06',
13
- 'Jul' => '07',
14
- 'Aug' => '08',
15
- 'Sep' => '09',
16
- 'Oct' => '10',
17
- 'Nov' => '11',
18
- 'Dec' => '12',
19
- }
20
5
  module ApacheLogParser
21
6
  class Mapper < Wukong::Streamer::LineStreamer
22
7
 
@@ -40,6 +25,7 @@ module ApacheLogParser
40
25
  \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
41
26
  \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
42
27
  \z}x)
28
+ MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
43
29
 
44
30
  # Use the regex to break line into fields
45
31
  # Emit each record as flat line
@@ -49,14 +35,13 @@ module ApacheLogParser
49
35
  if m
50
36
  (ip, j1, j2,
51
37
  ts_day, ts_mo, ts_year,
52
- ts_hour, ts_min, ts_sec, req_tz,
38
+ ts_hour, ts_min, ts_sec, tz,
53
39
  http_method, path, protocol,
54
40
  response_code, duration,
55
41
  referer, ua, *cruft) = m.captures
56
- # DateTime.parse("#{datepart} #{timepart}").to_flat # this takes way too long
57
- req_date = [ts_year, MONTHS[ts_mo], ts_day].join("")
58
- req_time = [ts_hour, ts_min, ts_sec].join("")
59
- yield [:logline, ip, req_date, req_time, http_method, protocol, path, response_code, duration, referer, ua, req_tz]
42
+ date = [ts_year, MONTHS[ts_mo], ts_day].join("")
43
+ time = [ts_hour, ts_min, ts_sec].join("")
44
+ yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
60
45
  else
61
46
  yield [:unparseable, line]
62
47
  end
@@ -65,7 +50,7 @@ module ApacheLogParser
65
50
  end
66
51
  end
67
52
 
68
- Wukong::Script.new(ApacheLogParser::Mapper, nil, :sort_fields => 7).run
53
+ Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
69
54
 
70
55
  # 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
71
56
 
@@ -1,3 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
+
5
#
# One parsed web-server log line, with helpers that classify the requested
# path and bucket the request by hour.
#
class Logline < Struct.new(
    :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)

  # Classify the request by the extension that ends +path+.
  # Returns :asset, :image, :page or :other.
  def page_type
    case
    when path =~ /\.(css|js)$/                   then :asset
    when path =~ /\.(png|gif|ico)$/              then :image
    when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/   then :page
    else                                              :other
    end
  end

  # True when the path is classified as a page.
  def is_page?
    page_type == :page
  end

  # The date followed by the first two characters of the time, e.g.
  # '20080204' and '113752' give '2008020411'.
  # Uses this record's own fields: there is no +visit+ in scope here, so the
  # earlier `visit.date + visit.time[0..1]` raised NameError.
  def day_hr
    date + time[0..1]
  end
end
25
+
26
+
1
27
  #
2
28
  # Group all visitors, and then troll through all the pages they've visited
3
29
  # breaking each into distinct visits (where more than an [hour|day|whatever]
@@ -12,6 +38,11 @@
12
38
  #
13
39
  # where the partition key is visitor_id, and we sort by visitor_id and datetime.
14
40
  #
41
# Mapper: for each visit record, emit its ip, day_hr and path.
class VisitorDatePath < Wukong::Streamer::StructStreamer
  def process(visit, *_rest)
    emitted = [visit.ip, visit.day_hr, visit.path]
    yield emitted
  end
end
15
46
 
16
47
  #
17
48
  # Reducer:
@@ -34,3 +65,11 @@
34
65
  # page_trails <pagen> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
35
66
  #
36
67
  # to discover all trails passing through a given page.
68
# Reducer: groups records by [ip, day_hr].
#
# Named VisitorDatePathReducer because VisitorDatePath is already defined
# earlier in this file as a StructStreamer subclass; reopening that constant
# with a different superclass raises TypeError (superclass mismatch).
class VisitorDatePathReducer < Wukong::Streamer::Reducer
  # The grouping key is the visitor's ip plus the day-and-hour bucket.
  def get_key(ip, day_hr, path, *args)
    [ip, day_hr]
  end

  # NOTE(review): this reads visit.ip / visit.day_hr / visit.path, while
  # get_key receives ip, day_hr and path as separate fields -- confirm what
  # the framework actually passes to process_group.
  def process_group(visit, *args)
    yield [visit.ip, visit.day_hr, visit.path]
  end
end
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
+
5
#
# One parsed web-server log line, with helpers that classify the requested
# path.
#
class Logline < Struct.new(
    :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)

  # Classify the request by the extension that ends +path+: the first
  # matching pattern wins, and anything unmatched is :other.
  def page_type
    return :asset if path =~ /\.(css|js)$/
    return :image if path =~ /\.(png|gif|ico)$/
    return :page  if path =~ /\.(pl|s?html?|asp|jsp|cgi)$/
    :other
  end

  # True when the path is classified as a page.
  def is_page?
    kind = page_type
    kind == :page
  end
end
21
+
22
# Emits the user agent of every visit whose path is classified as a page.
class PageFilter < Wukong::Streamer::StructStreamer
  def process(visit, *args)
    # The condition was left dangling (`if visit.`), which does not parse.
    # is_page? is the only predicate Logline defines in this file.
    # NOTE(review): confirm that this is the intended filter.
    yield visit.ua if visit.is_page?
  end
end
27
+ Wukong.run(PageFilter)
data/examples/size.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  $: << File.dirname(__FILE__)+'/../lib'
3
- require 'wukong'
3
+ require 'wukong/script'
4
4
 
5
5
  module Size
6
6
  #
@@ -56,5 +56,6 @@ end
56
56
  # Execute the script
57
57
  Size::Script.new(
58
58
  nil,
59
- Size::Reducer
59
+ Size::Reducer,
60
+ :reduce_tasks => 1
60
61
  ).run
@@ -1,10 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
-
3
- require 'rubygems'
4
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong/script'
5
4
  require 'wukong/streamer/count_keys'
6
5
 
7
-
8
6
  #
9
7
  # Ch3ck out dis moist azz code bitches!!
10
8
  #
@@ -70,14 +68,14 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
70
68
  table << "TRSTRANK_TABLE = " << count_bin.inspect
71
69
  table.close
72
70
  end
73
-
71
+
74
72
  #
75
- # Return percentile of a given trstrank for a given follower bracket
73
+ # Return percentile of a given trstrank for a given follower bracket
76
74
  #
77
75
  def percentile bin, rank
78
- ((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
76
+ ((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
79
77
  end
80
-
78
+
81
79
  #
82
80
  # Return the count of values less than rank
83
81
  #
@@ -119,7 +117,7 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
119
117
  big_list.uniq.sort{|x,y| x.first <=> y.first}
120
118
  end
121
119
 
122
-
120
+
123
121
  #
124
122
  # Nothing to see here, move along
125
123
  #
@@ -132,11 +130,11 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
132
130
  num.times do |i|
133
131
  x = pair1.first + (i+1).to_f*dx
134
132
  y = m*x + b
135
- points << [x,y]
133
+ points << [x,y]
136
134
  end
137
135
  points # return an array of pairs
138
136
  end
139
-
137
+
140
138
  end
141
139
 
142
140
  Wukong::Script.new(Mapper,Reducer).run
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong/script'
4
4
  require 'wukong/streamer/rank_and_bin_reducer'
5
5
 
6
6
  #
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong/script'
4
4
 
5
5
  # Run as (local mode)
6
6
  #
@@ -15,14 +15,14 @@ require 'wukong'
15
15
  # cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
16
16
  #
17
17
 
18
- #
19
- # A very simple mapper -- looks for a regex match in one field,
20
- # and emits the whole record if the field matches
21
- #
22
- class GrepMapper < Wukong::Streamer::RecordStreamer
23
-
18
+ class Mapper < LineStreamer
19
+ include Filter
24
20
  MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
25
21
 
22
+ #
23
+ # A very simple mapper -- looks for a regex match in one field,
24
+ # and emits the whole record if the field matches
25
+ #
26
26
  #
27
27
  # Given a series of records like:
28
28
  #
@@ -31,13 +31,10 @@ class GrepMapper < Wukong::Streamer::RecordStreamer
31
31
  #
32
32
  # emits only the lines matching that regex
33
33
  #
34
- def process rsrc, id, timestamp, text, *rest
35
- yield [rsrc, id, timestamp, text, *rest] if line =~ MATCHER
34
+ def emit? line
35
+ MATCHER.match line
36
36
  end
37
37
  end
38
38
 
39
39
  # Execute the script
40
- Wukong::Script.new(
41
- GrepMapper,
42
- nil
43
- ).run
40
+ Wukong.run(Mapper)