wukong 1.5.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (87) hide show
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'wukong'
5
+
6
+ require 'bloomfilter-rb'
7
+
8
+ SIZE = 2**24
9
+
10
#
# Approximate counter: each value is hashed into one of a fixed number of
# buckets, and the per-bucket tallies are kept in Redis.
#
# Distinct values that land in the same bucket share one tally, so a lookup
# reports the combined count of everything in that bucket.
#
class BucketCounter
  # @param opts [Hash] overrides for the defaults below
  #   :size   -- number of buckets (default 100)
  #   :server -- options hash handed to Redis.new (default {})
  def initialize(opts = {})
    @opts = {
      :size   => 100,
      :server => {}
    }.merge opts
    @db   = ::Redis.new(@opts[:server])
    # Read the size from the merged options, not the raw argument, so the
    # default applies when the caller does not pass :size.
    @size = @opts[:size]
  end

  # Bucket number for +val+: its hash reduced modulo the bucket count.
  def key_for val
    (val.hash % @size)
  end

  # Add one to the tally of the bucket +val+ falls in.
  def insert(val)
    @db.incr(key_for(val))
  end
  alias :<< :insert

  # Subtract one from the tally of +val+'s bucket, removing the key from the
  # store once the tally drops to zero or below.
  def delete(val)
    if @db.decr(key_for(val)).to_i <= 0
      @db.del(key_for(val))
    end
  end

  # Current tally of the bucket +val+ falls in, as an Integer
  # (the stored value is passed through to_i).
  def [](val)
    @db.get(key_for(val)).to_i
  end

  # Flush the database this counter is connected to.
  def clear
    @db.flushdb
  end
end
43
+
44
+ bf = BucketCounter.new(:size => 1_000, :server => {:host => 'localhost'})
45
+ bf.clear
46
+ counts = Hash.new{|h,k| h[k] = 0 }
47
+
48
+ doc = File.read(__FILE__)
49
+ doc.split(/\W+/).each do |word|
50
+ counts[word] += 1
51
+ bf << word
52
+ end
53
+
54
+ counts.keys.sort.each do |word|
55
+ puts [ bf[word] - counts[word], bf[word], counts[word], word.hash % SIZE, word ].join("\t")
56
+ end
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'backports'
4
+ require 'backports/1.8.8'
5
+ require 'extlib'
6
+
7
+ class Source
8
+ # include Enumerable
9
+ attr_reader :streamer
10
+
11
+ def recordize line
12
+ # line.strip.split("\t")
13
+ [line[0..5]]
14
+ end
15
+
16
+ def each *args
17
+ $stdin.each(*args) do |raw_record|
18
+ record = recordize(raw_record)
19
+ next if record.blank?
20
+ yield *record
21
+ break if raw_record =~ /end/
22
+ end
23
+ end
24
+ end
25
+
26
+ # def process_group group
27
+ # end
28
+ #
29
+
30
#
# Reads $stdin and hands it to the caller one group of lines at a time.
#
class Streamer

  # Turn a raw line into a record: a one-element array holding the first six
  # characters of the line.
  def recordize line
    [line[0..5]]
  end

  # Yield successive groups of raw lines read from $stdin. A group ends with
  # (and includes) the first line matching /end/; any lines left over at end
  # of input form a final group.
  #
  # Each group is built eagerly as an Array, so input is consumed whether or
  # not the caller iterates over the group.
  #
  # Returns an Enumerator over the groups when called without a block.
  def each_group
    return enum_for(:each_group) unless block_given?
    group = []
    $stdin.each do |line|
      group << line
      next unless line =~ /end/
      yield group
      group = []
    end
    yield group unless group.empty?
  end
end
48
+
49
foo = Streamer.new

# For every group the streamer yields, announce it and print each line reversed.
foo.each_group do |group|
  puts "hi"
  # Braces bind the block to #each. Written as `p group.each do ... end`, the
  # do...end block attaches to `p`, which ignores it, so the body never runs.
  group.each { |line| p line.reverse }
  # .map do |record|
  #   1
  # end
end
60
+
61
+
62
+ # i = 0
63
+ # # s = source.new(Streamer.new)
64
+ # $stdin.each do
65
+ # process_group do |output|
66
+ # puts output
67
+ # end
68
+ # $stderr.puts [Time.now, i] if (i += 1) % 10 == 0
69
+ # end
70
+
71
+
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << '/home/flip/ics/wukong/lib' # ENV['WUKONG_PATH']
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong/script'
4
4
 
5
5
  #
6
6
  # Given an adjacency pairs (from \t to) representation of a directed graph:
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << ENV['WUKONG_PATH']
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+ require 'wukong/script'
4
4
 
5
5
  #
6
6
  # Use this script to do a Breadth-First Search (BFS) of a graph.
@@ -9,19 +9,18 @@ require 'wukong'
9
9
  # ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
10
10
  #
11
11
  # For example, given an edge list in the file '1path.tsv' that looks like
12
- # 1path n1 n2
13
- # 1path n1 n3
12
+ # 1path n1 n2
13
+ # 1path n1 n3
14
14
  # ... and so forth ...
15
15
  # you can run
16
16
  # for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
17
17
  # to do a 9-deep breadth-first search.
18
18
  #
19
19
  module Gen1HoodEdges
20
- class Mapper < Wukong::Streamer::Base
21
- attr_accessor :head, :tail
22
- def initialize options
23
- self.head = options[:head]
24
- self.tail = options[:tail]
20
+ class Mapper < Wukong::Streamer::RecordStreamer
21
+ def initialize
22
+ @head = Settings[:head]
23
+ @tail = Settings[:tail]
25
24
  end
26
25
  def process rsrc, *nodes
27
26
  yield [ nodes.last, 'i', nodes[0..-2] ] if (rsrc == self.head)
@@ -37,8 +36,8 @@ module Gen1HoodEdges
37
36
  #
38
37
  class Reducer < Wukong::Streamer::AccumulatingReducer
39
38
  attr_accessor :paths_in, :out_rsrc
40
- def initialize options
41
- self.out_rsrc = options[:out_rsrc]
39
+ def initialize
40
+ self.out_rsrc = Settings[:out_rsrc]
42
41
  end
43
42
  # clear the list of incoming paths
44
43
  def start! *args
@@ -63,17 +62,11 @@ module Gen1HoodEdges
63
62
  mid
64
63
  end
65
64
  end
66
-
67
- class Script < Wukong::Script
68
- def default_options
69
- super.merge :sort_fields => 2, :partition_fields => 1
70
- end
71
- end
72
-
73
65
  end
74
66
 
75
67
  # Execute the script
76
- Gen1HoodEdges::Script.new(
68
+ Wukong.run(
77
69
  Gen1HoodEdges::Mapper,
78
- Gen1HoodEdges::Reducer
79
- ).run
70
+ Gen1HoodEdges::Reducer,
71
+ :sort_fields => 2, :partition_fields => 1
72
+ )
@@ -2,7 +2,6 @@
2
2
  require 'rubygems'
3
3
  $: << File.dirname(__FILE__)+'/../../lib'
4
4
  require 'wukong'
5
- require 'wukong/models/graph'; include Wukong::Models
6
5
 
7
6
  #
8
7
  # Takes any number of flavors of directed edge with the form
@@ -88,17 +87,27 @@ module GenMultiEdge
88
87
  yield self.multi_edge
89
88
  end
90
89
  end
90
+ end
91
91
 
92
- #
93
- # Sort on the first two keys: each @[src, dest]@ pair winds up at the same
94
- # reducer.
95
- #
96
- class Script < Wukong::Script
97
- def default_options
98
- super.merge :sort_fields => 2
99
- end
100
- end
92
+ Edge = TypedStruct.new(
93
+ [:src, Integer],
94
+ [:dest, Integer]
95
+ )
101
96
 
102
- # Execute the script
103
- Script.new(Mapper, Reducer).run
104
- end
97
+ MultiEdge = TypedStruct.new(
98
+ [:src, Integer],
99
+ [:dest, Integer],
100
+ [:a_follows_b, Integer],
101
+ [:b_follows_a, Integer],
102
+ [:a_replies_b, Integer],
103
+ [:b_replies_a, Integer],
104
+ [:a_atsigns_b, Integer],
105
+ [:b_atsigns_a, Integer],
106
+ [:a_retweets_b, Integer],
107
+ [:b_retweets_a, Integer],
108
+ [:a_favorites_b, Integer],
109
+ [:b_favorites_a, Integer]
110
+ )
111
+
112
+ # Execute the script
113
+ Script.new(Mapper, Reducer, :sort_fields => 2).run
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  $: << File.dirname(__FILE__)+'/../../lib'
3
- require 'wukong'
3
+ require 'wukong/script'
4
4
 
5
5
  #
6
6
  #
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  $: << File.dirname(__FILE__)+'/../../lib'
3
- require 'wukong'
4
- require 'wukong/streamer/set_reducer'
3
+ require 'wukong/script'
4
+ require 'wukong/streamer/list_reducer'
5
5
 
6
6
  module PageRank
7
7
  class Script < Wukong::Script
@@ -15,10 +15,6 @@ module PageRank
15
15
  def map_command
16
16
  %Q{/usr/bin/cut -d"\t" -f2,3}
17
17
  end
18
-
19
- def default_options
20
- super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
21
- end
22
18
  end
23
19
 
24
20
  #
@@ -28,18 +24,18 @@ module PageRank
28
24
  #
29
25
  class Reducer < Wukong::Streamer::ListReducer
30
26
  def accumulate src, dest
31
- self.values << dest
27
+ @values << dest
32
28
  end
33
29
 
34
30
  # Emit src, initial pagerank, and flattened dests list
35
31
  def finalize
36
- self.values = ['dummy'] if self.values.blank?
37
- yield [key, 1.0, self.values.to_a.join(",")]
32
+ @values = ['dummy'] if @values.blank?
33
+ yield [key, 1.0, @values.to_a.join(",")]
38
34
  end
39
35
  end
40
36
 
41
37
  # Execute the script
42
- Script.new(nil, PageRank::Reducer).run
38
+ Script.new(nil, PageRank::Reducer, :io_sort_record_percent => 0.25).run
43
39
  end
44
40
 
45
41
 
@@ -1,7 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
2
  require 'rubygems'
4
- require 'wukong'
3
+ require 'wukong/script'
4
+
5
+ Settings.define :sampling_fraction, :type => Float, :required => true, :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
5
6
 
6
7
  #
7
8
  # Probabilistically emit some fraction of record/lines
@@ -14,30 +15,19 @@ require 'wukong'
14
15
  class Mapper < Wukong::Streamer::LineStreamer
15
16
  include Wukong::Streamer::Filter
16
17
 
17
- #
18
- # floating-point number between 0 and 1 giving the fraction of lines to emit:
19
- # at sampling_fraction=1 all records are emitted, at 0 none are.
20
- #
21
- # Takes its value from a mandatory command-line option
22
- #
23
- def sampling_fraction
24
- @sampling_fraction ||= ( options[:sampling_fraction] && options[:sampling_fraction].to_f ) or
25
- raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1"
26
- end
27
-
28
18
  #
29
19
  # randomly decide to emit +sampling_fraction+ fraction of lines
30
20
  #
31
21
  def emit? line
32
- rand < self.sampling_fraction
22
+ rand < Settings.sampling_fraction
33
23
  end
34
24
  end
35
25
 
36
26
  #
37
27
  # Executes the script
38
28
  #
39
- Wukong::Script.new( Mapper,
29
+ Wukong.run( Mapper,
40
30
  nil,
41
31
  :reduce_tasks => 0,
42
32
  :reuse_jvms => true
43
- ).run
33
+ )
@@ -1,22 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
2
  require 'rubygems'
4
- require 'wukong'
3
+ require 'wukong/script'
5
4
 
6
- MONTHS = {
7
- 'Jan' => '01',
8
- 'Feb' => '02',
9
- 'Mar' => '03',
10
- 'Apr' => '04',
11
- 'May' => '05',
12
- 'Jun' => '06',
13
- 'Jul' => '07',
14
- 'Aug' => '08',
15
- 'Sep' => '09',
16
- 'Oct' => '10',
17
- 'Nov' => '11',
18
- 'Dec' => '12',
19
- }
20
5
  module ApacheLogParser
21
6
  class Mapper < Wukong::Streamer::LineStreamer
22
7
 
@@ -40,6 +25,7 @@ module ApacheLogParser
40
25
  \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
41
26
  \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
42
27
  \z}x)
28
+ MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
43
29
 
44
30
  # Use the regex to break line into fields
45
31
  # Emit each record as flat line
@@ -49,14 +35,13 @@ module ApacheLogParser
49
35
  if m
50
36
  (ip, j1, j2,
51
37
  ts_day, ts_mo, ts_year,
52
- ts_hour, ts_min, ts_sec, req_tz,
38
+ ts_hour, ts_min, ts_sec, tz,
53
39
  http_method, path, protocol,
54
40
  response_code, duration,
55
41
  referer, ua, *cruft) = m.captures
56
- # DateTime.parse("#{datepart} #{timepart}").to_flat # this takes way too long
57
- req_date = [ts_year, MONTHS[ts_mo], ts_day].join("")
58
- req_time = [ts_hour, ts_min, ts_sec].join("")
59
- yield [:logline, ip, req_date, req_time, http_method, protocol, path, response_code, duration, referer, ua, req_tz]
42
+ date = [ts_year, MONTHS[ts_mo], ts_day].join("")
43
+ time = [ts_hour, ts_min, ts_sec].join("")
44
+ yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
60
45
  else
61
46
  yield [:unparseable, line]
62
47
  end
@@ -65,7 +50,7 @@ module ApacheLogParser
65
50
  end
66
51
  end
67
52
 
68
- Wukong::Script.new(ApacheLogParser::Mapper, nil, :sort_fields => 7).run
53
+ Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
69
54
 
70
55
  # 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
71
56
 
@@ -1,3 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
+
5
#
# One parsed web-server log line, with helpers to classify the requested path
# and to bucket the request by hour.
#
class Logline < Struct.new(
    :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)

  # Classify the request by the extension at the end of +path+:
  # :asset (css, js), :image (png, gif, ico), :page (pl, htm/html/shtml, asp,
  # jsp, cgi), or :other for everything else.
  def page_type
    case path
    when /\.(css|js)$/                   then :asset
    when /\.(png|gif|ico)$/              then :image
    when /\.(pl|s?html?|asp|jsp|cgi)$/   then :page
    else                                      :other
    end
  end

  # True when the request is for a page (see #page_type).
  def is_page?
    page_type == :page
  end

  # The date followed by the first two characters of the time.
  # Uses this record's own members; there is no `visit` in scope here.
  def day_hr
    date + time[0..1]
  end
end
25
+
26
+
1
27
  #
2
28
  # Group all visitors, and then troll through all the pages they've visited
3
29
  # breaking each into distinct visits (where more than an [hour|day|whatever]
@@ -12,6 +38,11 @@
12
38
  #
13
39
  # where the partition key is visitor_id, and we sort by visitor_id and datetime.
14
40
  #
41
+ class VisitorDatePath < Wukong::Streamer::StructStreamer
42
+ def process visit, *args
43
+ yield [visit.ip, visit.day_hr, visit.path]
44
+ end
45
+ end
15
46
 
16
47
  #
17
48
  # Reducer:
@@ -34,3 +65,11 @@
34
65
  # page_trails <pagen> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
35
66
  #
36
67
  # to discover all trails passing through a given page.
68
+ class VisitorDatePath < Wukong::Streamer::Reducer
69
+ def get_key ip, day_hr, path, *args
70
+ [ip, day_hr]
71
+ end
72
+ def process_group visit, *args
73
+ yield [visit.ip, visit.day_hr, visit.path]
74
+ end
75
+ end
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'wukong/script'
4
+
5
+ class Logline < Struct.new(
6
+ :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
7
+
8
+ def page_type
9
+ case
10
+ when path =~ /\.(css|js)$/ then :asset
11
+ when path =~ /\.(png|gif|ico)$/ then :image
12
+ when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
13
+ else :other
14
+ end
15
+ end
16
+
17
+ def is_page?
18
+ page_type == :page
19
+ end
20
+ end
21
+
22
#
# Emits the user agent of every visit that requested a page.
#
# NOTE(review): assumes the streamer hands #process a Logline (the struct
# defined in this file, which provides #ua and #is_page?) -- confirm against
# Wukong::Streamer::StructStreamer.
#
class PageFilter < Wukong::Streamer::StructStreamer
  def process visit, *args
    # The original guard was cut off after `visit.`; is_page? is the predicate
    # Logline defines in this file.
    yield visit.ua if visit.is_page?
  end
end
27
+ Wukong.run(PageFilter)
data/examples/size.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  $: << File.dirname(__FILE__)+'/../lib'
3
- require 'wukong'
3
+ require 'wukong/script'
4
4
 
5
5
  module Size
6
6
  #
@@ -56,5 +56,6 @@ end
56
56
  # Execute the script
57
57
  Size::Script.new(
58
58
  nil,
59
- Size::Reducer
59
+ Size::Reducer,
60
+ :reduce_tasks => 1
60
61
  ).run
@@ -1,10 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
-
3
- require 'rubygems'
4
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong/script'
5
4
  require 'wukong/streamer/count_keys'
6
5
 
7
-
8
6
  #
9
7
  # Ch3ck out dis moist azz code bitches!!
10
8
  #
@@ -70,14 +68,14 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
70
68
  table << "TRSTRANK_TABLE = " << count_bin.inspect
71
69
  table.close
72
70
  end
73
-
71
+
74
72
  #
75
- # Return percentile of a given trstrank for a given follower bracket
73
+ # Return percentile of a given trstrank for a given follower bracket
76
74
  #
77
75
  def percentile bin, rank
78
- ((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
76
+ ((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
79
77
  end
80
-
78
+
81
79
  #
82
80
  # Return the count of values less than rank
83
81
  #
@@ -119,7 +117,7 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
119
117
  big_list.uniq.sort{|x,y| x.first <=> y.first}
120
118
  end
121
119
 
122
-
120
+
123
121
  #
124
122
  # Nothing to see here, move along
125
123
  #
@@ -132,11 +130,11 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
132
130
  num.times do |i|
133
131
  x = pair1.first + (i+1).to_f*dx
134
132
  y = m*x + b
135
- points << [x,y]
133
+ points << [x,y]
136
134
  end
137
135
  points # return an array of pairs
138
136
  end
139
-
137
+
140
138
  end
141
139
 
142
140
  Wukong::Script.new(Mapper,Reducer).run
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong/script'
4
4
  require 'wukong/streamer/rank_and_bin_reducer'
5
5
 
6
6
  #
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong'
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong/script'
4
4
 
5
5
  # Run as (local mode)
6
6
  #
@@ -15,14 +15,14 @@ require 'wukong'
15
15
  # cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
16
16
  #
17
17
 
18
- #
19
- # A very simple mapper -- looks for a regex match in one field,
20
- # and emits the whole record if the field matches
21
- #
22
- class GrepMapper < Wukong::Streamer::RecordStreamer
23
-
18
+ class Mapper < LineStreamer
19
+ include Filter
24
20
  MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
25
21
 
22
+ #
23
+ # A very simple mapper -- looks for a regex match in one field,
24
+ # and emits the whole record if the field matches
25
+ #
26
26
  #
27
27
  # Given a series of records like:
28
28
  #
@@ -31,13 +31,10 @@ class GrepMapper < Wukong::Streamer::RecordStreamer
31
31
  #
32
32
  # emits only the lines matching that regex
33
33
  #
34
- def process rsrc, id, timestamp, text, *rest
35
- yield [rsrc, id, timestamp, text, *rest] if line =~ MATCHER
34
+ def emit? line
35
+ MATCHER.match line
36
36
  end
37
37
  end
38
38
 
39
39
  # Execute the script
40
- Wukong::Script.new(
41
- GrepMapper,
42
- nil
43
- ).run
40
+ Wukong.run(Mapper)