wukong 1.5.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong'
|
5
|
+
|
6
|
+
require 'bloomfilter-rb'
|
7
|
+
|
8
|
+
SIZE = 2**24
|
9
|
+
|
10
|
+
#
# Approximate occurrence counter. Each value is hashed into one of +size+
# buckets and the per-bucket tallies are stored in a Redis database. Values
# that hash to the same bucket share a single tally.
#
class BucketCounter
  #
  # opts may override the defaults:
  #   :size   -- number of buckets (default 100)
  #   :server -- options hash handed to Redis.new (default {})
  #
  def initialize(opts = {})
    @opts = {
      :size   => 100,
      :server => {}
    }.merge opts
    @db   = ::Redis.new(@opts[:server])
    # Read the size from the merged options so the default applies when the
    # caller does not supply :size (the raw opts hash would give nil here).
    @size = @opts[:size]
  end

  # Bucket number for +val+: its hash modulo the number of buckets.
  def key_for val
    (val.hash % @size)
  end

  # Add one to the tally of the bucket +val+ falls in.
  def insert(val)
    @db.incr(key_for(val))
  end
  alias :<< :insert

  # Subtract one from the tally of +val+'s bucket; once the tally is zero or
  # below, the bucket's key is removed from the database entirely.
  def delete(val)
    if @db.decr(key_for(val)).to_i <= 0
      @db.del(key_for(val))
    end
  end

  # Current tally of +val+'s bucket as an Integer (0 when the key is absent).
  def [](val)
    @db.get(key_for(val)).to_i
  end

  # Flush the whole Redis database this counter is connected to.
  def clear
    @db.flushdb
  end
end
|
43
|
+
|
44
|
+
# Tally every word of this very file twice -- exactly in a Hash and
# approximately in a 1,000-bucket BucketCounter -- then print one
# tab-separated row per distinct word:
#   bucket count minus exact count, bucket count, exact count,
#   word hash modulo SIZE, word
counter = BucketCounter.new(:size => 1_000, :server => { :host => 'localhost' })
counter.clear
tally = Hash.new(0)

File.read(__FILE__).split(/\W+/).each do |token|
  tally[token] += 1
  counter << token
end

tally.keys.sort.each do |token|
  overcount = counter[token] - tally[token]
  puts [overcount, counter[token], tally[token], token.hash % SIZE, token].join("\t")
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'backports'
|
4
|
+
require 'backports/1.8.8'
|
5
|
+
require 'extlib'
|
6
|
+
|
7
|
+
#
# Reads records from $stdin and hands them to a block, stopping after the
# first raw line that matches /end/.
#
class Source
  # include Enumerable
  attr_reader :streamer

  # Build a record from a raw line: a one-element array holding at most the
  # first six characters of the line.
  def recordize(line)
    # line.strip.split("\t")
    prefix = line[0..5]
    [prefix]
  end

  # Iterate over $stdin (any arguments are forwarded to $stdin.each), yielding
  # the fields of every non-blank record. The line matching /end/ is itself
  # yielded before iteration stops.
  def each(*args)
    $stdin.each(*args) do |raw_record|
      fields = recordize(raw_record)
      unless fields.blank?
        yield(*fields)
        break if raw_record =~ /end/
      end
    end
  end
end
|
25
|
+
|
26
|
+
# def process_group group
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
|
30
|
+
#
# Splits $stdin into groups of lines. A group ends with (and includes) the
# first line matching /end/, or runs to the end of input.
#
class Streamer

  # Build a record from a raw line: a one-element array holding at most the
  # first six characters of the line.
  def recordize line
    [line[0..5]]
  end

  #
  # Yield one Enumerator per group of lines on $stdin. Iterating the
  # Enumerator reads the group's lines from $stdin on demand.
  #
  # The Enumerator is now handed to the caller's block; before, it was built
  # and dropped without ever being iterated, so $stdin was never consumed and
  # the loop could not reach end of input.
  #
  def each_group
    until $stdin.eof?
      consumed = false
      group = Enumerator.new do |yielder|
        # A group can be read only once; later iterations yield nothing.
        unless consumed
          $stdin.each do |line|
            yielder << line
            break if line =~ /end/
          end
          consumed = true
        end
      end
      yield group
      # Discard whatever the caller left unread, so the next pass starts at
      # the next group and the loop always makes progress.
      group.each { } unless consumed
    end
  end
end
|
48
|
+
|
49
|
+
foo = Streamer.new

# Announce every group, then print each of its lines reversed.
foo.each_group do |group|
  puts "hi"
  # The block is attached to group.each with braces. Written as
  # `p group.each do ... end` the do-block bound to `p` instead, so the
  # per-line body never ran.
  group.each { |line| p line.reverse }
  # .map do |record|
  #   1
  # end
end
|
60
|
+
|
61
|
+
|
62
|
+
# i = 0
|
63
|
+
# # s = source.new(Streamer.new)
|
64
|
+
# $stdin.each do
|
65
|
+
# process_group do |output|
|
66
|
+
# puts output
|
67
|
+
# end
|
68
|
+
# $stderr.puts [Time.now, i] if (i += 1) % 10 == 0
|
69
|
+
# end
|
70
|
+
|
71
|
+
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: <<
|
3
|
-
require 'wukong'
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
require 'wukong/script'
|
4
4
|
|
5
5
|
#
|
6
6
|
# Use this script to do a Breadth-First Search (BFS) of a graph.
|
@@ -9,19 +9,18 @@ require 'wukong'
|
|
9
9
|
# ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
|
10
10
|
#
|
11
11
|
# For example, given an edge list in the file '1path.tsv' that looks like
|
12
|
-
# 1path
|
13
|
-
# 1path n1
|
12
|
+
# 1path n1 n2
|
13
|
+
# 1path n1 n3
|
14
14
|
# ... and so forth ...
|
15
15
|
# you can run
|
16
16
|
# for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
|
17
17
|
# to do a 9-deep breadth-first search.
|
18
18
|
#
|
19
19
|
module Gen1HoodEdges
|
20
|
-
class Mapper < Wukong::Streamer::
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
self.tail = options[:tail]
|
20
|
+
class Mapper < Wukong::Streamer::RecordStreamer
|
21
|
+
def initialize
|
22
|
+
@head = Settings[:head]
|
23
|
+
@tail = Settings[:tail]
|
25
24
|
end
|
26
25
|
def process rsrc, *nodes
|
27
26
|
yield [ nodes.last, 'i', nodes[0..-2] ] if (rsrc == self.head)
|
@@ -37,8 +36,8 @@ module Gen1HoodEdges
|
|
37
36
|
#
|
38
37
|
class Reducer < Wukong::Streamer::AccumulatingReducer
|
39
38
|
attr_accessor :paths_in, :out_rsrc
|
40
|
-
def initialize
|
41
|
-
self.out_rsrc =
|
39
|
+
def initialize
|
40
|
+
self.out_rsrc = Settings[:out_rsrc]
|
42
41
|
end
|
43
42
|
# clear the list of incoming paths
|
44
43
|
def start! *args
|
@@ -63,17 +62,11 @@ module Gen1HoodEdges
|
|
63
62
|
mid
|
64
63
|
end
|
65
64
|
end
|
66
|
-
|
67
|
-
class Script < Wukong::Script
|
68
|
-
def default_options
|
69
|
-
super.merge :sort_fields => 2, :partition_fields => 1
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
65
|
end
|
74
66
|
|
75
67
|
# Execute the script
|
76
|
-
|
68
|
+
Wukong.run(
|
77
69
|
Gen1HoodEdges::Mapper,
|
78
|
-
Gen1HoodEdges::Reducer
|
79
|
-
|
70
|
+
Gen1HoodEdges::Reducer,
|
71
|
+
:sort_fields => 2, :partition_fields => 1
|
72
|
+
)
|
@@ -2,7 +2,6 @@
|
|
2
2
|
require 'rubygems'
|
3
3
|
$: << File.dirname(__FILE__)+'/../../lib'
|
4
4
|
require 'wukong'
|
5
|
-
require 'wukong/models/graph'; include Wukong::Models
|
6
5
|
|
7
6
|
#
|
8
7
|
# Takes any number of flavors of directed edge with the form
|
@@ -88,17 +87,27 @@ module GenMultiEdge
|
|
88
87
|
yield self.multi_edge
|
89
88
|
end
|
90
89
|
end
|
90
|
+
end
|
91
91
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
class Script < Wukong::Script
|
97
|
-
def default_options
|
98
|
-
super.merge :sort_fields => 2
|
99
|
-
end
|
100
|
-
end
|
92
|
+
Edge = TypedStruct.new(
|
93
|
+
[:src, Integer],
|
94
|
+
[:dest, Integer]
|
95
|
+
)
|
101
96
|
|
102
|
-
|
103
|
-
|
104
|
-
|
97
|
+
MultiEdge = TypedStruct.new(
|
98
|
+
[:src, Integer],
|
99
|
+
[:dest, Integer],
|
100
|
+
[:a_follows_b, Integer],
|
101
|
+
[:b_follows_a, Integer],
|
102
|
+
[:a_replies_b, Integer],
|
103
|
+
[:b_replies_a, Integer],
|
104
|
+
[:a_atsigns_b, Integer],
|
105
|
+
[:b_atsigns_a, Integer],
|
106
|
+
[:a_retweets_b, Integer],
|
107
|
+
[:b_retweets_a, Integer],
|
108
|
+
[:a_favorites_b, Integer],
|
109
|
+
[:b_favorites_a, Integer]
|
110
|
+
)
|
111
|
+
|
112
|
+
# Execute the script
|
113
|
+
Script.new(Mapper, Reducer, :sort_fields => 2).run
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
-
require 'wukong'
|
4
|
-
require 'wukong/streamer/
|
3
|
+
require 'wukong/script'
|
4
|
+
require 'wukong/streamer/list_reducer'
|
5
5
|
|
6
6
|
module PageRank
|
7
7
|
class Script < Wukong::Script
|
@@ -15,10 +15,6 @@ module PageRank
|
|
15
15
|
def map_command
|
16
16
|
%Q{/usr/bin/cut -d"\t" -f2,3}
|
17
17
|
end
|
18
|
-
|
19
|
-
def default_options
|
20
|
-
super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
|
21
|
-
end
|
22
18
|
end
|
23
19
|
|
24
20
|
#
|
@@ -28,18 +24,18 @@ module PageRank
|
|
28
24
|
#
|
29
25
|
class Reducer < Wukong::Streamer::ListReducer
|
30
26
|
def accumulate src, dest
|
31
|
-
|
27
|
+
@values << dest
|
32
28
|
end
|
33
29
|
|
34
30
|
# Emit src, initial pagerank, and flattened dests list
|
35
31
|
def finalize
|
36
|
-
|
37
|
-
yield [key, 1.0,
|
32
|
+
@values = ['dummy'] if @values.blank?
|
33
|
+
yield [key, 1.0, @values.to_a.join(",")]
|
38
34
|
end
|
39
35
|
end
|
40
36
|
|
41
37
|
# Execute the script
|
42
|
-
Script.new(nil, PageRank::Reducer).run
|
38
|
+
Script.new(nil, PageRank::Reducer, :io_sort_record_percent => 0.25).run
|
43
39
|
end
|
44
40
|
|
45
41
|
|
data/examples/sample_records.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../lib'
|
3
2
|
require 'rubygems'
|
4
|
-
require 'wukong'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
Settings.define :sampling_fraction, :type => Float, :required => true, :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
|
5
6
|
|
6
7
|
#
|
7
8
|
# Probabilistically emit some fraction of record/lines
|
@@ -14,30 +15,19 @@ require 'wukong'
|
|
14
15
|
class Mapper < Wukong::Streamer::LineStreamer
|
15
16
|
include Wukong::Streamer::Filter
|
16
17
|
|
17
|
-
#
|
18
|
-
# floating-point number between 0 and 1 giving the fraction of lines to emit:
|
19
|
-
# at sampling_fraction=1 all records are emitted, at 0 none are.
|
20
|
-
#
|
21
|
-
# Takes its value from a mandatory command-line option
|
22
|
-
#
|
23
|
-
def sampling_fraction
|
24
|
-
@sampling_fraction ||= ( options[:sampling_fraction] && options[:sampling_fraction].to_f ) or
|
25
|
-
raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1"
|
26
|
-
end
|
27
|
-
|
28
18
|
#
|
29
19
|
# randomly decide to emit +sampling_fraction+ fraction of lines
|
30
20
|
#
|
31
21
|
def emit? line
|
32
|
-
rand <
|
22
|
+
rand < Settings.sampling_fraction
|
33
23
|
end
|
34
24
|
end
|
35
25
|
|
36
26
|
#
|
37
27
|
# Executes the script
|
38
28
|
#
|
39
|
-
Wukong
|
29
|
+
Wukong.run( Mapper,
|
40
30
|
nil,
|
41
31
|
:reduce_tasks => 0,
|
42
32
|
:reuse_jvms => true
|
43
|
-
)
|
33
|
+
)
|
@@ -1,22 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../lib'
|
3
2
|
require 'rubygems'
|
4
|
-
require 'wukong'
|
3
|
+
require 'wukong/script'
|
5
4
|
|
6
|
-
MONTHS = {
|
7
|
-
'Jan' => '01',
|
8
|
-
'Feb' => '02',
|
9
|
-
'Mar' => '03',
|
10
|
-
'Apr' => '04',
|
11
|
-
'May' => '05',
|
12
|
-
'Jun' => '06',
|
13
|
-
'Jul' => '07',
|
14
|
-
'Aug' => '08',
|
15
|
-
'Sep' => '09',
|
16
|
-
'Oct' => '10',
|
17
|
-
'Nov' => '11',
|
18
|
-
'Dec' => '12',
|
19
|
-
}
|
20
5
|
module ApacheLogParser
|
21
6
|
class Mapper < Wukong::Streamer::LineStreamer
|
22
7
|
|
@@ -40,6 +25,7 @@ module ApacheLogParser
|
|
40
25
|
\s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
|
41
26
|
\s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
42
27
|
\z}x)
|
28
|
+
MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
|
43
29
|
|
44
30
|
# Use the regex to break line into fields
|
45
31
|
# Emit each record as flat line
|
@@ -49,14 +35,13 @@ module ApacheLogParser
|
|
49
35
|
if m
|
50
36
|
(ip, j1, j2,
|
51
37
|
ts_day, ts_mo, ts_year,
|
52
|
-
ts_hour, ts_min, ts_sec,
|
38
|
+
ts_hour, ts_min, ts_sec, tz,
|
53
39
|
http_method, path, protocol,
|
54
40
|
response_code, duration,
|
55
41
|
referer, ua, *cruft) = m.captures
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
yield [:logline, ip, req_date, req_time, http_method, protocol, path, response_code, duration, referer, ua, req_tz]
|
42
|
+
date = [ts_year, MONTHS[ts_mo], ts_day].join("")
|
43
|
+
time = [ts_hour, ts_min, ts_sec].join("")
|
44
|
+
yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
|
60
45
|
else
|
61
46
|
yield [:unparseable, line]
|
62
47
|
end
|
@@ -65,7 +50,7 @@ module ApacheLogParser
|
|
65
50
|
end
|
66
51
|
end
|
67
52
|
|
68
|
-
Wukong
|
53
|
+
Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
|
69
54
|
|
70
55
|
# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
|
71
56
|
|
@@ -1,3 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
#
# One access-log record, with helpers that classify the requested path and
# bucket the request by day and hour.
#
class Logline < Struct.new(
    :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)

  # Classify the request by the extension at the end of +path+:
  # :asset (css, js), :image (png, gif, ico), :page (pl, htm, html, shtm,
  # shtml, asp, jsp, cgi) or :other for anything else.
  def page_type
    case
    when path =~ /\.(css|js)$/                 then :asset
    when path =~ /\.(png|gif|ico)$/            then :image
    when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
    else                                            :other
    end
  end

  # True when the request is classified as :page.
  def is_page?
    page_type == :page
  end

  # The date followed by the first two characters of the time.
  # Uses this record's own date and time fields; the earlier `visit.date`
  # form referred to a name that does not exist inside the struct.
  def day_hr
    date + time[0..1]
  end
end
|
25
|
+
|
26
|
+
|
1
27
|
#
|
2
28
|
# Group all visitors, and then troll through all the pages they've visited
|
3
29
|
# breaking each into distinct visits (where more than an [hour|day|whatever]
|
@@ -12,6 +38,11 @@
|
|
12
38
|
#
|
13
39
|
# where the partition key is visitor_id, and we sort by visitor_id and datetime.
|
14
40
|
#
|
41
|
+
#
# For every visit record, emit its ip, its day_hr and its path.
#
class VisitorDatePath < Wukong::Streamer::StructStreamer
  # Any arguments after +visit+ are accepted and ignored.
  def process(visit, *args)
    fields = [visit.ip, visit.day_hr, visit.path]
    yield fields
  end
end
|
15
46
|
|
16
47
|
#
|
17
48
|
# Reducer:
|
@@ -34,3 +65,11 @@
|
|
34
65
|
# page_trails <pagen> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
|
35
66
|
#
|
36
67
|
# to discover all trails passing through a given page.
|
68
|
+
#
# Groups records by [ip, day_hr] and emits ip, day_hr and path for a group.
#
# NOTE(review): a class named VisitorDatePath with a different superclass
# (StructStreamer) is declared earlier in this file -- Ruby rejects reopening
# a class under a new superclass, so one of the two likely needs another
# name. Confirm which name callers expect before renaming.
#
class VisitorDatePath < Wukong::Streamer::Reducer
  # The grouping key is the first two fields; path and any extras are ignored.
  def get_key(ip, day_hr, path, *args)
    group_key = [ip, day_hr]
    group_key
  end

  # Any arguments after +visit+ are accepted and ignored.
  def process_group(visit, *args)
    fields = [visit.ip, visit.day_hr, visit.path]
    yield fields
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
#
# One access-log record, with helpers that classify the requested path.
#
class Logline < Struct.new(
    :ip, :date, :time, :http_method, :protocol, :path,
    :response_code, :duration, :referer, :ua, :tz)

  # Classify the request by the extension at the end of +path+:
  # stylesheets and scripts are :asset, png/gif/ico are :image, the listed
  # page extensions are :page, and everything else is :other.
  def page_type
    if    path =~ /\.(css|js)$/                 then :asset
    elsif path =~ /\.(png|gif|ico)$/            then :image
    elsif path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
    else                                             :other
    end
  end

  # Whether the request is classified as :page.
  def is_page?
    :page == page_type
  end
end
|
21
|
+
|
22
|
+
class PageFilter < Wukong::Streamer::StructStreamer
|
23
|
+
def process visit, *args
|
24
|
+
yield visit.ua if visit.
|
25
|
+
end
|
26
|
+
end
|
27
|
+
Wukong.run(PageFilter)
|
data/examples/size.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
-
require 'wukong'
|
3
|
+
require 'wukong/script'
|
4
4
|
|
5
5
|
module Size
|
6
6
|
#
|
@@ -56,5 +56,6 @@ end
|
|
56
56
|
# Execute the script
|
57
57
|
Size::Script.new(
|
58
58
|
nil,
|
59
|
-
Size::Reducer
|
59
|
+
Size::Reducer,
|
60
|
+
:reduce_tasks => 1
|
60
61
|
).run
|
@@ -1,10 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require '
|
4
|
-
require 'wukong'
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong/script'
|
5
4
|
require 'wukong/streamer/count_keys'
|
6
5
|
|
7
|
-
|
8
6
|
#
|
9
7
|
# Ch3ck out dis moist azz code bitches!!
|
10
8
|
#
|
@@ -70,14 +68,14 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
|
|
70
68
|
table << "TRSTRANK_TABLE = " << count_bin.inspect
|
71
69
|
table.close
|
72
70
|
end
|
73
|
-
|
71
|
+
|
74
72
|
#
|
75
|
-
# Return percentile of a given trstrank for a given follower bracket
|
73
|
+
# Return percentile of a given trstrank for a given follower bracket
|
76
74
|
#
|
77
75
|
def percentile bin, rank
|
78
|
-
((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
|
76
|
+
((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
|
79
77
|
end
|
80
|
-
|
78
|
+
|
81
79
|
#
|
82
80
|
# Return the count of values less than rank
|
83
81
|
#
|
@@ -119,7 +117,7 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
|
|
119
117
|
big_list.uniq.sort{|x,y| x.first <=> y.first}
|
120
118
|
end
|
121
119
|
|
122
|
-
|
120
|
+
|
123
121
|
#
|
124
122
|
# Nothing to see here, move along
|
125
123
|
#
|
@@ -132,11 +130,11 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
|
|
132
130
|
num.times do |i|
|
133
131
|
x = pair1.first + (i+1).to_f*dx
|
134
132
|
y = m*x + b
|
135
|
-
points << [x,y]
|
133
|
+
points << [x,y]
|
136
134
|
end
|
137
135
|
points # return an array of pairs
|
138
136
|
end
|
139
|
-
|
137
|
+
|
140
138
|
end
|
141
139
|
|
142
140
|
Wukong::Script.new(Mapper,Reducer).run
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'wukong'
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong/script'
|
4
4
|
|
5
5
|
# Run as (local mode)
|
6
6
|
#
|
@@ -15,14 +15,14 @@ require 'wukong'
|
|
15
15
|
# cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
|
16
16
|
#
|
17
17
|
|
18
|
-
|
19
|
-
|
20
|
-
# and emits the whole record if the field matches
|
21
|
-
#
|
22
|
-
class GrepMapper < Wukong::Streamer::RecordStreamer
|
23
|
-
|
18
|
+
class Mapper < LineStreamer
|
19
|
+
include Filter
|
24
20
|
MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
|
25
21
|
|
22
|
+
#
|
23
|
+
# A very simple mapper -- looks for a regex match in one field,
|
24
|
+
# and emits the whole record if the field matches
|
25
|
+
#
|
26
26
|
#
|
27
27
|
# Given a series of records like:
|
28
28
|
#
|
@@ -31,13 +31,10 @@ class GrepMapper < Wukong::Streamer::RecordStreamer
|
|
31
31
|
#
|
32
32
|
# emits only the lines matching that regex
|
33
33
|
#
|
34
|
-
def
|
35
|
-
|
34
|
+
def emit? line
|
35
|
+
MATCHER.match line
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
39
|
# Execute the script
|
40
|
-
Wukong
|
41
|
-
GrepMapper,
|
42
|
-
nil
|
43
|
-
).run
|
40
|
+
Wukong.run(Mapper)
|