wukong 1.5.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.textile +32 -0
- data/README.textile +58 -12
- data/TODO.textile +0 -8
- data/bin/hdp-bzip +12 -17
- data/bin/hdp-kill-task +1 -1
- data/bin/hdp-sort +7 -7
- data/bin/hdp-stream +7 -7
- data/bin/hdp-stream-flat +2 -3
- data/bin/setcat +11 -0
- data/bin/uniq-ord +59 -0
- data/examples/corpus/bucket_counter.rb +47 -0
- data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
- data/examples/corpus/sentence_coocurrence.rb +70 -0
- data/examples/emr/README.textile +110 -0
- data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
- data/examples/emr/elastic_mapreduce_example.rb +2 -2
- data/examples/ignore_me/counting.rb +56 -0
- data/examples/ignore_me/grouper.rb +71 -0
- data/examples/network_graph/adjacency_list.rb +2 -2
- data/examples/network_graph/breadth_first_search.rb +14 -21
- data/examples/network_graph/gen_multi_edge.rb +22 -13
- data/examples/pagerank/pagerank.rb +1 -1
- data/examples/pagerank/pagerank_initialize.rb +6 -10
- data/examples/sample_records.rb +6 -16
- data/examples/server_logs/apache_log_parser.rb +7 -22
- data/examples/server_logs/breadcrumbs.rb +39 -0
- data/examples/server_logs/logline.rb +27 -0
- data/examples/size.rb +3 -2
- data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
- data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
- data/examples/stupidly_simple_filter.rb +11 -14
- data/examples/word_count.rb +16 -36
- data/lib/wukong/and_pig.rb +2 -15
- data/lib/wukong/logger.rb +7 -28
- data/lib/wukong/periodic_monitor.rb +24 -9
- data/lib/wukong/script/emr_command.rb +1 -0
- data/lib/wukong/script/hadoop_command.rb +31 -29
- data/lib/wukong/script.rb +19 -14
- data/lib/wukong/store/cassandra_model.rb +2 -1
- data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
- data/lib/wukong/streamer/base.rb +44 -3
- data/lib/wukong/streamer/counting_reducer.rb +12 -12
- data/lib/wukong/streamer/filter.rb +2 -2
- data/lib/wukong/streamer/list_reducer.rb +3 -3
- data/lib/wukong/streamer/reducer.rb +11 -0
- data/lib/wukong/streamer.rb +7 -3
- data/lib/wukong.rb +7 -3
- data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
- data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
- data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
- data/wukong.gemspec +257 -285
- metadata +45 -62
- data/examples/cassandra_streaming/avromapper.rb +0 -85
- data/examples/cassandra_streaming/cassandra.avpr +0 -468
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
- data/examples/cassandra_streaming/catter.sh +0 -45
- data/examples/cassandra_streaming/client_schema.avpr +0 -211
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +0 -1
- data/examples/cassandra_streaming/pyreduce.sh +0 -1
- data/examples/cassandra_streaming/smutation.avpr +0 -188
- data/examples/cassandra_streaming/streamer.sh +0 -51
- data/examples/cassandra_streaming/struct_loader.rb +0 -24
- data/examples/count_keys.rb +0 -56
- data/examples/count_keys_at_mapper.rb +0 -57
- data/examples/emr/README-elastic_map_reduce.textile +0 -26
- data/examples/keystore/cassandra_batch_test.rb +0 -41
- data/examples/keystore/conditional_outputter_example.rb +0 -70
- data/examples/store/chunked_store_example.rb +0 -18
- data/lib/wukong/dfs.rb +0 -81
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
- data/lib/wukong/keystore/redis_db.rb +0 -24
- data/lib/wukong/keystore/tyrant_db.rb +0 -137
- data/lib/wukong/keystore/tyrant_notes.textile +0 -145
- data/lib/wukong/models/graph.rb +0 -25
- data/lib/wukong/monitor/chunked_store.rb +0 -23
- data/lib/wukong/monitor/periodic_logger.rb +0 -34
- data/lib/wukong/monitor/periodic_monitor.rb +0 -70
- data/lib/wukong/monitor.rb +0 -7
- data/lib/wukong/rdf.rb +0 -104
- data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
- data/lib/wukong/streamer/count_keys.rb +0 -30
- data/lib/wukong/streamer/count_lines.rb +0 -26
- data/lib/wukong/streamer/em_streamer.rb +0 -7
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
- data/lib/wukong/wukong_class.rb +0 -21
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong'
|
5
|
+
|
6
|
+
require 'bloomfilter-rb'
|
7
|
+
|
8
|
+
SIZE = 2**24
|
9
|
+
|
10
|
+
class BucketCounter
|
11
|
+
def initialize(opts = {})
|
12
|
+
@opts = {
|
13
|
+
:size => 100,
|
14
|
+
:server => {}
|
15
|
+
}.merge opts
|
16
|
+
@db = ::Redis.new(@opts[:server])
|
17
|
+
@size = opts[:size]
|
18
|
+
end
|
19
|
+
|
20
|
+
def key_for val
|
21
|
+
(val.hash % @size)
|
22
|
+
end
|
23
|
+
|
24
|
+
def insert(val)
|
25
|
+
@db.incr(key_for(val))
|
26
|
+
end
|
27
|
+
alias :<< :insert
|
28
|
+
|
29
|
+
def delete(val)
|
30
|
+
if @db.decr(key_for(val)).to_i <= 0
|
31
|
+
@db.del(key_for(val))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def [](val)
|
36
|
+
@db.get(key_for(val)).to_i
|
37
|
+
end
|
38
|
+
|
39
|
+
def clear
|
40
|
+
@db.flushdb
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
bf = BucketCounter.new(:size => 1_000, :server => {:host => 'localhost'})
|
45
|
+
bf.clear
|
46
|
+
counts = Hash.new{|h,k| h[k] = 0 }
|
47
|
+
|
48
|
+
doc = File.read(__FILE__)
|
49
|
+
doc.split(/\W+/).each do |word|
|
50
|
+
counts[word] += 1
|
51
|
+
bf << word
|
52
|
+
end
|
53
|
+
|
54
|
+
counts.keys.sort.each do |word|
|
55
|
+
puts [ bf[word] - counts[word], bf[word], counts[word], word.hash % SIZE, word ].join("\t")
|
56
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'backports'
|
4
|
+
require 'backports/1.8.8'
|
5
|
+
require 'extlib'
|
6
|
+
|
7
|
+
class Source
|
8
|
+
# include Enumerable
|
9
|
+
attr_reader :streamer
|
10
|
+
|
11
|
+
def recordize line
|
12
|
+
# line.strip.split("\t")
|
13
|
+
[line[0..5]]
|
14
|
+
end
|
15
|
+
|
16
|
+
def each *args
|
17
|
+
$stdin.each(*args) do |raw_record|
|
18
|
+
record = recordize(raw_record)
|
19
|
+
next if record.blank?
|
20
|
+
yield *record
|
21
|
+
break if raw_record =~ /end/
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
# def process_group group
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
|
30
|
+
class Streamer
|
31
|
+
|
32
|
+
def recordize line
|
33
|
+
[line[0..5]]
|
34
|
+
end
|
35
|
+
|
36
|
+
def each_group
|
37
|
+
while not $stdin.eof? do
|
38
|
+
Enumerator.new do |yielder|
|
39
|
+
$stdin.each do |line|
|
40
|
+
yield yielder
|
41
|
+
p yielder
|
42
|
+
break if line =~ /end/
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
foo = Streamer.new
|
50
|
+
|
51
|
+
foo.each_group do |group|
|
52
|
+
puts "hi"
|
53
|
+
p group.each do |line|
|
54
|
+
p line.reverse
|
55
|
+
end
|
56
|
+
# .map do |record|
|
57
|
+
# 1
|
58
|
+
# end
|
59
|
+
end
|
60
|
+
|
61
|
+
|
62
|
+
# i = 0
|
63
|
+
# # s = source.new(Streamer.new)
|
64
|
+
# $stdin.each do
|
65
|
+
# process_group do |output|
|
66
|
+
# puts output
|
67
|
+
# end
|
68
|
+
# $stderr.puts [Time.now, i] if (i += 1) % 10 == 0
|
69
|
+
# end
|
70
|
+
|
71
|
+
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: <<
|
3
|
-
require 'wukong'
|
2
|
+
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
+
require 'wukong/script'
|
4
4
|
|
5
5
|
#
|
6
6
|
# Use this script to do a Breadth-First Search (BFS) of a graph.
|
@@ -9,19 +9,18 @@ require 'wukong'
|
|
9
9
|
# ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
|
10
10
|
#
|
11
11
|
# For example, given an edge list in the file '1path.tsv' that looks like
|
12
|
-
# 1path
|
13
|
-
# 1path n1
|
12
|
+
# 1path n1 n2
|
13
|
+
# 1path n1 n3
|
14
14
|
# ... and so forth ...
|
15
15
|
# you can run
|
16
16
|
# for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
|
17
17
|
# to do a 9-deep breadth-first search.
|
18
18
|
#
|
19
19
|
module Gen1HoodEdges
|
20
|
-
class Mapper < Wukong::Streamer::
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
self.tail = options[:tail]
|
20
|
+
class Mapper < Wukong::Streamer::RecordStreamer
|
21
|
+
def initialize
|
22
|
+
@head = Settings[:head]
|
23
|
+
@tail = Settings[:tail]
|
25
24
|
end
|
26
25
|
def process rsrc, *nodes
|
27
26
|
yield [ nodes.last, 'i', nodes[0..-2] ] if (rsrc == self.head)
|
@@ -37,8 +36,8 @@ module Gen1HoodEdges
|
|
37
36
|
#
|
38
37
|
class Reducer < Wukong::Streamer::AccumulatingReducer
|
39
38
|
attr_accessor :paths_in, :out_rsrc
|
40
|
-
def initialize
|
41
|
-
self.out_rsrc =
|
39
|
+
def initialize
|
40
|
+
self.out_rsrc = Settings[:out_rsrc]
|
42
41
|
end
|
43
42
|
# clear the list of incoming paths
|
44
43
|
def start! *args
|
@@ -63,17 +62,11 @@ module Gen1HoodEdges
|
|
63
62
|
mid
|
64
63
|
end
|
65
64
|
end
|
66
|
-
|
67
|
-
class Script < Wukong::Script
|
68
|
-
def default_options
|
69
|
-
super.merge :sort_fields => 2, :partition_fields => 1
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
65
|
end
|
74
66
|
|
75
67
|
# Execute the script
|
76
|
-
|
68
|
+
Wukong.run(
|
77
69
|
Gen1HoodEdges::Mapper,
|
78
|
-
Gen1HoodEdges::Reducer
|
79
|
-
|
70
|
+
Gen1HoodEdges::Reducer,
|
71
|
+
:sort_fields => 2, :partition_fields => 1
|
72
|
+
)
|
@@ -2,7 +2,6 @@
|
|
2
2
|
require 'rubygems'
|
3
3
|
$: << File.dirname(__FILE__)+'/../../lib'
|
4
4
|
require 'wukong'
|
5
|
-
require 'wukong/models/graph'; include Wukong::Models
|
6
5
|
|
7
6
|
#
|
8
7
|
# Takes any number of flavors of directed edge with the form
|
@@ -88,17 +87,27 @@ module GenMultiEdge
|
|
88
87
|
yield self.multi_edge
|
89
88
|
end
|
90
89
|
end
|
90
|
+
end
|
91
91
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
class Script < Wukong::Script
|
97
|
-
def default_options
|
98
|
-
super.merge :sort_fields => 2
|
99
|
-
end
|
100
|
-
end
|
92
|
+
Edge = TypedStruct.new(
|
93
|
+
[:src, Integer],
|
94
|
+
[:dest, Integer]
|
95
|
+
)
|
101
96
|
|
102
|
-
|
103
|
-
|
104
|
-
|
97
|
+
MultiEdge = TypedStruct.new(
|
98
|
+
[:src, Integer],
|
99
|
+
[:dest, Integer],
|
100
|
+
[:a_follows_b, Integer],
|
101
|
+
[:b_follows_a, Integer],
|
102
|
+
[:a_replies_b, Integer],
|
103
|
+
[:b_replies_a, Integer],
|
104
|
+
[:a_atsigns_b, Integer],
|
105
|
+
[:b_atsigns_a, Integer],
|
106
|
+
[:a_retweets_b, Integer],
|
107
|
+
[:b_retweets_a, Integer],
|
108
|
+
[:a_favorites_b, Integer],
|
109
|
+
[:b_favorites_a, Integer]
|
110
|
+
)
|
111
|
+
|
112
|
+
# Execute the script
|
113
|
+
Script.new(Mapper, Reducer, :sort_fields => 2).run
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
-
require 'wukong'
|
4
|
-
require 'wukong/streamer/
|
3
|
+
require 'wukong/script'
|
4
|
+
require 'wukong/streamer/list_reducer'
|
5
5
|
|
6
6
|
module PageRank
|
7
7
|
class Script < Wukong::Script
|
@@ -15,10 +15,6 @@ module PageRank
|
|
15
15
|
def map_command
|
16
16
|
%Q{/usr/bin/cut -d"\t" -f2,3}
|
17
17
|
end
|
18
|
-
|
19
|
-
def default_options
|
20
|
-
super.merge :extra_args => ' -jobconf io.sort.record.percent=0.25 '
|
21
|
-
end
|
22
18
|
end
|
23
19
|
|
24
20
|
#
|
@@ -28,18 +24,18 @@ module PageRank
|
|
28
24
|
#
|
29
25
|
class Reducer < Wukong::Streamer::ListReducer
|
30
26
|
def accumulate src, dest
|
31
|
-
|
27
|
+
@values << dest
|
32
28
|
end
|
33
29
|
|
34
30
|
# Emit src, initial pagerank, and flattened dests list
|
35
31
|
def finalize
|
36
|
-
|
37
|
-
yield [key, 1.0,
|
32
|
+
@values = ['dummy'] if @values.blank?
|
33
|
+
yield [key, 1.0, @values.to_a.join(",")]
|
38
34
|
end
|
39
35
|
end
|
40
36
|
|
41
37
|
# Execute the script
|
42
|
-
Script.new(nil, PageRank::Reducer).run
|
38
|
+
Script.new(nil, PageRank::Reducer, :io_sort_record_percent => 0.25).run
|
43
39
|
end
|
44
40
|
|
45
41
|
|
data/examples/sample_records.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../lib'
|
3
2
|
require 'rubygems'
|
4
|
-
require 'wukong'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
Settings.define :sampling_fraction, :type => Float, :required => true, :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
|
5
6
|
|
6
7
|
#
|
7
8
|
# Probabilistically emit some fraction of record/lines
|
@@ -14,30 +15,19 @@ require 'wukong'
|
|
14
15
|
class Mapper < Wukong::Streamer::LineStreamer
|
15
16
|
include Wukong::Streamer::Filter
|
16
17
|
|
17
|
-
#
|
18
|
-
# floating-point number between 0 and 1 giving the fraction of lines to emit:
|
19
|
-
# at sampling_fraction=1 all records are emitted, at 0 none are.
|
20
|
-
#
|
21
|
-
# Takes its value from a mandatory command-line option
|
22
|
-
#
|
23
|
-
def sampling_fraction
|
24
|
-
@sampling_fraction ||= ( options[:sampling_fraction] && options[:sampling_fraction].to_f ) or
|
25
|
-
raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1"
|
26
|
-
end
|
27
|
-
|
28
18
|
#
|
29
19
|
# randomly decide to emit +sampling_fraction+ fraction of lines
|
30
20
|
#
|
31
21
|
def emit? line
|
32
|
-
rand <
|
22
|
+
rand < Settings.sampling_fraction
|
33
23
|
end
|
34
24
|
end
|
35
25
|
|
36
26
|
#
|
37
27
|
# Executes the script
|
38
28
|
#
|
39
|
-
Wukong
|
29
|
+
Wukong.run( Mapper,
|
40
30
|
nil,
|
41
31
|
:reduce_tasks => 0,
|
42
32
|
:reuse_jvms => true
|
43
|
-
)
|
33
|
+
)
|
@@ -1,22 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../lib'
|
3
2
|
require 'rubygems'
|
4
|
-
require 'wukong'
|
3
|
+
require 'wukong/script'
|
5
4
|
|
6
|
-
MONTHS = {
|
7
|
-
'Jan' => '01',
|
8
|
-
'Feb' => '02',
|
9
|
-
'Mar' => '03',
|
10
|
-
'Apr' => '04',
|
11
|
-
'May' => '05',
|
12
|
-
'Jun' => '06',
|
13
|
-
'Jul' => '07',
|
14
|
-
'Aug' => '08',
|
15
|
-
'Sep' => '09',
|
16
|
-
'Oct' => '10',
|
17
|
-
'Nov' => '11',
|
18
|
-
'Dec' => '12',
|
19
|
-
}
|
20
5
|
module ApacheLogParser
|
21
6
|
class Mapper < Wukong::Streamer::LineStreamer
|
22
7
|
|
@@ -40,6 +25,7 @@ module ApacheLogParser
|
|
40
25
|
\s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
|
41
26
|
\s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
42
27
|
\z}x)
|
28
|
+
MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
|
43
29
|
|
44
30
|
# Use the regex to break line into fields
|
45
31
|
# Emit each record as flat line
|
@@ -49,14 +35,13 @@ module ApacheLogParser
|
|
49
35
|
if m
|
50
36
|
(ip, j1, j2,
|
51
37
|
ts_day, ts_mo, ts_year,
|
52
|
-
ts_hour, ts_min, ts_sec,
|
38
|
+
ts_hour, ts_min, ts_sec, tz,
|
53
39
|
http_method, path, protocol,
|
54
40
|
response_code, duration,
|
55
41
|
referer, ua, *cruft) = m.captures
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
yield [:logline, ip, req_date, req_time, http_method, protocol, path, response_code, duration, referer, ua, req_tz]
|
42
|
+
date = [ts_year, MONTHS[ts_mo], ts_day].join("")
|
43
|
+
time = [ts_hour, ts_min, ts_sec].join("")
|
44
|
+
yield [:logline, ip, date, time, http_method, protocol, path, response_code, duration, referer, ua, tz]
|
60
45
|
else
|
61
46
|
yield [:unparseable, line]
|
62
47
|
end
|
@@ -65,7 +50,7 @@ module ApacheLogParser
|
|
65
50
|
end
|
66
51
|
end
|
67
52
|
|
68
|
-
Wukong
|
53
|
+
Wukong.run(ApacheLogParser::Mapper, nil, :sort_fields => 7)
|
69
54
|
|
70
55
|
# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
|
71
56
|
|
@@ -1,3 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
class Logline < Struct.new(
|
6
|
+
:ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
|
7
|
+
|
8
|
+
def page_type
|
9
|
+
case
|
10
|
+
when path =~ /\.(css|js)$/ then :asset
|
11
|
+
when path =~ /\.(png|gif|ico)$/ then :image
|
12
|
+
when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
|
13
|
+
else :other
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def is_page?
|
18
|
+
page_type == :page
|
19
|
+
end
|
20
|
+
|
21
|
+
def day_hr
|
22
|
+
visit.date + visit.time[0..1]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
|
1
27
|
#
|
2
28
|
# Group all visitors, and then troll through all the pages they've visited
|
3
29
|
# breaking each into distinct visits (where more than an [hour|day|whatever]
|
@@ -12,6 +38,11 @@
|
|
12
38
|
#
|
13
39
|
# where the partition key is visitor_id, and we sort by visitor_id and datetime.
|
14
40
|
#
|
41
|
+
class VisitorDatePath < Wukong::Streamer::StructStreamer
|
42
|
+
def process visit, *args
|
43
|
+
yield [visit.ip, visit.day_hr, visit.path]
|
44
|
+
end
|
45
|
+
end
|
15
46
|
|
16
47
|
#
|
17
48
|
# Reducer:
|
@@ -34,3 +65,11 @@
|
|
34
65
|
# page_trails <pagen> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
|
35
66
|
#
|
36
67
|
# to discover all trails passing through a given page.
|
68
|
+
class VisitorDatePath < Wukong::Streamer::Reducer
|
69
|
+
def get_key ip, day_hr, path, *args
|
70
|
+
[ip, day_hr]
|
71
|
+
end
|
72
|
+
def process_group visit, *args
|
73
|
+
yield [visit.ip, visit.day_hr, visit.path]
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong/script'
|
4
|
+
|
5
|
+
class Logline < Struct.new(
|
6
|
+
:ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)
|
7
|
+
|
8
|
+
def page_type
|
9
|
+
case
|
10
|
+
when path =~ /\.(css|js)$/ then :asset
|
11
|
+
when path =~ /\.(png|gif|ico)$/ then :image
|
12
|
+
when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
|
13
|
+
else :other
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def is_page?
|
18
|
+
page_type == :page
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
class PageFilter < Wukong::Streamer::StructStreamer
|
23
|
+
def process visit, *args
|
24
|
+
yield visit.ua if visit.
|
25
|
+
end
|
26
|
+
end
|
27
|
+
Wukong.run(PageFilter)
|
data/examples/size.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
$: << File.dirname(__FILE__)+'/../lib'
|
3
|
-
require 'wukong'
|
3
|
+
require 'wukong/script'
|
4
4
|
|
5
5
|
module Size
|
6
6
|
#
|
@@ -56,5 +56,6 @@ end
|
|
56
56
|
# Execute the script
|
57
57
|
Size::Script.new(
|
58
58
|
nil,
|
59
|
-
Size::Reducer
|
59
|
+
Size::Reducer,
|
60
|
+
:reduce_tasks => 1
|
60
61
|
).run
|
@@ -1,10 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require '
|
4
|
-
require 'wukong'
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong/script'
|
5
4
|
require 'wukong/streamer/count_keys'
|
6
5
|
|
7
|
-
|
8
6
|
#
|
9
7
|
# Ch3ck out dis moist azz code bitches!!
|
10
8
|
#
|
@@ -70,14 +68,14 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
|
|
70
68
|
table << "TRSTRANK_TABLE = " << count_bin.inspect
|
71
69
|
table.close
|
72
70
|
end
|
73
|
-
|
71
|
+
|
74
72
|
#
|
75
|
-
# Return percentile of a given trstrank for a given follower bracket
|
73
|
+
# Return percentile of a given trstrank for a given follower bracket
|
76
74
|
#
|
77
75
|
def percentile bin, rank
|
78
|
-
((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
|
76
|
+
((count_less_than(bin,rank) + 0.5*frequency_of(bin,rank))/ total_num(bin) )*100.0
|
79
77
|
end
|
80
|
-
|
78
|
+
|
81
79
|
#
|
82
80
|
# Return the count of values less than rank
|
83
81
|
#
|
@@ -119,7 +117,7 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
|
|
119
117
|
big_list.uniq.sort{|x,y| x.first <=> y.first}
|
120
118
|
end
|
121
119
|
|
122
|
-
|
120
|
+
|
123
121
|
#
|
124
122
|
# Nothing to see here, move along
|
125
123
|
#
|
@@ -132,11 +130,11 @@ class Reducer < Wukong::Streamer::AccumulatingReducer
|
|
132
130
|
num.times do |i|
|
133
131
|
x = pair1.first + (i+1).to_f*dx
|
134
132
|
y = m*x + b
|
135
|
-
points << [x,y]
|
133
|
+
points << [x,y]
|
136
134
|
end
|
137
135
|
points # return an array of pairs
|
138
136
|
end
|
139
|
-
|
137
|
+
|
140
138
|
end
|
141
139
|
|
142
140
|
Wukong::Script.new(Mapper,Reducer).run
|
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'wukong'
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
3
|
+
require 'wukong/script'
|
4
4
|
|
5
5
|
# Run as (local mode)
|
6
6
|
#
|
@@ -15,14 +15,14 @@ require 'wukong'
|
|
15
15
|
# cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
|
16
16
|
#
|
17
17
|
|
18
|
-
|
19
|
-
|
20
|
-
# and emits the whole record if the field matches
|
21
|
-
#
|
22
|
-
class GrepMapper < Wukong::Streamer::RecordStreamer
|
23
|
-
|
18
|
+
class Mapper < LineStreamer
|
19
|
+
include Filter
|
24
20
|
MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
|
25
21
|
|
22
|
+
#
|
23
|
+
# A very simple mapper -- looks for a regex match in one field,
|
24
|
+
# and emits the whole record if the field matches
|
25
|
+
#
|
26
26
|
#
|
27
27
|
# Given a series of records like:
|
28
28
|
#
|
@@ -31,13 +31,10 @@ class GrepMapper < Wukong::Streamer::RecordStreamer
|
|
31
31
|
#
|
32
32
|
# emits only the lines matching that regex
|
33
33
|
#
|
34
|
-
def
|
35
|
-
|
34
|
+
def emit? line
|
35
|
+
MATCHER.match line
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
39
|
# Execute the script
|
40
|
-
Wukong
|
41
|
-
GrepMapper,
|
42
|
-
nil
|
43
|
-
).run
|
40
|
+
Wukong.run(Mapper)
|