wukong 3.0.0.pre3 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/README.md +689 -50
- data/bin/wu-local +1 -74
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/loadable.rb +2 -0
- data/examples/string_reverser.rb +7 -0
- data/lib/hanuman/stage.rb +2 -2
- data/lib/wukong.rb +21 -10
- data/lib/wukong/dataflow.rb +2 -5
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +11 -1
- data/lib/wukong/local.rb +40 -0
- data/lib/wukong/local/event_machine_driver.rb +27 -0
- data/lib/wukong/local/runner.rb +98 -0
- data/lib/wukong/local/stdio_driver.rb +44 -0
- data/lib/wukong/local/tcp_driver.rb +47 -0
- data/lib/wukong/logger.rb +16 -7
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +57 -15
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +151 -128
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/spec_helpers.rb +4 -12
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/{integration_driver_matchers.rb → integration_tests/integration_test_matchers.rb} +28 -62
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +19 -10
- data/lib/wukong/spec_helpers/unit_tests.rb +134 -0
- data/lib/wukong/spec_helpers/{processor_methods.rb → unit_tests/unit_test_driver.rb} +42 -8
- data/lib/wukong/spec_helpers/{spec_driver_matchers.rb → unit_tests/unit_test_matchers.rb} +6 -32
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +54 -0
- data/lib/wukong/version.rb +1 -1
- data/lib/wukong/widget/filters.rb +134 -8
- data/lib/wukong/widget/processors.rb +64 -5
- data/lib/wukong/widget/reducers/bin.rb +68 -18
- data/lib/wukong/widget/reducers/count.rb +12 -0
- data/lib/wukong/widget/reducers/group.rb +48 -5
- data/lib/wukong/widget/reducers/group_concat.rb +30 -2
- data/lib/wukong/widget/reducers/moments.rb +4 -4
- data/lib/wukong/widget/reducers/sort.rb +53 -3
- data/lib/wukong/widget/serializers.rb +37 -12
- data/lib/wukong/widget/utils.rb +1 -1
- data/spec/spec_helper.rb +20 -2
- data/spec/wukong/driver_spec.rb +2 -0
- data/spec/wukong/local/runner_spec.rb +40 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/processor_spec.rb +22 -0
- data/spec/wukong/runner_spec.rb +128 -8
- data/spec/wukong/widget/filters_spec.rb +28 -10
- data/spec/wukong/widget/processors_spec.rb +5 -5
- data/spec/wukong/widget/reducers/bin_spec.rb +14 -14
- data/spec/wukong/widget/reducers/count_spec.rb +1 -1
- data/spec/wukong/widget/reducers/group_spec.rb +7 -6
- data/spec/wukong/widget/reducers/moments_spec.rb +2 -2
- data/spec/wukong/widget/reducers/sort_spec.rb +1 -1
- data/spec/wukong/widget/serializers_spec.rb +84 -88
- data/spec/wukong/wu-local_spec.rb +109 -0
- metadata +43 -20
- data/bin/wu-server +0 -70
- data/lib/wukong/boot.rb +0 -96
- data/lib/wukong/configuration.rb +0 -8
- data/lib/wukong/emitter.rb +0 -22
- data/lib/wukong/server.rb +0 -119
- data/lib/wukong/spec_helpers/integration_driver.rb +0 -157
- data/lib/wukong/spec_helpers/processor_helpers.rb +0 -89
- data/lib/wukong/spec_helpers/spec_driver.rb +0 -28
- data/spec/wukong/local_runner_spec.rb +0 -31
- data/spec/wukong/wu_local_spec.rb +0 -125
data/lib/wukong/version.rb
CHANGED
@@ -5,6 +5,10 @@ module Wukong
|
|
5
5
|
# criterion.
|
6
6
|
class Filter < Processor
|
7
7
|
|
8
|
+
description <<EOF
|
9
|
+
A processor which filters input records according to some criterion.
|
10
|
+
EOF
|
11
|
+
|
8
12
|
# Process a `record` by yielding it only if it should be
|
9
13
|
# selected by this filter.
|
10
14
|
#
|
@@ -61,6 +65,7 @@ module Wukong
|
|
61
65
|
# @see Filter
|
62
66
|
# @see Null
|
63
67
|
class Identity < Filter
|
68
|
+
description "This processor passes all records unmodified."
|
64
69
|
register
|
65
70
|
end
|
66
71
|
|
@@ -84,6 +89,9 @@ module Wukong
|
|
84
89
|
# @see Filter
|
85
90
|
# @see All
|
86
91
|
class Null < Filter
|
92
|
+
|
93
|
+
description "This processor acts as a filter which passes no records at all."
|
94
|
+
|
87
95
|
# Prevents any records from passing because it always returns
|
88
96
|
# `false`.
|
89
97
|
#
|
@@ -117,8 +125,21 @@ module Wukong
|
|
117
125
|
# @see NotRegexpFilter
|
118
126
|
class RegexpFilter < Filter
|
119
127
|
|
120
|
-
|
121
|
-
|
128
|
+
description <<EOF
|
129
|
+
This processor only passes records which match against a given regular
|
130
|
+
expression.
|
131
|
+
|
132
|
+
$ cat input
|
133
|
+
apple
|
134
|
+
banana
|
135
|
+
cat
|
136
|
+
$ cat input | wu-local regexp --match='^a'
|
137
|
+
apple
|
138
|
+
|
139
|
+
If no --match argument is given, all records will be passed.
|
140
|
+
EOF
|
141
|
+
|
142
|
+
field :match, Regexp, :doc => "Regular expression to match against"
|
122
143
|
|
123
144
|
# Selects a `record` only if it matches this widget's `match`
|
124
145
|
# field.
|
@@ -154,6 +175,22 @@ module Wukong
|
|
154
175
|
# @see Filter
|
155
176
|
# @see RegexpFilter
|
156
177
|
class NotRegexpFilter < RegexpFilter
|
178
|
+
|
179
|
+
description <<EOF
|
180
|
+
This processor only passes records which fail to match against a given
|
181
|
+
regular expression.
|
182
|
+
|
183
|
+
$ cat input
|
184
|
+
apple
|
185
|
+
banana
|
186
|
+
cat
|
187
|
+
$ cat input | wu-local not_regexp --match='^a'
|
188
|
+
banana
|
189
|
+
cat
|
190
|
+
|
191
|
+
If no --match argument is given, all records will be passed.
|
192
|
+
EOF
|
193
|
+
|
157
194
|
# Select a `record` only if it <b>doesn't</b> match this
|
158
195
|
# widget's `match` field.
|
159
196
|
#
|
@@ -189,8 +226,22 @@ module Wukong
|
|
189
226
|
# @see Filter
|
190
227
|
class Limit < Filter
|
191
228
|
|
192
|
-
|
193
|
-
|
229
|
+
description <<EOF
|
230
|
+
This processor passes a certain number of records and then stops
|
231
|
+
passing any, acting as a limit.
|
232
|
+
|
233
|
+
$ cat input
|
234
|
+
1
|
235
|
+
2
|
236
|
+
3
|
237
|
+
$ cat input | wu-local limit --max=2
|
238
|
+
1
|
239
|
+
2
|
240
|
+
|
241
|
+
If no --max argument is given, all records will be passed.
|
242
|
+
EOF
|
243
|
+
|
244
|
+
field :max, Integer, :doc => "Maximum number of records to let pass"
|
194
245
|
|
195
246
|
# The current record count.
|
196
247
|
attr_accessor :count
|
@@ -206,7 +257,7 @@ module Wukong
|
|
206
257
|
# @param [Object] record
|
207
258
|
# @return [true, false]
|
208
259
|
def select?(record)
|
209
|
-
keep = @count < max
|
260
|
+
keep = (max ? @count < max : true)
|
210
261
|
@count += 1
|
211
262
|
keep
|
212
263
|
end
|
@@ -236,9 +287,23 @@ module Wukong
|
|
236
287
|
# @see Limit
|
237
288
|
class Sample < Filter
|
238
289
|
|
239
|
-
|
240
|
-
|
241
|
-
|
290
|
+
description <<EOF
|
291
|
+
This processor will pass input records with a certain frequency,
|
292
|
+
acting as a random sampler.
|
293
|
+
|
294
|
+
$ cat input
|
295
|
+
1
|
296
|
+
2
|
297
|
+
3
|
298
|
+
4
|
299
|
+
$ cat input | wu-local sample --fraction=0.5
|
300
|
+
1
|
301
|
+
4
|
302
|
+
|
303
|
+
If no --fraction is given, all records will be passed.
|
304
|
+
EOF
|
305
|
+
|
306
|
+
field :fraction, Float, :default => 1.0, :doc => "Fraction of records to let pass. Must be between 0 and 1.0"
|
242
307
|
|
243
308
|
# Selects a `record` randomly, with a probability given by the
|
244
309
|
# `fraction` for this widget.
|
@@ -307,5 +372,66 @@ module Wukong
|
|
307
372
|
register
|
308
373
|
end
|
309
374
|
|
375
|
+
# Emit only the first `n` records.
|
376
|
+
#
|
377
|
+
# @see Filter
|
378
|
+
class Head < Filter
|
379
|
+
|
380
|
+
field :n, Integer, :default => 10, :doc => "Number of records to let pass"
|
381
|
+
|
382
|
+
# The current record count.
|
383
|
+
attr_accessor :count
|
384
|
+
|
385
|
+
# Initializes the record count to zero.
|
386
|
+
def setup
|
387
|
+
self.count = 0
|
388
|
+
end
|
389
|
+
|
390
|
+
# Select a record only if we're below the maximum number of
|
391
|
+
# records.
|
392
|
+
#
|
393
|
+
# @param [Object] record
|
394
|
+
# @return [true, false]
|
395
|
+
def select?(record)
|
396
|
+
keep = @count < n
|
397
|
+
@count += 1
|
398
|
+
keep
|
399
|
+
end
|
400
|
+
register
|
401
|
+
end
|
402
|
+
|
403
|
+
# Skip the first `n` records.
|
404
|
+
#
|
405
|
+
# Works slightly differently than the UNIX `tail` command which
|
406
|
+
# prints the last `n` records. This notion is less useful in a
|
407
|
+
# streaming context, so think of this filter as the equivalent of
|
408
|
+
# `tail -n+`.
|
409
|
+
#
|
410
|
+
# @see Filter
|
411
|
+
class Tail < Filter
|
412
|
+
|
413
|
+
field :n, Integer, :default => 0, :doc => "Number of records to skip before letting records pass"
|
414
|
+
|
415
|
+
# The current record count
|
416
|
+
attr_accessor :count
|
417
|
+
|
418
|
+
# Initializes the record count to zero.
|
419
|
+
def setup
|
420
|
+
self.count = 0
|
421
|
+
end
|
422
|
+
|
423
|
+
# Select a record only if we've already skipped the first `n`
|
424
|
+
# records.
|
425
|
+
#
|
426
|
+
# @param [Object] record
|
427
|
+
# @return [true, false]
|
428
|
+
def select?(record)
|
429
|
+
keep = (@count >= n)
|
430
|
+
@count += 1
|
431
|
+
keep
|
432
|
+
end
|
433
|
+
register
|
434
|
+
end
|
435
|
+
|
310
436
|
end
|
311
437
|
end
|
@@ -22,8 +22,29 @@ module Wukong
|
|
22
22
|
# ... | logger
|
23
23
|
# end
|
24
24
|
class Logger < Processor
|
25
|
-
|
26
|
-
|
25
|
+
field :level, Symbol, :default => :info, :doc => "Log level priority"
|
26
|
+
|
27
|
+
description <<EOF
|
28
|
+
This processor passes all input records unmodified, making a log
|
29
|
+
statement on each one.
|
30
|
+
|
31
|
+
$ cat input
|
32
|
+
1
|
33
|
+
2
|
34
|
+
3
|
35
|
+
$ cat input | wu-local logger
|
36
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 1
|
37
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 2
|
38
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 3
|
39
|
+
|
40
|
+
You can set the priority level of the log messages with the --level
|
41
|
+
flag.
|
42
|
+
|
43
|
+
$ cat input | wu-local logger --level=debug
|
44
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
|
45
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
|
46
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
|
47
|
+
EOF
|
27
48
|
|
28
49
|
# Process a given `record` by logging it.
|
29
50
|
#
|
@@ -100,8 +121,44 @@ module Wukong
|
|
100
121
|
class Extract < Processor
|
101
122
|
include DynamicGet
|
102
123
|
|
103
|
-
|
104
|
-
|
124
|
+
description <<EOF
|
125
|
+
This processor will pass extracted parts of input records.
|
126
|
+
|
127
|
+
It can be used to extract a field from a delimited input
|
128
|
+
|
129
|
+
$ cat input
|
130
|
+
snap crackle pop
|
131
|
+
a b c
|
132
|
+
$ cat input | wu-local extract --part=2
|
133
|
+
crackle
|
134
|
+
b
|
135
|
+
|
136
|
+
The default separator is a tab character but you can specify this as
|
137
|
+
well
|
138
|
+
|
139
|
+
$ cat input
|
140
|
+
snap,crackle,pop
|
141
|
+
a,b,c
|
142
|
+
$ cat input | wu-local extract --part=2 --separator=,
|
143
|
+
crackle
|
144
|
+
b
|
145
|
+
|
146
|
+
It can also be used on JSON records, even those with nested fields
|
147
|
+
|
148
|
+
$ cat input
|
149
|
+
{"id": 1, "data": {"text": "hi there"}}
|
150
|
+
{"id": 2, "data": {"text": "goodbye"}}
|
151
|
+
$ cat input | wu-local extract --part=id
|
152
|
+
1
|
153
|
+
2
|
154
|
+
$ cat input | wu-local extract --part=data.text
|
155
|
+
hi there
|
156
|
+
goodbye
|
157
|
+
|
158
|
+
If no --part argument is given, the original record will be yielded.
|
159
|
+
EOF
|
160
|
+
|
161
|
+
field :part, Whatever, :default => nil, :doc => "Part of the record to extract"
|
105
162
|
|
106
163
|
# Extract a `part` of a `record`.
|
107
164
|
#
|
@@ -115,7 +172,9 @@ module Wukong
|
|
115
172
|
end
|
116
173
|
|
117
174
|
class Topic < Processor
|
118
|
-
|
175
|
+
|
176
|
+
field :topic, Symbol, :doc => "Topic to label the record with"
|
177
|
+
|
119
178
|
def process(record)
|
120
179
|
yield perform_action(record)
|
121
180
|
end
|
@@ -12,7 +12,7 @@ module Wukong
|
|
12
12
|
# 0.03480
|
13
13
|
# 0.74418
|
14
14
|
# ...
|
15
|
-
# $ cat input | wu-local bin
|
15
|
+
# $ cat input | wu-local bin --to=tsv
|
16
16
|
#
|
17
17
|
# 0.02935 0.12638500000000003 7
|
18
18
|
# 0.12638500000000003 0.22342000000000004 11
|
@@ -20,7 +20,7 @@ module Wukong
|
|
20
20
|
#
|
21
21
|
# @example Control how the bins are defined and displayed
|
22
22
|
#
|
23
|
-
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1
|
23
|
+
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --to=tsv
|
24
24
|
# 0.0 0.1 10.0
|
25
25
|
# 0.1 0.2 12.0
|
26
26
|
# 0.2 0.3 8.0
|
@@ -28,7 +28,7 @@ module Wukong
|
|
28
28
|
#
|
29
29
|
# @example Include an additional column of normalized (fractional) counts
|
30
30
|
#
|
31
|
-
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize
|
31
|
+
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize --to=tsv
|
32
32
|
# 0.0 0.1 10.0 0.3
|
33
33
|
# 0.1 0.2 12.0 0.36
|
34
34
|
# 0.2 0.3 8.0 0.24
|
@@ -36,7 +36,7 @@ module Wukong
|
|
36
36
|
#
|
37
37
|
# @example Make a log-log histogram
|
38
38
|
#
|
39
|
-
# $ cat input | wu-local bin --log_bins --log_counts
|
39
|
+
# $ cat input | wu-local bin --log_bins --log_counts --to=tsv
|
40
40
|
# 1.000 3.162 1.099
|
41
41
|
# 3.162 10.000 1.946
|
42
42
|
# 10.000 31.623 3.045
|
@@ -48,29 +48,79 @@ module Wukong
|
|
48
48
|
# @example Use the bin at the end of a dataflow
|
49
49
|
#
|
50
50
|
# Wukong.processor(:bins_at_end) do
|
51
|
-
# ... | extract(part: 'age') | bin(num_bins: 10)
|
51
|
+
# ... | extract(part: 'age') | bin(num_bins: 10) | to_tsv
|
52
52
|
# end
|
53
53
|
#
|
54
54
|
# @see Accumulator
|
55
55
|
# @see Extract
|
56
56
|
class Bin < Accumulator
|
57
|
+
|
58
|
+
description <<EOF
|
59
|
+
This processor can be used to create a set of bins defining the
|
60
|
+
frequency distribution of the input records (or some part of each
|
61
|
+
input record).
|
62
|
+
|
63
|
+
Here's a simple example:
|
64
|
+
|
65
|
+
$ cat input.dat
|
66
|
+
1
|
67
|
+
2
|
68
|
+
3
|
69
|
+
...
|
70
|
+
100
|
71
|
+
|
72
|
+
$ cat input.dat | wu-local bin --to=tsv
|
73
|
+
1.000 10.900 10.000
|
74
|
+
10.900 20.800 10.000
|
75
|
+
20.800 30.700 10.000
|
76
|
+
30.700 40.600 10.000
|
77
|
+
...
|
78
|
+
90.100 100.000 10.000
|
79
|
+
|
80
|
+
By default, all the input values are included and the number of bins
|
81
|
+
used corresponds to the square root of the number of input values.
|
82
|
+
You can customize the domain for the distribution, the number of bins,
|
83
|
+
or the explicit bin edges themselves, via the --min, --max,
|
84
|
+
--num_bins, and --edges flags.
|
85
|
+
|
86
|
+
You can control the display of numbers with the --format_string and
|
87
|
+
--precision options.
|
88
|
+
|
89
|
+
$ cat input.dat | wu-local bin --num_bins=4 --min=0 --max=100 --precision=0 --to=tsv
|
90
|
+
0.0 25 24
|
91
|
+
25 50 25
|
92
|
+
50 75 25
|
93
|
+
75 100 26
|
94
|
+
|
95
|
+
You can use the --log_bins, --log_counts, and --base options to use
|
96
|
+
logarithmically spaced bins or logarithmic counts within each bin to
|
97
|
+
the given base.
|
98
|
+
|
99
|
+
You can also normalize the distribution using the --normalize option.
|
100
|
+
|
101
|
+
$ cat input.dat | wu-local bin --num_bins=4 --log_bins --normalize --to=tsv
|
102
|
+
1.000 3.162 3.000 0.030
|
103
|
+
3.162 10.000 7.000 0.070
|
104
|
+
10.000 31.623 21.000 0.210
|
105
|
+
31.623 100.000 69.000 0.690
|
106
|
+
EOF
|
57
107
|
|
58
|
-
field :num_bins, Integer
|
59
|
-
field :edges, Array
|
60
|
-
field :min, Float
|
61
|
-
field :max, Float
|
108
|
+
field :num_bins, Integer, :doc => "Number of bins to use"
|
109
|
+
field :edges, Array, :doc => "Number of edges to use"
|
110
|
+
field :min, Float, :doc => "Smallest bin starting point"
|
111
|
+
field :max, Float, :doc => "Largest bin ending point"
|
62
112
|
|
63
|
-
field :format_string, String
|
64
|
-
field :precision, Integer, :default => 3
|
113
|
+
field :format_string, String, :doc => "Format string used when printing numerical values"
|
114
|
+
field :precision, Integer, :doc => "Precision used when printing numerical values", :default => 3
|
65
115
|
|
66
116
|
include DynamicGet
|
67
|
-
field :by, Whatever
|
117
|
+
field :by, Whatever, :doc => "Bin the values extracted by this label"
|
68
118
|
|
69
|
-
field :log_bins, :boolean, :default => false
|
70
|
-
field :log_counts, :boolean, :default => false
|
71
|
-
field :base, Float, :default => Math::E
|
119
|
+
field :log_bins, :boolean, :default => false, :doc => "Use logarithmically spaced bins"
|
120
|
+
field :log_counts, :boolean, :default => false, :doc => "Use logarithmic bin counts"
|
121
|
+
field :base, Float, :default => Math::E, :doc => "Base for logarithms"
|
72
122
|
|
73
|
-
field :normalize, :boolean, :default => false
|
123
|
+
field :normalize, :boolean, :default => false, :doc => "Normalize bin counts so they sum to 1.0"
|
74
124
|
|
75
125
|
# The accumulated values
|
76
126
|
attr_accessor :values
|
@@ -148,7 +198,7 @@ module Wukong
|
|
148
198
|
if normalize && total_count > 0
|
149
199
|
bin << log_count_if_necessary((count.to_f / total_count.to_f))
|
150
200
|
end
|
151
|
-
yield bin.map { |n| format(n) }
|
201
|
+
yield bin.map { |n| format(n) }
|
152
202
|
end
|
153
203
|
end
|
154
204
|
|
@@ -169,7 +219,7 @@ module Wukong
|
|
169
219
|
when format_string
|
170
220
|
format_string % n
|
171
221
|
when n == 0.0
|
172
|
-
0.0
|
222
|
+
'0.0'
|
173
223
|
when n.abs > 1000 || n.abs < 0.001
|
174
224
|
"%#{precision}.#{precision}E" % n
|
175
225
|
else
|
@@ -17,6 +17,18 @@ module Wukong
|
|
17
17
|
# 283
|
18
18
|
class Count < Accumulator
|
19
19
|
|
20
|
+
description <<EOF
|
21
|
+
This processor counts the number of input records it receives.
|
22
|
+
|
23
|
+
$ wc -l input
|
24
|
+
283 input
|
25
|
+
$ cat input | wu-local count
|
26
|
+
283
|
27
|
+
|
28
|
+
This processor will not output any records until it receives its final
|
29
|
+
input record.
|
30
|
+
EOF
|
31
|
+
|
20
32
|
# The total size of the input records.
|
21
33
|
attr_accessor :size
|
22
34
|
|