wukong 3.0.0.pre3 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/README.md +689 -50
- data/bin/wu-local +1 -74
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/loadable.rb +2 -0
- data/examples/string_reverser.rb +7 -0
- data/lib/hanuman/stage.rb +2 -2
- data/lib/wukong.rb +21 -10
- data/lib/wukong/dataflow.rb +2 -5
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +11 -1
- data/lib/wukong/local.rb +40 -0
- data/lib/wukong/local/event_machine_driver.rb +27 -0
- data/lib/wukong/local/runner.rb +98 -0
- data/lib/wukong/local/stdio_driver.rb +44 -0
- data/lib/wukong/local/tcp_driver.rb +47 -0
- data/lib/wukong/logger.rb +16 -7
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +57 -15
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +151 -128
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/spec_helpers.rb +4 -12
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/{integration_driver_matchers.rb → integration_tests/integration_test_matchers.rb} +28 -62
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +19 -10
- data/lib/wukong/spec_helpers/unit_tests.rb +134 -0
- data/lib/wukong/spec_helpers/{processor_methods.rb → unit_tests/unit_test_driver.rb} +42 -8
- data/lib/wukong/spec_helpers/{spec_driver_matchers.rb → unit_tests/unit_test_matchers.rb} +6 -32
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +54 -0
- data/lib/wukong/version.rb +1 -1
- data/lib/wukong/widget/filters.rb +134 -8
- data/lib/wukong/widget/processors.rb +64 -5
- data/lib/wukong/widget/reducers/bin.rb +68 -18
- data/lib/wukong/widget/reducers/count.rb +12 -0
- data/lib/wukong/widget/reducers/group.rb +48 -5
- data/lib/wukong/widget/reducers/group_concat.rb +30 -2
- data/lib/wukong/widget/reducers/moments.rb +4 -4
- data/lib/wukong/widget/reducers/sort.rb +53 -3
- data/lib/wukong/widget/serializers.rb +37 -12
- data/lib/wukong/widget/utils.rb +1 -1
- data/spec/spec_helper.rb +20 -2
- data/spec/wukong/driver_spec.rb +2 -0
- data/spec/wukong/local/runner_spec.rb +40 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/processor_spec.rb +22 -0
- data/spec/wukong/runner_spec.rb +128 -8
- data/spec/wukong/widget/filters_spec.rb +28 -10
- data/spec/wukong/widget/processors_spec.rb +5 -5
- data/spec/wukong/widget/reducers/bin_spec.rb +14 -14
- data/spec/wukong/widget/reducers/count_spec.rb +1 -1
- data/spec/wukong/widget/reducers/group_spec.rb +7 -6
- data/spec/wukong/widget/reducers/moments_spec.rb +2 -2
- data/spec/wukong/widget/reducers/sort_spec.rb +1 -1
- data/spec/wukong/widget/serializers_spec.rb +84 -88
- data/spec/wukong/wu-local_spec.rb +109 -0
- metadata +43 -20
- data/bin/wu-server +0 -70
- data/lib/wukong/boot.rb +0 -96
- data/lib/wukong/configuration.rb +0 -8
- data/lib/wukong/emitter.rb +0 -22
- data/lib/wukong/server.rb +0 -119
- data/lib/wukong/spec_helpers/integration_driver.rb +0 -157
- data/lib/wukong/spec_helpers/processor_helpers.rb +0 -89
- data/lib/wukong/spec_helpers/spec_driver.rb +0 -28
- data/spec/wukong/local_runner_spec.rb +0 -31
- data/spec/wukong/wu_local_spec.rb +0 -125
data/lib/wukong/widget/filters.rb
CHANGED
@@ -5,6 +5,10 @@ module Wukong
|
|
5
5
|
# criterion.
|
6
6
|
class Filter < Processor
|
7
7
|
|
8
|
+
description <<EOF
|
9
|
+
A processor which filters input records according to some criterion.
|
10
|
+
EOF
|
11
|
+
|
8
12
|
# Process a `record` by yielding it only if it should be
|
9
13
|
# selected by this filter.
|
10
14
|
#
|
@@ -61,6 +65,7 @@ module Wukong
|
|
61
65
|
# @see Filter
|
62
66
|
# @see Null
|
63
67
|
class Identity < Filter
|
68
|
+
description "This processor passes all records unmodified."
|
64
69
|
register
|
65
70
|
end
|
66
71
|
|
@@ -84,6 +89,9 @@ module Wukong
|
|
84
89
|
# @see Filter
|
85
90
|
# @see All
|
86
91
|
class Null < Filter
|
92
|
+
|
93
|
+
description "This processor acts as a filter which passes no records at all."
|
94
|
+
|
87
95
|
# Prevents any records from passing because it always returns
|
88
96
|
# `false`.
|
89
97
|
#
|
@@ -117,8 +125,21 @@ module Wukong
|
|
117
125
|
# @see NotRegexpFilter
|
118
126
|
class RegexpFilter < Filter
|
119
127
|
|
120
|
-
|
121
|
-
|
128
|
+
description <<EOF
|
129
|
+
This processor only passes records which match against a given regular
|
130
|
+
expression.
|
131
|
+
|
132
|
+
$ cat input
|
133
|
+
apple
|
134
|
+
banana
|
135
|
+
cat
|
136
|
+
$ cat input | wu-local regexp --match='^a'
|
137
|
+
apple
|
138
|
+
|
139
|
+
If no --match argument is given, all records will be passed.
|
140
|
+
EOF
|
141
|
+
|
142
|
+
field :match, Regexp, :doc => "Regular expression to match against"
|
122
143
|
|
123
144
|
# Selects a `record` only if it matches this widget's `match`
|
124
145
|
# field.
|
@@ -154,6 +175,22 @@ module Wukong
|
|
154
175
|
# @see Filter
|
155
176
|
# @see RegexpFilter
|
156
177
|
class NotRegexpFilter < RegexpFilter
|
178
|
+
|
179
|
+
description <<EOF
|
180
|
+
This processor only passes records which fail to match against a given
|
181
|
+
regular expression.
|
182
|
+
|
183
|
+
$ cat input
|
184
|
+
apple
|
185
|
+
banana
|
186
|
+
cat
|
187
|
+
$ cat input | wu-local not_regexp --match='^a'
|
188
|
+
banana
|
189
|
+
cat
|
190
|
+
|
191
|
+
If no --match argument is given, all records will be passed.
|
192
|
+
EOF
|
193
|
+
|
157
194
|
# Select a `record` only if it <b>doesn't</b> match this
|
158
195
|
# widget's `match` field.
|
159
196
|
#
|
@@ -189,8 +226,22 @@ module Wukong
|
|
189
226
|
# @see Filter
|
190
227
|
class Limit < Filter
|
191
228
|
|
192
|
-
|
193
|
-
|
229
|
+
description <<EOF
|
230
|
+
This processor passes a certain number of records and then stops
|
231
|
+
passing any, acting as a limit.
|
232
|
+
|
233
|
+
$ cat input
|
234
|
+
1
|
235
|
+
2
|
236
|
+
3
|
237
|
+
$ cat input | wu-local limit --max=2
|
238
|
+
1
|
239
|
+
2
|
240
|
+
|
241
|
+
If no --max argument is given, all records will be passed.
|
242
|
+
EOF
|
243
|
+
|
244
|
+
field :max, Integer, :doc => "Maximum number of records to let pass"
|
194
245
|
|
195
246
|
# The current record count.
|
196
247
|
attr_accessor :count
|
@@ -206,7 +257,7 @@ module Wukong
|
|
206
257
|
# @param [Object] record
|
207
258
|
# @return [true, false]
|
208
259
|
def select?(record)
|
209
|
-
keep = @count < max
|
260
|
+
keep = (max ? @count < max : true)
|
210
261
|
@count += 1
|
211
262
|
keep
|
212
263
|
end
|
@@ -236,9 +287,23 @@ module Wukong
|
|
236
287
|
# @see Limit
|
237
288
|
class Sample < Filter
|
238
289
|
|
239
|
-
|
240
|
-
|
241
|
-
|
290
|
+
description <<EOF
|
291
|
+
This processor will pass input records with a certain frequency,
|
292
|
+
acting as a random sampler.
|
293
|
+
|
294
|
+
$ cat input
|
295
|
+
1
|
296
|
+
2
|
297
|
+
3
|
298
|
+
4
|
299
|
+
$ cat input | wu-local sample --fraction=0.5
|
300
|
+
1
|
301
|
+
4
|
302
|
+
|
303
|
+
If no --fraction is given, all records will be passed.
|
304
|
+
EOF
|
305
|
+
|
306
|
+
field :fraction, Float, :default => 1.0, :doc => "Fraction of records to let pass. Must be between 0 and 1.0"
|
242
307
|
|
243
308
|
# Selects a `record` randomly, with a probability given by the
|
244
309
|
# `fraction` for this widget.
|
@@ -307,5 +372,66 @@ module Wukong
|
|
307
372
|
register
|
308
373
|
end
|
309
374
|
|
375
|
+
# Emit only the first `n` records.
|
376
|
+
#
|
377
|
+
# @see Filter
|
378
|
+
class Head < Filter
|
379
|
+
|
380
|
+
field :n, Integer, :default => 10, :doc => "Number of records to let pass"
|
381
|
+
|
382
|
+
# The current record count.
|
383
|
+
attr_accessor :count
|
384
|
+
|
385
|
+
# Initializes the record count to zero.
|
386
|
+
def setup
|
387
|
+
self.count = 0
|
388
|
+
end
|
389
|
+
|
390
|
+
# Select a record only if we're below the maximum number of
|
391
|
+
# records.
|
392
|
+
#
|
393
|
+
# @param [Object] record
|
394
|
+
# @return [true, false]
|
395
|
+
def select?(record)
|
396
|
+
keep = @count < n
|
397
|
+
@count += 1
|
398
|
+
keep
|
399
|
+
end
|
400
|
+
register
|
401
|
+
end
|
402
|
+
|
403
|
+
# Skip the first `n` records.
|
404
|
+
#
|
405
|
+
# Works slightly differently than the UNIX `tail` command which
|
406
|
+
# prints the last `n` records. This notion is less useful in a
|
407
|
+
# streaming context, so think of this filter as the equivalent of
|
408
|
+
# `tail -n+`.
|
409
|
+
#
|
410
|
+
# @see Filter
|
411
|
+
class Tail < Filter
|
412
|
+
|
413
|
+
field :n, Integer, :default => 0, :doc => "Number of records to skip before letting records pass"
|
414
|
+
|
415
|
+
# The current record count
|
416
|
+
attr_accessor :count
|
417
|
+
|
418
|
+
# Initializes the record count to zero.
|
419
|
+
def setup
|
420
|
+
self.count = 0
|
421
|
+
end
|
422
|
+
|
423
|
+
# Select a record only if we've already skipped the first `n`
|
424
|
+
# records.
|
425
|
+
#
|
426
|
+
# @param [Object] record
|
427
|
+
# @return [true, false]
|
428
|
+
def select?(record)
|
429
|
+
keep = (@count >= n)
|
430
|
+
@count += 1
|
431
|
+
keep
|
432
|
+
end
|
433
|
+
register
|
434
|
+
end
|
435
|
+
|
310
436
|
end
|
311
437
|
end
|
@@ -22,8 +22,29 @@ module Wukong
|
|
22
22
|
# ... | logger
|
23
23
|
# end
|
24
24
|
class Logger < Processor
|
25
|
-
|
26
|
-
|
25
|
+
field :level, Symbol, :default => :info, :doc => "Log level priority"
|
26
|
+
|
27
|
+
description <<EOF
|
28
|
+
This processor passes all input records unmodified, making a log
|
29
|
+
statement on each one.
|
30
|
+
|
31
|
+
$ cat input
|
32
|
+
1
|
33
|
+
2
|
34
|
+
3
|
35
|
+
$ cat input | wu-local logger
|
36
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 1
|
37
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 2
|
38
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 3
|
39
|
+
|
40
|
+
You can set the priority level of the log messages with the --level
|
41
|
+
flag.
|
42
|
+
|
43
|
+
$ cat input | wu-local logger --level=debug
|
44
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
|
45
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
|
46
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
|
47
|
+
EOF
|
27
48
|
|
28
49
|
# Process a given `record` by logging it.
|
29
50
|
#
|
@@ -100,8 +121,44 @@ module Wukong
|
|
100
121
|
class Extract < Processor
|
101
122
|
include DynamicGet
|
102
123
|
|
103
|
-
|
104
|
-
|
124
|
+
description <<EOF
|
125
|
+
This processor will pass extracted parts of input records.
|
126
|
+
|
127
|
+
It can be used to extract a field from a delimited input
|
128
|
+
|
129
|
+
$ cat input
|
130
|
+
snap crackle pop
|
131
|
+
a b c
|
132
|
+
$ cat input | wu-local extract --part=2
|
133
|
+
crackle
|
134
|
+
b
|
135
|
+
|
136
|
+
The default separator is a tab character but you can specify this as
|
137
|
+
well
|
138
|
+
|
139
|
+
$ cat input
|
140
|
+
snap,crackle,pop
|
141
|
+
a,b,c
|
142
|
+
$ cat input | wu-local extract --part=2 --separator=,
|
143
|
+
crackle
|
144
|
+
b
|
145
|
+
|
146
|
+
It can also be used on JSON records, even those with nested fields
|
147
|
+
|
148
|
+
$ cat input
|
149
|
+
{"id": 1, {"data": {"text": "hi there"}}
|
150
|
+
{"id": 2, {"data": {"text": "goodbye"}}
|
151
|
+
$ cat input | wu-local extract --part=id
|
152
|
+
1
|
153
|
+
2
|
154
|
+
$ cat input | wu-local extract --part=data.text
|
155
|
+
hi there
|
156
|
+
goodbye
|
157
|
+
|
158
|
+
If no --part argument is given, the original record will be yielded.
|
159
|
+
EOF
|
160
|
+
|
161
|
+
field :part, Whatever, :default => nil, :doc => "Part of the record to extract"
|
105
162
|
|
106
163
|
# Extract a `part` of a `record`.
|
107
164
|
#
|
@@ -115,7 +172,9 @@ module Wukong
|
|
115
172
|
end
|
116
173
|
|
117
174
|
class Topic < Processor
|
118
|
-
|
175
|
+
|
176
|
+
field :topic, Symbol, :doc => "Topic to label the record with"
|
177
|
+
|
119
178
|
def process(record)
|
120
179
|
yield perform_action(record)
|
121
180
|
end
|
@@ -12,7 +12,7 @@ module Wukong
|
|
12
12
|
# 0.03480
|
13
13
|
# 0.74418
|
14
14
|
# ...
|
15
|
-
# $ cat input | wu-local bin
|
15
|
+
# $ cat input | wu-local bin --to=tsv
|
16
16
|
#
|
17
17
|
# 0.02935 0.12638500000000003 7
|
18
18
|
# 0.12638500000000003 0.22342000000000004 11
|
@@ -20,7 +20,7 @@ module Wukong
|
|
20
20
|
#
|
21
21
|
# @example Control how the bins are defined and displayed
|
22
22
|
#
|
23
|
-
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1
|
23
|
+
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --to=tsv
|
24
24
|
# 0.0 0.1 10.0
|
25
25
|
# 0.1 0.2 12.0
|
26
26
|
# 0.2 0.3 8.0
|
@@ -28,7 +28,7 @@ module Wukong
|
|
28
28
|
#
|
29
29
|
# @example Include an additional column of normalized (fractional) counts
|
30
30
|
#
|
31
|
-
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize
|
31
|
+
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize --to=tsv
|
32
32
|
# 0.0 0.1 10.0 0.3
|
33
33
|
# 0.1 0.2 12.0 0.36
|
34
34
|
# 0.2 0.3 8.0 0.24
|
@@ -36,7 +36,7 @@ module Wukong
|
|
36
36
|
#
|
37
37
|
# @example Make a log-log histogram
|
38
38
|
#
|
39
|
-
# $ cat input | wu-local bin --log_bins --log_counts
|
39
|
+
# $ cat input | wu-local bin --log_bins --log_counts --to=tsv
|
40
40
|
# 1.000 3.162 1.099
|
41
41
|
# 3.162 10.000 1.946
|
42
42
|
# 10.000 31.623 3.045
|
@@ -48,29 +48,79 @@ module Wukong
|
|
48
48
|
# @example Use the bin at the end of a dataflow
|
49
49
|
#
|
50
50
|
# Wukong.processor(:bins_at_end) do
|
51
|
-
# ... | extract(part: 'age') | bin(num_bins: 10)
|
51
|
+
# ... | extract(part: 'age') | bin(num_bins: 10) | to_tsv
|
52
52
|
# end
|
53
53
|
#
|
54
54
|
# @see Accumulator
|
55
55
|
# @see Extract
|
56
56
|
class Bin < Accumulator
|
57
|
+
|
58
|
+
description <<EOF
|
59
|
+
This processor can be used to create a set of bins defining the
|
60
|
+
frequency distribution of the input records (or some part of each
|
61
|
+
input record).
|
62
|
+
|
63
|
+
Here's a simple example:
|
64
|
+
|
65
|
+
$ cat input.dat
|
66
|
+
1
|
67
|
+
2
|
68
|
+
3
|
69
|
+
...
|
70
|
+
100
|
71
|
+
|
72
|
+
$ cat input.dat | wu-local bin --to=tsv
|
73
|
+
1.000 10.900 10.000
|
74
|
+
10.900 20.800 10.000
|
75
|
+
20.800 30.700 10.000
|
76
|
+
30.700 40.600 10.000
|
77
|
+
...
|
78
|
+
90.100 100.000 10.000
|
79
|
+
|
80
|
+
By default, all the input values are included and the number of bins
|
81
|
+
used corresponds to the square root of the number of input values.
|
82
|
+
You can customize the domain for the distribution, the number of bins,
|
83
|
+
or the explicit bin edges themselves, via the --min, --max,
|
84
|
+
--num_bins, and --edges flags.
|
85
|
+
|
86
|
+
You can control the display of numbers with the --format_string and
|
87
|
+
--precision options.
|
88
|
+
|
89
|
+
$ cat input.dat | wu-local bin --num_bins=4 --min=0 --max=100 --precision=0 --to=tsv
|
90
|
+
0.0 25 24
|
91
|
+
25 50 25
|
92
|
+
50 75 25
|
93
|
+
75 100 26
|
94
|
+
|
95
|
+
You can use the --log_bins, --log_counts, and --base options to use
|
96
|
+
logarithmically spaced bins or logarithmic counts within each bin to
|
97
|
+
the given base.
|
98
|
+
|
99
|
+
You can also normalize the distribution using the --normalize option.
|
100
|
+
|
101
|
+
$ cat input.dat | wu-local bin --num_bins=4 --log_bins --normalize --to=tsv
|
102
|
+
1.000 3.162 3.000 0.030
|
103
|
+
3.162 10.000 7.000 0.070
|
104
|
+
10.000 31.623 21.000 0.210
|
105
|
+
31.623 100.000 69.000 0.690
|
106
|
+
EOF
|
57
107
|
|
58
|
-
field :num_bins, Integer
|
59
|
-
field :edges, Array
|
60
|
-
field :min, Float
|
61
|
-
field :max, Float
|
108
|
+
field :num_bins, Integer, :doc => "Number of bins to use"
|
109
|
+
field :edges, Array, :doc => "Number of edges to use"
|
110
|
+
field :min, Float, :doc => "Smallest bin starting point"
|
111
|
+
field :max, Float, :doc => "Largest bin ending point"
|
62
112
|
|
63
|
-
field :format_string, String
|
64
|
-
field :precision, Integer, :default => 3
|
113
|
+
field :format_string, String, :doc => "Format string used when printing numerical values"
|
114
|
+
field :precision, Integer, :doc => "Precision used when printing numerical values", :default => 3
|
65
115
|
|
66
116
|
include DynamicGet
|
67
|
-
field :by, Whatever
|
117
|
+
field :by, Whatever, :doc => "Bin the values extracted by this label"
|
68
118
|
|
69
|
-
field :log_bins, :boolean, :default => false
|
70
|
-
field :log_counts, :boolean, :default => false
|
71
|
-
field :base, Float, :default => Math::E
|
119
|
+
field :log_bins, :boolean, :default => false, :doc => "Use logarithmically spaced bins"
|
120
|
+
field :log_counts, :boolean, :default => false, :doc => "Use logarithmic bin counts"
|
121
|
+
field :base, Float, :default => Math::E, :doc => "Base for logarithms"
|
72
122
|
|
73
|
-
field :normalize, :boolean, :default => false
|
123
|
+
field :normalize, :boolean, :default => false, :doc => "Normalize bin counts so they sum to 1.0"
|
74
124
|
|
75
125
|
# The accumulated values
|
76
126
|
attr_accessor :values
|
@@ -148,7 +198,7 @@ module Wukong
|
|
148
198
|
if normalize && total_count > 0
|
149
199
|
bin << log_count_if_necessary((count.to_f / total_count.to_f))
|
150
200
|
end
|
151
|
-
yield bin.map { |n| format(n) }
|
201
|
+
yield bin.map { |n| format(n) }
|
152
202
|
end
|
153
203
|
end
|
154
204
|
|
@@ -169,7 +219,7 @@ module Wukong
|
|
169
219
|
when format_string
|
170
220
|
format_string % n
|
171
221
|
when n == 0.0
|
172
|
-
0.0
|
222
|
+
'0.0'
|
173
223
|
when n.abs > 1000 || n.abs < 0.001
|
174
224
|
"%#{precision}.#{precision}E" % n
|
175
225
|
else
|
@@ -17,6 +17,18 @@ module Wukong
|
|
17
17
|
# 283
|
18
18
|
class Count < Accumulator
|
19
19
|
|
20
|
+
description <<EOF
|
21
|
+
This processor counts the number of input records it receives.
|
22
|
+
|
23
|
+
$ wc -l input
|
24
|
+
283 input
|
25
|
+
$ cat input | wu-local count
|
26
|
+
283
|
27
|
+
|
28
|
+
This processor will not output any records until it receives its final
|
29
|
+
input record.
|
30
|
+
EOF
|
31
|
+
|
20
32
|
# The total size of the input records.
|
21
33
|
attr_accessor :size
|
22
34
|
|