wukong 3.0.0.pre3 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/Gemfile +1 -0
  2. data/README.md +689 -50
  3. data/bin/wu-local +1 -74
  4. data/diagrams/wu_local.dot +39 -0
  5. data/diagrams/wu_local.dot.png +0 -0
  6. data/examples/loadable.rb +2 -0
  7. data/examples/string_reverser.rb +7 -0
  8. data/lib/hanuman/stage.rb +2 -2
  9. data/lib/wukong.rb +21 -10
  10. data/lib/wukong/dataflow.rb +2 -5
  11. data/lib/wukong/doc_helpers.rb +14 -0
  12. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  13. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  14. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  15. data/lib/wukong/driver.rb +11 -1
  16. data/lib/wukong/local.rb +40 -0
  17. data/lib/wukong/local/event_machine_driver.rb +27 -0
  18. data/lib/wukong/local/runner.rb +98 -0
  19. data/lib/wukong/local/stdio_driver.rb +44 -0
  20. data/lib/wukong/local/tcp_driver.rb +47 -0
  21. data/lib/wukong/logger.rb +16 -7
  22. data/lib/wukong/plugin.rb +48 -0
  23. data/lib/wukong/processor.rb +57 -15
  24. data/lib/wukong/rake_helper.rb +6 -0
  25. data/lib/wukong/runner.rb +151 -128
  26. data/lib/wukong/runner/boot_sequence.rb +123 -0
  27. data/lib/wukong/runner/code_loader.rb +52 -0
  28. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  29. data/lib/wukong/runner/help_message.rb +42 -0
  30. data/lib/wukong/spec_helpers.rb +4 -12
  31. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  32. data/lib/wukong/spec_helpers/{integration_driver_matchers.rb → integration_tests/integration_test_matchers.rb} +28 -62
  33. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  34. data/lib/wukong/spec_helpers/shared_examples.rb +19 -10
  35. data/lib/wukong/spec_helpers/unit_tests.rb +134 -0
  36. data/lib/wukong/spec_helpers/{processor_methods.rb → unit_tests/unit_test_driver.rb} +42 -8
  37. data/lib/wukong/spec_helpers/{spec_driver_matchers.rb → unit_tests/unit_test_matchers.rb} +6 -32
  38. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +54 -0
  39. data/lib/wukong/version.rb +1 -1
  40. data/lib/wukong/widget/filters.rb +134 -8
  41. data/lib/wukong/widget/processors.rb +64 -5
  42. data/lib/wukong/widget/reducers/bin.rb +68 -18
  43. data/lib/wukong/widget/reducers/count.rb +12 -0
  44. data/lib/wukong/widget/reducers/group.rb +48 -5
  45. data/lib/wukong/widget/reducers/group_concat.rb +30 -2
  46. data/lib/wukong/widget/reducers/moments.rb +4 -4
  47. data/lib/wukong/widget/reducers/sort.rb +53 -3
  48. data/lib/wukong/widget/serializers.rb +37 -12
  49. data/lib/wukong/widget/utils.rb +1 -1
  50. data/spec/spec_helper.rb +20 -2
  51. data/spec/wukong/driver_spec.rb +2 -0
  52. data/spec/wukong/local/runner_spec.rb +40 -0
  53. data/spec/wukong/local_spec.rb +6 -0
  54. data/spec/wukong/logger_spec.rb +49 -0
  55. data/spec/wukong/processor_spec.rb +22 -0
  56. data/spec/wukong/runner_spec.rb +128 -8
  57. data/spec/wukong/widget/filters_spec.rb +28 -10
  58. data/spec/wukong/widget/processors_spec.rb +5 -5
  59. data/spec/wukong/widget/reducers/bin_spec.rb +14 -14
  60. data/spec/wukong/widget/reducers/count_spec.rb +1 -1
  61. data/spec/wukong/widget/reducers/group_spec.rb +7 -6
  62. data/spec/wukong/widget/reducers/moments_spec.rb +2 -2
  63. data/spec/wukong/widget/reducers/sort_spec.rb +1 -1
  64. data/spec/wukong/widget/serializers_spec.rb +84 -88
  65. data/spec/wukong/wu-local_spec.rb +109 -0
  66. metadata +43 -20
  67. data/bin/wu-server +0 -70
  68. data/lib/wukong/boot.rb +0 -96
  69. data/lib/wukong/configuration.rb +0 -8
  70. data/lib/wukong/emitter.rb +0 -22
  71. data/lib/wukong/server.rb +0 -119
  72. data/lib/wukong/spec_helpers/integration_driver.rb +0 -157
  73. data/lib/wukong/spec_helpers/processor_helpers.rb +0 -89
  74. data/lib/wukong/spec_helpers/spec_driver.rb +0 -28
  75. data/spec/wukong/local_runner_spec.rb +0 -31
  76. data/spec/wukong/wu_local_spec.rb +0 -125
@@ -1,4 +1,4 @@
1
1
  module Wukong
2
2
  # The current version of Wukong.
3
- VERSION = '3.0.0.pre3'
3
+ VERSION = '3.0.0'
4
4
  end
@@ -5,6 +5,10 @@ module Wukong
5
5
  # criterion.
6
6
  class Filter < Processor
7
7
 
8
+ description <<EOF
9
+ A processor which filters input records according to some criterion.
10
+ EOF
11
+
8
12
  # Process a `record` by yielding it only if it should be
9
13
  # selected by this filter.
10
14
  #
@@ -61,6 +65,7 @@ module Wukong
61
65
  # @see Filter
62
66
  # @see Null
63
67
  class Identity < Filter
68
+ description "This processor passes all records unmodified."
64
69
  register
65
70
  end
66
71
 
@@ -84,6 +89,9 @@ module Wukong
84
89
  # @see Filter
85
90
  # @see All
86
91
  class Null < Filter
92
+
93
+ description "This processor acts as a filter which passes no records at all."
94
+
87
95
  # Prevents any records from passing because it always returns
88
96
  # `false`.
89
97
  #
@@ -117,8 +125,21 @@ module Wukong
117
125
  # @see NotRegexpFilter
118
126
  class RegexpFilter < Filter
119
127
 
120
- # The regular expression to use to match records.
121
- field :match, Regexp
128
+ description <<EOF
129
+ This processor only passes records which match against a given regular
130
+ expression.
131
+
132
+ $ cat input
133
+ apple
134
+ banana
135
+ cat
136
+ $ cat input | wu-local regexp --match='^a'
137
+ apple
138
+
139
+ If no --match argument is given, all records will be passed.
140
+ EOF
141
+
142
+ field :match, Regexp, :doc => "Regular expression to match against"
122
143
 
123
144
  # Selects a `record` only if it matches this widget's `match`
124
145
  # field.
@@ -154,6 +175,22 @@ module Wukong
154
175
  # @see Filter
155
176
  # @see NotRegexpFilter
156
177
  class NotRegexpFilter < RegexpFilter
178
+
179
+ description <<EOF
180
+ This processor only passes records which fail to match against a given
181
+ regular expression.
182
+
183
+ $ cat input
184
+ apple
185
+ banana
186
+ cat
187
+ $ cat input | wu-local not_regexp --match='^a'
188
+ banana
189
+ cat
190
+
191
+ If no --match argument is given, all records will be passed.
192
+ EOF
193
+
157
194
  # Select a `record` only if it <b>doesn't</b> match this
158
195
  # widget's `match` field.
159
196
  #
@@ -189,8 +226,22 @@ module Wukong
189
226
  # @see Filter
190
227
  class Limit < Filter
191
228
 
192
- # The maximum number of records to let pass.
193
- field :max, Integer, :default => Float::INFINITY
229
+ description <<EOF
230
+ This processor passes a certain number of records and then stops
231
+ passing any, acting as a limit.
232
+
233
+ $ cat input
234
+ 1
235
+ 2
236
+ 3
237
+ $ cat input | wu-local limit --max=2
238
+ 1
239
+ 2
240
+
241
+ If no --max argument is given, all records will be passed.
242
+ EOF
243
+
244
+ field :max, Integer, :doc => "Maximum number of records to let pass"
194
245
 
195
246
  # The current record count.
196
247
  attr_accessor :count
@@ -206,7 +257,7 @@ module Wukong
206
257
  # @param [Object] record
207
258
  # @return [true, false]
208
259
  def select?(record)
209
- keep = @count < max
260
+ keep = (max ? @count < max : true)
210
261
  @count += 1
211
262
  keep
212
263
  end
@@ -236,9 +287,23 @@ module Wukong
236
287
  # @see Limit
237
288
  class Sample < Filter
238
289
 
239
- # The fraction of records to let pass. Must be between 0.0 and
240
- # 10.0
241
- field :fraction, Float, :default => 1.0
290
+ description <<EOF
291
+ This processor will pass input records with a certain frequency,
292
+ acting as a random sampler.
293
+
294
+ $ cat input
295
+ 1
296
+ 2
297
+ 3
298
+ 4
299
+ $ cat input | wu-local sample --fraction=0.5
300
+ 1
301
+ 4
302
+
303
+ If no --fraction is given, all records will be passed.
304
+ EOF
305
+
306
+ field :fraction, Float, :default => 1.0, :doc => "Fraction of records to let pass. Must be between 0 and 1.0"
242
307
 
243
308
  # Selects a `record` randomly, with a probability given by the
244
309
  # `fraction` for this widget.
@@ -307,5 +372,66 @@ module Wukong
307
372
  register
308
373
  end
309
374
 
375
+ # Emit only the first `n` records.
376
+ #
377
+ # @see Filter
378
+ class Head < Filter
379
+
380
+ field :n, Integer, :default => 10, :doc => "Number of records to let pass"
381
+
382
+ # The current record count.
383
+ attr_accessor :count
384
+
385
+ # Initializes the record count to zero.
386
+ def setup
387
+ self.count = 0
388
+ end
389
+
390
+ # Select a record only if we're below the maximum number of
391
+ # records.
392
+ #
393
+ # @param [Object] record
394
+ # @return [true, false]
395
+ def select?(record)
396
+ keep = @count < n
397
+ @count += 1
398
+ keep
399
+ end
400
+ register
401
+ end
402
+
403
+ # Skip the first `n` records.
404
+ #
405
+ # Works slightly differently than the UNIX `tail` command which
406
+ # prints the last `n` records. This notion is less useful in a
407
+ # streaming context, so think of this filter as the equivalent of
408
+ # `tail -n +K` with K = n + 1.
409
+ #
410
+ # @see Filter
411
+ class Tail < Filter
412
+
413
+ field :n, Integer, :default => 0, :doc => "Number of records to skip before letting records pass"
414
+
415
+ # The current record count
416
+ attr_accessor :count
417
+
418
+ # Initializes the record count to zero.
419
+ def setup
420
+ self.count = 0
421
+ end
422
+
423
+ # Select a record only if we've already skipped the first `n`
424
+ # records.
425
+ #
426
+ # @param [Object] record
427
+ # @return [true, false]
428
+ def select?(record)
429
+ keep = (@count >= n)
430
+ @count += 1
431
+ keep
432
+ end
433
+ register
434
+ end
435
+
310
436
  end
311
437
  end
@@ -22,8 +22,29 @@ module Wukong
22
22
  # ... | logger
23
23
  # end
24
24
  class Logger < Processor
25
- # The level to use for logging.
26
- field :level, Symbol, :default => :info
25
+ field :level, Symbol, :default => :info, :doc => "Log level priority"
26
+
27
+ description <<EOF
28
+ This processor passes all input records unmodified, making a log
29
+ statement on each one.
30
+
31
+ $ cat input
32
+ 1
33
+ 2
34
+ 3
35
+ $ cat input | wu-local logger
36
+ INFO 2013-01-04 17:10:59 [Logger ] -- 1
37
+ INFO 2013-01-04 17:10:59 [Logger ] -- 2
38
+ INFO 2013-01-04 17:10:59 [Logger ] -- 3
39
+
40
+ You can set the priority level of the log messages with the --level
41
+ flag.
42
+
43
+ $ cat input | wu-local logger --level=debug
44
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
45
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
46
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
47
+ EOF
27
48
 
28
49
  # Process a given `record` by logging it.
29
50
  #
@@ -100,8 +121,44 @@ module Wukong
100
121
  class Extract < Processor
101
122
  include DynamicGet
102
123
 
103
- # The part to extract.
104
- field :part, Whatever, :default => nil
124
+ description <<EOF
125
+ This processor will pass extracted parts of input records.
126
+
127
+ It can be used to extract a field from a delimited input
128
+
129
+ $ cat input
130
+ snap crackle pop
131
+ a b c
132
+ $ cat input | wu-local extract --part=2
133
+ crackle
134
+ b
135
+
136
+ The default separator is a tab character but you can specify this as
137
+ well
138
+
139
+ $ cat input
140
+ snap,crackle,pop
141
+ a,b,c
142
+ $ cat input | wu-local extract --part=2 --separator=,
143
+ crackle
144
+ b
145
+
146
+ It can also be used on JSON records, even those with nested fields
147
+
148
+ $ cat input
149
+ {"id": 1, "data": {"text": "hi there"}}
150
+ {"id": 2, "data": {"text": "goodbye"}}
151
+ $ cat input | wu-local extract --part=id
152
+ 1
153
+ 2
154
+ $ cat input | wu-local extract --part=data.text
155
+ hi there
156
+ goodbye
157
+
158
+ If no --part argument is given, the original record will be yielded.
159
+ EOF
160
+
161
+ field :part, Whatever, :default => nil, :doc => "Part of the record to extract"
105
162
 
106
163
  # Extract a `part` of a `record`.
107
164
  #
@@ -115,7 +172,9 @@ module Wukong
115
172
  end
116
173
 
117
174
  class Topic < Processor
118
- field :topic, Symbol
175
+
176
+ field :topic, Symbol, :doc => "Topic to label the record with"
177
+
119
178
  def process(record)
120
179
  yield perform_action(record)
121
180
  end
@@ -12,7 +12,7 @@ module Wukong
12
12
  # 0.03480
13
13
  # 0.74418
14
14
  # ...
15
- # $ cat input | wu-local bin
15
+ # $ cat input | wu-local bin --to=tsv
16
16
  #
17
17
  # 0.02935 0.12638500000000003 7
18
18
  # 0.12638500000000003 0.22342000000000004 11
@@ -20,7 +20,7 @@ module Wukong
20
20
  #
21
21
  # @example Control how the bins are defined and displayed
22
22
  #
23
- # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1
23
+ # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --to=tsv
24
24
  # 0.0 0.1 10.0
25
25
  # 0.1 0.2 12.0
26
26
  # 0.2 0.3 8.0
@@ -28,7 +28,7 @@ module Wukong
28
28
  #
29
29
  # @example Include an additional column of normalized (fractional) counts
30
30
  #
31
- # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize
31
+ # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize --to=tsv
32
32
  # 0.0 0.1 10.0 0.3
33
33
  # 0.1 0.2 12.0 0.36
34
34
  # 0.2 0.3 8.0 0.24
@@ -36,7 +36,7 @@ module Wukong
36
36
  #
37
37
  # @example Make a log-log histogram
38
38
  #
39
- # $ cat input | wu-local bin --log_bins --log_counts
39
+ # $ cat input | wu-local bin --log_bins --log_counts --to=tsv
40
40
  # 1.000 3.162 1.099
41
41
  # 3.162 10.000 1.946
42
42
  # 10.000 31.623 3.045
@@ -48,29 +48,79 @@ module Wukong
48
48
  # @example Use the bin at the end of a dataflow
49
49
  #
50
50
  # Wukong.processor(:bins_at_end) do
51
- # ... | extract(part: 'age') | bin(num_bins: 10)
51
+ # ... | extract(part: 'age') | bin(num_bins: 10) | to_tsv
52
52
  # end
53
53
  #
54
54
  # @see Accumulator
55
55
  # @see Extract
56
56
  class Bin < Accumulator
57
+
58
+ description <<EOF
59
+ This processor can be used to create a set of bins defining the
60
+ frequency distribution of the input records (or some part of each
61
+ input record).
62
+
63
+ Here's a simple example:
64
+
65
+ $ cat input.dat
66
+ 1
67
+ 2
68
+ 3
69
+ ...
70
+ 100
71
+
72
+ $ cat input.dat | wu-local bin --to=tsv
73
+ 1.000 10.900 10.000
74
+ 10.900 20.800 10.000
75
+ 20.800 30.700 10.000
76
+ 30.700 40.600 10.000
77
+ ...
78
+ 90.100 100.000 10.000
79
+
80
+ By default, all the input values are included and the number of bins
81
+ used corresponds to the square root of the number of input values.
82
+ You can customize the domain for the distribution, the number of bins,
83
+ or the explicit bin edges themselves, via the --min, --max,
84
+ --num_bins, and --edges flags.
85
+
86
+ You can control the display of numbers with the --format_string and
87
+ --precision options.
88
+
89
+ $ cat input.dat | wu-local bin --num_bins=4 --min=0 --max=100 --precision=0 --to=tsv
90
+ 0.0 25 24
91
+ 25 50 25
92
+ 50 75 25
93
+ 75 100 26
94
+
95
+ You can use the --log_bins, --log_counts, and --base options to use
96
+ logarithmically spaced bins or logarithmic counts within each bin to
97
+ the given base.
98
+
99
+ You can also normalize the distribution using the --normalize option.
100
+
101
+ $ cat input.dat | wu-local bin --num_bins=4 --log_bins --normalize --to=tsv
102
+ 1.000 3.162 3.000 0.030
103
+ 3.162 10.000 7.000 0.070
104
+ 10.000 31.623 21.000 0.210
105
+ 31.623 100.000 69.000 0.690
106
+ EOF
57
107
 
58
- field :num_bins, Integer
59
- field :edges, Array
60
- field :min, Float
61
- field :max, Float
108
+ field :num_bins, Integer, :doc => "Number of bins to use"
109
+ field :edges, Array, :doc => "Number of edges to use"
110
+ field :min, Float, :doc => "Smallest bin starting point"
111
+ field :max, Float, :doc => "Largest bin ending point"
62
112
 
63
- field :format_string, String
64
- field :precision, Integer, :default => 3
113
+ field :format_string, String, :doc => "Format string used when printing numerical values"
114
+ field :precision, Integer, :doc => "Precision used when printing numerical values", :default => 3
65
115
 
66
116
  include DynamicGet
67
- field :by, Whatever
117
+ field :by, Whatever, :doc => "Bin the values extracted by this label"
68
118
 
69
- field :log_bins, :boolean, :default => false
70
- field :log_counts, :boolean, :default => false
71
- field :base, Float, :default => Math::E
119
+ field :log_bins, :boolean, :default => false, :doc => "Use logarithmically spaced bins"
120
+ field :log_counts, :boolean, :default => false, :doc => "Use logarithmic bin counts"
121
+ field :base, Float, :default => Math::E, :doc => "Base for logarithms"
72
122
 
73
- field :normalize, :boolean, :default => false
123
+ field :normalize, :boolean, :default => false, :doc => "Normalize bin counts so they sum to 1.0"
74
124
 
75
125
  # The accumulated values
76
126
  attr_accessor :values
@@ -148,7 +198,7 @@ module Wukong
148
198
  if normalize && total_count > 0
149
199
  bin << log_count_if_necessary((count.to_f / total_count.to_f))
150
200
  end
151
- yield bin.map { |n| format(n) }.join("\t")
201
+ yield bin.map { |n| format(n) }
152
202
  end
153
203
  end
154
204
 
@@ -169,7 +219,7 @@ module Wukong
169
219
  when format_string
170
220
  format_string % n
171
221
  when n == 0.0
172
- 0.0
222
+ '0.0'
173
223
  when n.abs > 1000 || n.abs < 0.001
174
224
  "%#{precision}.#{precision}E" % n
175
225
  else
@@ -17,6 +17,18 @@ module Wukong
17
17
  # 283
18
18
  class Count < Accumulator
19
19
 
20
+ description <<EOF
21
+ This processor counts the number of input records it receives.
22
+
23
+ $ wc -l input
24
+ 283 input
25
+ $ cat input | wu-local count
26
+ 283
27
+
28
+ This processor will not output any records until it receives its final
29
+ input record.
30
+ EOF
31
+
20
32
  # The total size of the input records.
21
33
  attr_accessor :size
22
34