wukong 3.0.0.pre3 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/Gemfile +1 -0
  2. data/README.md +689 -50
  3. data/bin/wu-local +1 -74
  4. data/diagrams/wu_local.dot +39 -0
  5. data/diagrams/wu_local.dot.png +0 -0
  6. data/examples/loadable.rb +2 -0
  7. data/examples/string_reverser.rb +7 -0
  8. data/lib/hanuman/stage.rb +2 -2
  9. data/lib/wukong.rb +21 -10
  10. data/lib/wukong/dataflow.rb +2 -5
  11. data/lib/wukong/doc_helpers.rb +14 -0
  12. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  13. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  14. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  15. data/lib/wukong/driver.rb +11 -1
  16. data/lib/wukong/local.rb +40 -0
  17. data/lib/wukong/local/event_machine_driver.rb +27 -0
  18. data/lib/wukong/local/runner.rb +98 -0
  19. data/lib/wukong/local/stdio_driver.rb +44 -0
  20. data/lib/wukong/local/tcp_driver.rb +47 -0
  21. data/lib/wukong/logger.rb +16 -7
  22. data/lib/wukong/plugin.rb +48 -0
  23. data/lib/wukong/processor.rb +57 -15
  24. data/lib/wukong/rake_helper.rb +6 -0
  25. data/lib/wukong/runner.rb +151 -128
  26. data/lib/wukong/runner/boot_sequence.rb +123 -0
  27. data/lib/wukong/runner/code_loader.rb +52 -0
  28. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  29. data/lib/wukong/runner/help_message.rb +42 -0
  30. data/lib/wukong/spec_helpers.rb +4 -12
  31. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  32. data/lib/wukong/spec_helpers/{integration_driver_matchers.rb → integration_tests/integration_test_matchers.rb} +28 -62
  33. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  34. data/lib/wukong/spec_helpers/shared_examples.rb +19 -10
  35. data/lib/wukong/spec_helpers/unit_tests.rb +134 -0
  36. data/lib/wukong/spec_helpers/{processor_methods.rb → unit_tests/unit_test_driver.rb} +42 -8
  37. data/lib/wukong/spec_helpers/{spec_driver_matchers.rb → unit_tests/unit_test_matchers.rb} +6 -32
  38. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +54 -0
  39. data/lib/wukong/version.rb +1 -1
  40. data/lib/wukong/widget/filters.rb +134 -8
  41. data/lib/wukong/widget/processors.rb +64 -5
  42. data/lib/wukong/widget/reducers/bin.rb +68 -18
  43. data/lib/wukong/widget/reducers/count.rb +12 -0
  44. data/lib/wukong/widget/reducers/group.rb +48 -5
  45. data/lib/wukong/widget/reducers/group_concat.rb +30 -2
  46. data/lib/wukong/widget/reducers/moments.rb +4 -4
  47. data/lib/wukong/widget/reducers/sort.rb +53 -3
  48. data/lib/wukong/widget/serializers.rb +37 -12
  49. data/lib/wukong/widget/utils.rb +1 -1
  50. data/spec/spec_helper.rb +20 -2
  51. data/spec/wukong/driver_spec.rb +2 -0
  52. data/spec/wukong/local/runner_spec.rb +40 -0
  53. data/spec/wukong/local_spec.rb +6 -0
  54. data/spec/wukong/logger_spec.rb +49 -0
  55. data/spec/wukong/processor_spec.rb +22 -0
  56. data/spec/wukong/runner_spec.rb +128 -8
  57. data/spec/wukong/widget/filters_spec.rb +28 -10
  58. data/spec/wukong/widget/processors_spec.rb +5 -5
  59. data/spec/wukong/widget/reducers/bin_spec.rb +14 -14
  60. data/spec/wukong/widget/reducers/count_spec.rb +1 -1
  61. data/spec/wukong/widget/reducers/group_spec.rb +7 -6
  62. data/spec/wukong/widget/reducers/moments_spec.rb +2 -2
  63. data/spec/wukong/widget/reducers/sort_spec.rb +1 -1
  64. data/spec/wukong/widget/serializers_spec.rb +84 -88
  65. data/spec/wukong/wu-local_spec.rb +109 -0
  66. metadata +43 -20
  67. data/bin/wu-server +0 -70
  68. data/lib/wukong/boot.rb +0 -96
  69. data/lib/wukong/configuration.rb +0 -8
  70. data/lib/wukong/emitter.rb +0 -22
  71. data/lib/wukong/server.rb +0 -119
  72. data/lib/wukong/spec_helpers/integration_driver.rb +0 -157
  73. data/lib/wukong/spec_helpers/processor_helpers.rb +0 -89
  74. data/lib/wukong/spec_helpers/spec_driver.rb +0 -28
  75. data/spec/wukong/local_runner_spec.rb +0 -31
  76. data/spec/wukong/wu_local_spec.rb +0 -125
@@ -1,4 +1,4 @@
1
1
  module Wukong
2
2
  # The current version of Wukong.
3
- VERSION = '3.0.0.pre3'
3
+ VERSION = '3.0.0'
4
4
  end
@@ -5,6 +5,10 @@ module Wukong
5
5
  # criterion.
6
6
  class Filter < Processor
7
7
 
8
+ description <<EOF
9
+ A processor which filters input records according to some criterion.
10
+ EOF
11
+
8
12
  # Process a `record` by yielding it only if it should be
9
13
  # selected by this filter.
10
14
  #
@@ -61,6 +65,7 @@ module Wukong
61
65
  # @see Filter
62
66
  # @see Null
63
67
  class Identity < Filter
68
+ description "This processor passes all records unmodified."
64
69
  register
65
70
  end
66
71
 
@@ -84,6 +89,9 @@ module Wukong
84
89
  # @see Filter
85
90
  # @see All
86
91
  class Null < Filter
92
+
93
+ description "This processor acts as a filter which passes no records at all."
94
+
87
95
  # Prevents any records from passing because it always returns
88
96
  # `false`.
89
97
  #
@@ -117,8 +125,21 @@ module Wukong
117
125
  # @see NotRegexpFilter
118
126
  class RegexpFilter < Filter
119
127
 
120
- # The regular expression to use to match records.
121
- field :match, Regexp
128
+ description <<EOF
129
+ This processor only passes records which match against a given regular
130
+ expression.
131
+
132
+ $ cat input
133
+ apple
134
+ banana
135
+ cat
136
+ $ cat input | wu-local regexp --match='^a'
137
+ apple
138
+
139
+ If no --match argument is given, all records will be passed.
140
+ EOF
141
+
142
+ field :match, Regexp, :doc => "Regular expression to match against"
122
143
 
123
144
  # Selects a `record` only if it matches this widget's `match`
124
145
  # field.
@@ -154,6 +175,22 @@ module Wukong
154
175
  # @see Filter
155
176
  # @see NotRegexpFilter
156
177
  class NotRegexpFilter < RegexpFilter
178
+
179
+ description <<EOF
180
+ This processor only passes records which fail to match against a given
181
+ regular expression.
182
+
183
+ $ cat input
184
+ apple
185
+ banana
186
+ cat
187
+ $ cat input | wu-local not_regexp --match='^a'
188
+ banana
189
+ cat
190
+
191
+ If no --match argument is given, all records will be passed.
192
+ EOF
193
+
157
194
  # Select a `record` only if it <b>doesn't</b> match this
158
195
  # widget's `match` field.
159
196
  #
@@ -189,8 +226,22 @@ module Wukong
189
226
  # @see Filter
190
227
  class Limit < Filter
191
228
 
192
- # The maximum number of records to let pass.
193
- field :max, Integer, :default => Float::INFINITY
229
+ description <<EOF
230
+ This processor passes a certain number of records and then stops
231
+ passing any, acting as a limit.
232
+
233
+ $ cat input
234
+ 1
235
+ 2
236
+ 3
237
+ $ cat input | wu-local limit --max=2
238
+ 1
239
+ 2
240
+
241
+ If no --max argument is given, all records will be passed.
242
+ EOF
243
+
244
+ field :max, Integer, :doc => "Maximum number of records to let pass"
194
245
 
195
246
  # The current record count.
196
247
  attr_accessor :count
@@ -206,7 +257,7 @@ module Wukong
206
257
  # @param [Object] record
207
258
  # @return [true, false]
208
259
  def select?(record)
209
- keep = @count < max
260
+ keep = (max ? @count < max : true)
210
261
  @count += 1
211
262
  keep
212
263
  end
@@ -236,9 +287,23 @@ module Wukong
236
287
  # @see Limit
237
288
  class Sample < Filter
238
289
 
239
- # The fraction of records to let pass. Must be between 0.0 and
240
- # 10.0
241
- field :fraction, Float, :default => 1.0
290
+ description <<EOF
291
+ This processor will pass input records with a certain frequency,
292
+ acting as a random sampler.
293
+
294
+ $ cat input
295
+ 1
296
+ 2
297
+ 3
298
+ 4
299
+ $ cat input | wu-local sample --fraction=0.5
300
+ 1
301
+ 4
302
+
303
+ If no --fraction is given, all records will be passed.
304
+ EOF
305
+
306
+ field :fraction, Float, :default => 1.0, :doc => "Fraction of records to let pass. Must be between 0 and 1.0"
242
307
 
243
308
  # Selects a `record` randomly, with a probability given by the
244
309
  # `fraction` for this widget.
@@ -307,5 +372,66 @@ module Wukong
307
372
  register
308
373
  end
309
374
 
375
+ # Emit only the first `n` records.
376
+ #
377
+ # @see Filter
378
+ class Head < Filter
379
+
380
+ field :n, Integer, :default => 10, :doc => "Number of records to let pass"
381
+
382
+ # The current record count.
383
+ attr_accessor :count
384
+
385
+ # Initializes the record count to zero.
386
+ def setup
387
+ self.count = 0
388
+ end
389
+
390
+ # Select a record only if we're below the maximum number of
391
+ # records.
392
+ #
393
+ # @param [Object] record
394
+ # @return [true, false]
395
+ def select?(record)
396
+ keep = @count < n
397
+ @count += 1
398
+ keep
399
+ end
400
+ register
401
+ end
402
+
403
+ # Skip the first `n` records.
404
+ #
405
+ # Works slightly differently than the UNIX `tail` command which
406
+ # prints the last `n` records. This notion is less useful in a
407
+ # streaming context, so think of this filter as the equivalent of
408
+ # `tail -n+`.
409
+ #
410
+ # @see Filter
411
+ class Tail < Filter
412
+
413
+ field :n, Integer, :default => 0, :doc => "Number of records to skip before letting records pass"
414
+
415
+ # The current record count
416
+ attr_accessor :count
417
+
418
+ # Initializes the record count to zero.
419
+ def setup
420
+ self.count = 0
421
+ end
422
+
423
+ # Select a record only if we've already skipped the first `n`
424
+ # records.
425
+ #
426
+ # @param [Object] record
427
+ # @return [true, false]
428
+ def select?(record)
429
+ keep = (@count >= n)
430
+ @count += 1
431
+ keep
432
+ end
433
+ register
434
+ end
435
+
310
436
  end
311
437
  end
@@ -22,8 +22,29 @@ module Wukong
22
22
  # ... | logger
23
23
  # end
24
24
  class Logger < Processor
25
- # The level to use for logging.
26
- field :level, Symbol, :default => :info
25
+ field :level, Symbol, :default => :info, :doc => "Log level priority"
26
+
27
+ description <<EOF
28
+ This processor passes all input records unmodified, making a log
29
+ statement on each one.
30
+
31
+ $ cat input
32
+ 1
33
+ 2
34
+ 3
35
+ $ cat input | wu-local logger
36
+ INFO 2013-01-04 17:10:59 [Logger ] -- 1
37
+ INFO 2013-01-04 17:10:59 [Logger ] -- 2
38
+ INFO 2013-01-04 17:10:59 [Logger ] -- 3
39
+
40
+ You can set the priority level of the log messages with the --level
41
+ flag.
42
+
43
+ $ cat input | wu-local logger --level=debug
44
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
45
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
46
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
47
+ EOF
27
48
 
28
49
  # Process a given `record` by logging it.
29
50
  #
@@ -100,8 +121,44 @@ module Wukong
100
121
  class Extract < Processor
101
122
  include DynamicGet
102
123
 
103
- # The part to extract.
104
- field :part, Whatever, :default => nil
124
+ description <<EOF
125
+ This processor will pass extracted parts of input records.
126
+
127
+ It can be used to extract a field from a delimited input
128
+
129
+ $ cat input
130
+ snap crackle pop
131
+ a b c
132
+ $ cat input | wu-local extract --part=2
133
+ crackle
134
+ b
135
+
136
+ The default separator is a tab character but you can specify this as
137
+ well
138
+
139
+ $ cat input
140
+ snap,crackle,pop
141
+ a,b,c
142
+ $ cat input | wu-local extract --part=2 --separator=,
143
+ crackle
144
+ b
145
+
146
+ It can also be used on JSON records, even those with nested fields
147
+
148
+ $ cat input
149
+ {"id": 1, {"data": {"text": "hi there"}}
150
+ {"id": 2, {"data": {"text": "goodbye"}}
151
+ $ cat input | wu-local extract --part=id
152
+ 1
153
+ 2
154
+ $ cat input | wu-local extract --part=data.text
155
+ hi there
156
+ goodbye
157
+
158
+ If no --part argument is given, the original record will be yielded.
159
+ EOF
160
+
161
+ field :part, Whatever, :default => nil, :doc => "Part of the record to extract"
105
162
 
106
163
  # Extract a `part` of a `record`.
107
164
  #
@@ -115,7 +172,9 @@ module Wukong
115
172
  end
116
173
 
117
174
  class Topic < Processor
118
- field :topic, Symbol
175
+
176
+ field :topic, Symbol, :doc => "Topic to label the record with"
177
+
119
178
  def process(record)
120
179
  yield perform_action(record)
121
180
  end
@@ -12,7 +12,7 @@ module Wukong
12
12
  # 0.03480
13
13
  # 0.74418
14
14
  # ...
15
- # $ cat input | wu-local bin
15
+ # $ cat input | wu-local bin --to=tsv
16
16
  #
17
17
  # 0.02935 0.12638500000000003 7
18
18
  # 0.12638500000000003 0.22342000000000004 11
@@ -20,7 +20,7 @@ module Wukong
20
20
  #
21
21
  # @example Control how the bins are defined and displayed
22
22
  #
23
- # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1
23
+ # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --to=tsv
24
24
  # 0.0 0.1 10.0
25
25
  # 0.1 0.2 12.0
26
26
  # 0.2 0.3 8.0
@@ -28,7 +28,7 @@ module Wukong
28
28
  #
29
29
  # @example Include an additional column of normalized (fractional) counts
30
30
  #
31
- # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize
31
+ # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize --to=tsv
32
32
  # 0.0 0.1 10.0 0.3
33
33
  # 0.1 0.2 12.0 0.36
34
34
  # 0.2 0.3 8.0 0.24
@@ -36,7 +36,7 @@ module Wukong
36
36
  #
37
37
  # @example Make a log-log histogram
38
38
  #
39
- # $ cat input | wu-local bin --log_bins --log_counts
39
+ # $ cat input | wu-local bin --log_bins --log_counts --to=tsv
40
40
  # 1.000 3.162 1.099
41
41
  # 3.162 10.000 1.946
42
42
  # 10.000 31.623 3.045
@@ -48,29 +48,79 @@ module Wukong
48
48
  # @example Use the bin at the end of a dataflow
49
49
  #
50
50
  # Wukong.processor(:bins_at_end) do
51
- # ... | extract(part: 'age') | bin(num_bins: 10)
51
+ # ... | extract(part: 'age') | bin(num_bins: 10) | to_tsv
52
52
  # end
53
53
  #
54
54
  # @see Accumulator
55
55
  # @see Extract
56
56
  class Bin < Accumulator
57
+
58
+ description <<EOF
59
+ This processor can be used to create a set of bins defining the
60
+ frequency distribution of the input records (or some part of each
61
+ input record).
62
+
63
+ Here's a simple example:
64
+
65
+ $ cat input.dat
66
+ 1
67
+ 2
68
+ 3
69
+ ...
70
+ 100
71
+
72
+ $ cat input.dat | wu-local bin --to=tsv
73
+ 1.000 10.900 10.000
74
+ 10.900 20.800 10.000
75
+ 20.800 30.700 10.000
76
+ 30.700 40.600 10.000
77
+ ...
78
+ 90.100 100.000 10.000
79
+
80
+ By default, all the input values are included and the number of bins
81
+ used corresponds to the square root of the number of input values.
82
+ You can customize the domain for the distribution, the number of bins,
83
+ or the explicit bin edges themselves, via the --min, --max,
84
+ --num_bins, and --edges flags.
85
+
86
+ You can control the display of numbers with the --format_string and
87
+ --precision options.
88
+
89
+ $ cat input.dat | wu-local bin --num_bins=4 --min=0 --max=100 --precision=0 --to=tsv
90
+ 0.0 25 24
91
+ 25 50 25
92
+ 50 75 25
93
+ 75 100 26
94
+
95
+ You can use the --log_bins, --log_counts, and --base options to use
96
+ logarithmically spaced bins or logarithmic counts within each bin to
97
+ the given base.
98
+
99
+ You can also normalize the distribution using the --normalize option.
100
+
101
+ $ cat input.dat | wu-local bin --num_bins=4 --log_bins --normalize --to=tsv
102
+ 1.000 3.162 3.000 0.030
103
+ 3.162 10.000 7.000 0.070
104
+ 10.000 31.623 21.000 0.210
105
+ 31.623 100.000 69.000 0.690
106
+ EOF
57
107
 
58
- field :num_bins, Integer
59
- field :edges, Array
60
- field :min, Float
61
- field :max, Float
108
+ field :num_bins, Integer, :doc => "Number of bins to use"
109
+ field :edges, Array, :doc => "Number of edges to use"
110
+ field :min, Float, :doc => "Smallest bin starting point"
111
+ field :max, Float, :doc => "Largest bin ending point"
62
112
 
63
- field :format_string, String
64
- field :precision, Integer, :default => 3
113
+ field :format_string, String, :doc => "Format string used when printing numerical values"
114
+ field :precision, Integer, :doc => "Precision used when printing numerical values", :default => 3
65
115
 
66
116
  include DynamicGet
67
- field :by, Whatever
117
+ field :by, Whatever, :doc => "Bin the values extracted by this label"
68
118
 
69
- field :log_bins, :boolean, :default => false
70
- field :log_counts, :boolean, :default => false
71
- field :base, Float, :default => Math::E
119
+ field :log_bins, :boolean, :default => false, :doc => "Use logarithmically spaced bins"
120
+ field :log_counts, :boolean, :default => false, :doc => "Use logarithmic bin counts"
121
+ field :base, Float, :default => Math::E, :doc => "Base for logarithms"
72
122
 
73
- field :normalize, :boolean, :default => false
123
+ field :normalize, :boolean, :default => false, :doc => "Normalize bin counts so they sum to 1.0"
74
124
 
75
125
  # The accumulated values
76
126
  attr_accessor :values
@@ -148,7 +198,7 @@ module Wukong
148
198
  if normalize && total_count > 0
149
199
  bin << log_count_if_necessary((count.to_f / total_count.to_f))
150
200
  end
151
- yield bin.map { |n| format(n) }.join("\t")
201
+ yield bin.map { |n| format(n) }
152
202
  end
153
203
  end
154
204
 
@@ -169,7 +219,7 @@ module Wukong
169
219
  when format_string
170
220
  format_string % n
171
221
  when n == 0.0
172
- 0.0
222
+ '0.0'
173
223
  when n.abs > 1000 || n.abs < 0.001
174
224
  "%#{precision}.#{precision}E" % n
175
225
  else
@@ -17,6 +17,18 @@ module Wukong
17
17
  # 283
18
18
  class Count < Accumulator
19
19
 
20
+ description <<EOF
21
+ This processor counts the number of input records it receives.
22
+
23
+ $ wc -l input
24
+ 283 input
25
+ $ cat input | wu-local count
26
+ 283
27
+
28
+ This processor will not output any records until it receives its final
29
+ input record.
30
+ EOF
31
+
20
32
  # The total size of the input records.
21
33
  attr_accessor :size
22
34