wukong 3.0.0.pre3 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/Gemfile +1 -0
  2. data/README.md +689 -50
  3. data/bin/wu-local +1 -74
  4. data/diagrams/wu_local.dot +39 -0
  5. data/diagrams/wu_local.dot.png +0 -0
  6. data/examples/loadable.rb +2 -0
  7. data/examples/string_reverser.rb +7 -0
  8. data/lib/hanuman/stage.rb +2 -2
  9. data/lib/wukong.rb +21 -10
  10. data/lib/wukong/dataflow.rb +2 -5
  11. data/lib/wukong/doc_helpers.rb +14 -0
  12. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  13. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  14. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  15. data/lib/wukong/driver.rb +11 -1
  16. data/lib/wukong/local.rb +40 -0
  17. data/lib/wukong/local/event_machine_driver.rb +27 -0
  18. data/lib/wukong/local/runner.rb +98 -0
  19. data/lib/wukong/local/stdio_driver.rb +44 -0
  20. data/lib/wukong/local/tcp_driver.rb +47 -0
  21. data/lib/wukong/logger.rb +16 -7
  22. data/lib/wukong/plugin.rb +48 -0
  23. data/lib/wukong/processor.rb +57 -15
  24. data/lib/wukong/rake_helper.rb +6 -0
  25. data/lib/wukong/runner.rb +151 -128
  26. data/lib/wukong/runner/boot_sequence.rb +123 -0
  27. data/lib/wukong/runner/code_loader.rb +52 -0
  28. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  29. data/lib/wukong/runner/help_message.rb +42 -0
  30. data/lib/wukong/spec_helpers.rb +4 -12
  31. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  32. data/lib/wukong/spec_helpers/{integration_driver_matchers.rb → integration_tests/integration_test_matchers.rb} +28 -62
  33. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  34. data/lib/wukong/spec_helpers/shared_examples.rb +19 -10
  35. data/lib/wukong/spec_helpers/unit_tests.rb +134 -0
  36. data/lib/wukong/spec_helpers/{processor_methods.rb → unit_tests/unit_test_driver.rb} +42 -8
  37. data/lib/wukong/spec_helpers/{spec_driver_matchers.rb → unit_tests/unit_test_matchers.rb} +6 -32
  38. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +54 -0
  39. data/lib/wukong/version.rb +1 -1
  40. data/lib/wukong/widget/filters.rb +134 -8
  41. data/lib/wukong/widget/processors.rb +64 -5
  42. data/lib/wukong/widget/reducers/bin.rb +68 -18
  43. data/lib/wukong/widget/reducers/count.rb +12 -0
  44. data/lib/wukong/widget/reducers/group.rb +48 -5
  45. data/lib/wukong/widget/reducers/group_concat.rb +30 -2
  46. data/lib/wukong/widget/reducers/moments.rb +4 -4
  47. data/lib/wukong/widget/reducers/sort.rb +53 -3
  48. data/lib/wukong/widget/serializers.rb +37 -12
  49. data/lib/wukong/widget/utils.rb +1 -1
  50. data/spec/spec_helper.rb +20 -2
  51. data/spec/wukong/driver_spec.rb +2 -0
  52. data/spec/wukong/local/runner_spec.rb +40 -0
  53. data/spec/wukong/local_spec.rb +6 -0
  54. data/spec/wukong/logger_spec.rb +49 -0
  55. data/spec/wukong/processor_spec.rb +22 -0
  56. data/spec/wukong/runner_spec.rb +128 -8
  57. data/spec/wukong/widget/filters_spec.rb +28 -10
  58. data/spec/wukong/widget/processors_spec.rb +5 -5
  59. data/spec/wukong/widget/reducers/bin_spec.rb +14 -14
  60. data/spec/wukong/widget/reducers/count_spec.rb +1 -1
  61. data/spec/wukong/widget/reducers/group_spec.rb +7 -6
  62. data/spec/wukong/widget/reducers/moments_spec.rb +2 -2
  63. data/spec/wukong/widget/reducers/sort_spec.rb +1 -1
  64. data/spec/wukong/widget/serializers_spec.rb +84 -88
  65. data/spec/wukong/wu-local_spec.rb +109 -0
  66. metadata +43 -20
  67. data/bin/wu-server +0 -70
  68. data/lib/wukong/boot.rb +0 -96
  69. data/lib/wukong/configuration.rb +0 -8
  70. data/lib/wukong/emitter.rb +0 -22
  71. data/lib/wukong/server.rb +0 -119
  72. data/lib/wukong/spec_helpers/integration_driver.rb +0 -157
  73. data/lib/wukong/spec_helpers/processor_helpers.rb +0 -89
  74. data/lib/wukong/spec_helpers/spec_driver.rb +0 -28
  75. data/spec/wukong/local_runner_spec.rb +0 -31
  76. data/spec/wukong/wu_local_spec.rb +0 -125
@@ -21,7 +21,7 @@ module Wukong
21
21
  # banana
22
22
  # apple
23
23
  # ...
24
- # $ cat input | wu-local sort | wu-local group
24
+ # $ cat input | wu-local sort | wu-local group --to=tsv
25
25
  # apple 4
26
26
  # banana 2
27
27
  # cat 5
@@ -34,7 +34,7 @@ module Wukong
34
34
  # {"id": 2, "word": "cat" }
35
35
  # {"id": 3, "word": "banana"}
36
36
  # ...
37
- # $ cat input | wu-local sort --on==word | wu-local group --by=word
37
+ # $ cat input | wu-local sort --on=word | wu-local group --by=word --to=tsv
38
38
  # apple 4
39
39
  # banana 2
40
40
  # cat 5
@@ -46,14 +46,57 @@ module Wukong
46
46
  # @example Using a group at the end of a dataflow
47
47
  #
48
48
  # Wukong.dataflow(:makes_groups) do
49
- # ... | sort(on: 'field') | group(by: 'field')
49
+ # ... | sort(on: 'field') | group(by: 'field') | to_tsv
50
50
  # end
51
51
  #
52
52
  # @see Sort
53
53
  class Group < Count
54
54
 
55
+ description <<EOF
56
+ This processor groups consecutive input records that share the same
57
+ "group key". There are several ways to extract this group key from a
58
+ record.
59
+
60
+ NOTE: The input records must be previously sorted by the
61
+ same key used for grouping in order to ensure that groups are
62
+ not split up.
63
+
64
+ By default the input records themselves are used as their own group
65
+ keys, allowing you to count identical values, a la `uniq -c`:
66
+
67
+ $ cat input
68
+ apple
69
+ cat
70
+ banana
71
+ apple
72
+ ...
73
+
74
+ $ cat input | wu-local sort | wu-local group --to=tsv
75
+ apple 4
76
+ banana 2
77
+ cat 5
78
+ ...
79
+
80
+ You can also group by some part of an input record:
81
+
82
+ $ cat input
83
+ {"id": 1, "word": "apple" }
84
+ {"id": 2, "word": "cat" }
85
+ {"id": 3, "word": "banana"}
86
+ ...
87
+
88
+ $ cat input | wu-local sort --on=word | wu-local group --by=word --to=tsv
89
+ apple 4
90
+ banana 2
91
+ cat 5
92
+ ...
93
+
94
+ This processor will not produce any output for a given group until it
95
+ sees the last record of that group.
96
+ EOF
97
+
55
98
  include DynamicGet
56
- field :by, Whatever
99
+ field :by, Whatever, :doc => "Part of the record to group by"
57
100
 
58
101
  # Get the key which defines the group for this `record`.
59
102
  #
@@ -76,7 +119,7 @@ module Wukong
76
119
  # @yieldparam [Object] key the key defining the group
77
120
  # @yieldparam [Integer] size the size of the group
78
121
  def finalize
79
- yield [key, size].map(&:to_s).join("\t")
122
+ yield [key, size]
80
123
  end
81
124
 
82
125
  register
@@ -13,7 +13,7 @@ module Wukong
13
13
  # {"id": 2, "parent_id": 3}
14
14
  # {"id": 3, "parent_id": 3}
15
15
  # ...
16
- # $ cat input | wu-local group_concat --by=parent_id
16
+ # $ cat input | wu-local group_concat --by=parent_id --to=tsv
17
17
  # 4 1 {"id": 1, "parent_id": 4}
18
18
  # 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
19
19
  # ...
@@ -23,6 +23,34 @@ module Wukong
23
23
  # @see Group
24
24
  class GroupConcat < Group
25
25
 
26
+ description <<EOF
27
+ This processor concatenates records of a consecutive group of records
28
+ into a single record.
29
+
30
+ $ cat input
31
+ {"id": 1, "parent_id": 4}
32
+ {"id": 2, "parent_id": 3}
33
+ {"id": 3, "parent_id": 3}
34
+ ...
35
+
36
+ $ cat input | wu-local group_concat --by=parent_id --to=tsv
37
+ 4 1 {"id": 1, "parent_id": 4}
38
+ 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
39
+ ...
40
+
41
+ Each output record consists of tab-separated fields in the following
42
+ order:
43
+
44
+ 1) The key defining the group of input records in this output record
45
+ 2) The number of input records in the group
46
+ 3) Each input record in the group
47
+ ...
48
+
49
+ This processor will not produce any output for a given group until it
50
+ sees the last record of that group. See the documentation for the
51
+ 'group' processor for more information.
52
+ EOF
53
+
26
54
  # The members of the current group.
27
55
  attr_accessor :members
28
56
 
@@ -58,7 +86,7 @@ module Wukong
58
86
  def finalize
59
87
  group = [key, size]
60
88
  group.concat(members)
61
- yield group.map(&:to_s).join("\t")
89
+ yield group
62
90
  end
63
91
 
64
92
  register
@@ -4,12 +4,12 @@ module Wukong
4
4
  class Processor
5
5
  class Moments < Group
6
6
 
7
- field :group_by, Whatever
7
+ field :group_by, Whatever, :doc => "Part of the record to group by"
8
8
 
9
9
  attr_accessor :measurements
10
10
 
11
- field :of, Array, :default => []
12
- field :std_dev, :boolean, :default => true
11
+ field :of, Array, :default => [], :doc => "Parts of the record to measure moments of"
12
+ field :no_std_dev, :boolean, :doc => "Don't compute standard deviations"
13
13
 
14
14
  def get_key record
15
15
  super(record) unless (self.group_by || self.by)
@@ -52,7 +52,7 @@ module Wukong
52
52
 
53
53
  mean = values.inject(0.0) { |sum, value| sum += value } / count
54
54
  r[property][:mean] = mean
55
- if std_dev
55
+ unless no_std_dev
56
56
  variance = values.inject(0.0) { |sum, value| diff = (value - mean) ; sum += diff * diff } / count
57
57
  std = Math.sqrt(variance)
58
58
  r[property][:std_dev] = std
@@ -62,11 +62,61 @@ module Wukong
62
62
  # group
63
63
  # end
64
64
  class Sort < Accumulator
65
+
66
+ description <<EOF
67
+ This processor sorts input records alphabetically or numerically based
68
+ on their value or the value of one of their parts.
69
+
70
+ NOTE: For many use cases you're better off using native tools like
71
+ `/bin/sort` because they are faster and already do what you
72
+ need.
73
+
74
+ You can sort simple inputs
75
+
76
+ $ cat input
77
+ 1 apple
78
+ 2 banana
79
+ 3 cat
80
+ 4 banana
81
+ ...
82
+
83
+ $ cat input | wu-local sort --on=2
84
+ 1 apple
85
+ 2 banana
86
+ 4 banana
87
+ 3 cat
88
+ ...
89
+
90
+ as well as complicated ones
91
+
92
+ $ cat input
93
+ {"id": 1, "word": "apple" }
94
+ {"id": 2, "word": "cat" }
95
+ {"id": 3, "word": "banana"}
96
+ ...
97
+
98
+ $ cat input | wu-local sort --on=word
99
+ {"id": 1, "word": "apple" }
100
+ {"id": 3, "word": "banana"}
101
+ {"id": 2, "word": "cat" }
102
+ ...
103
+
104
+ You can also sort in --reverse or using --numeric order instead of
105
+ lexical.
106
+
107
+ The sort widget is useful for modeling Hadoop jobs, but don't
108
+ forget that [Hadoop does its own
109
+ sorting](http://hadoop.apache.org/docs/r0.20.2/mapred_tutorial.html#Sort),
110
+ so the sort widget doesn't belong in your map/reduce jobs.
111
+
112
+ This processor will not produce any output till it has received all
113
+ its input records.
114
+ EOF
65
115
 
66
116
  include DynamicGet
67
- field :on, Whatever
68
- field :reverse, :boolean, :default => false
69
- field :numeric, :boolean, :default => false
117
+ field :on, Whatever, :doc => "Part of the record to sort on"
118
+ field :reverse, :boolean, :default => false, :doc => "Sort in reverse order"
119
+ field :numeric, :boolean, :default => false, :doc => "Sort numerically instead of lexically"
70
120
 
71
121
  # Initializes the array of records that will hold all the values.
72
122
  def setup
@@ -1,14 +1,14 @@
1
1
  module Wukong
2
2
  class Processor
3
3
 
4
- SerializerError = Class.new(StandardError)
4
+ SerializerError = Class.new(Error)
5
5
 
6
6
  class Serializer < Processor
7
- field :on_error, String, default: 'log'
7
+ field :on_error, String, default: 'log', :doc => "Action to take upon an error, either 'log' or 'notify'"
8
8
 
9
9
  def handle_error(record, err)
10
10
  case on_error
11
- when 'log' then log.warn "Bad record: #{record}"
11
+ when 'log' then log.warn "#{err.class}: #{err.message}"
12
12
  when 'notify' then notify('error', record: record, error: err)
13
13
  end
14
14
  end
@@ -25,7 +25,14 @@ module Wukong
25
25
  #
26
26
  # @see FromJson
27
27
  class ToJson < Serializer
28
- field :pretty, :boolean, default: false
28
+
29
+ description <<EOF
30
+ Turns input records into JSON strings.
31
+
32
+ Pretty print input with the --pretty flag.
33
+ EOF
34
+
35
+ field :pretty, :boolean, default: false, :doc => "Pretty-print output"
29
36
 
30
37
  # Yields the input `record` serialized as JSON.
31
38
  #
@@ -56,6 +63,16 @@ module Wukong
56
63
  #
57
64
  # @see ToJson
58
65
  class FromJson < Serializer
66
+
67
+ description <<EOF
68
+ Parse JSON input records into native Ruby objects.
69
+
70
+ $ cat input.json
71
+ {"hi": "there"}
72
+ $ cat input.json | wu-local from_json
73
+ {"hi"=>"there"}
74
+ EOF
75
+
59
76
  # Yields the input `record` deserialized from JSON.
60
77
  #
61
78
  # @param [String] json
@@ -196,7 +213,7 @@ module Wukong
196
213
  #
197
214
  # @see FromDelimited
198
215
  class ToDelimited < Serializer
199
- field :delimiter, String, :default => "\t"
216
+ field :delimiter, String, :default => "\t", :doc => "Delimiter to use between fields in a record"
200
217
  # Yields the input `record` serialized in a delimited format.
201
218
  #
202
219
  # @param [Object] record
@@ -224,7 +241,7 @@ module Wukong
224
241
  #
225
242
  # @see ToDelimited
226
243
  class FromDelimited < Serializer
227
- field :delimiter, String, :default => "\t"
244
+ field :delimiter, String, :default => "\t", :doc => "Delimiter to use between fields in a record"
228
245
  # Yields the input `record` deserialized from a delimited format.
229
246
  #
230
247
  # @param [String] delimited
@@ -260,18 +277,26 @@ module Wukong
260
277
  end
261
278
  register
262
279
  end
263
-
280
+
281
+ # A widget for turning a record into an instance of some class.
282
+ # The class must provide a "class method" `receive` which accepts
283
+ # a Hash argument.
264
284
  class Recordize < Serializer
265
- field :model, Whatever
285
+ field :model, Whatever, :doc => "Model class to turn records into"
266
286
 
287
+ # Turn the given `record` into an instance of the class named
288
+ # with the `model` field.
289
+ #
290
+ # @param [Hash, #to_wire] record
291
+ # @return [Object]
267
292
  def process(record)
268
293
  wire_format = record.try(:to_wire) || record
269
- raise SerializerError.new("Record must be in hash format to be recordized") unless wire_format.is_a?(Hash)
294
+ raise SerializerError.new("Can only recordize a Hash-like record") unless wire_format.is_a?(Hash)
270
295
  yield model.receive(wire_format)
271
- rescue => e
272
- handle_error(record, e)
296
+ rescue => e
297
+ handle_error(record, e)
273
298
  end
274
299
  register
275
- end
300
+ end
276
301
  end
277
302
  end
@@ -7,7 +7,7 @@ module Wukong
7
7
 
8
8
  # :nodoc:
9
9
  def self.included klass
10
- klass.send(:field, :separator, String, :default => "\t")
10
+ klass.send(:field, :separator, String, :default => "\t", :doc => "The default separator between fields on a single line")
11
11
  end
12
12
 
13
13
  # :nodoc:
@@ -2,12 +2,30 @@ require 'wukong'
2
2
  require 'wukong/spec_helpers'
3
3
  require_relative './support/shared_examples_for_shortcuts'
4
4
  require_relative './support/shared_examples_for_builders'
5
- require_relative './support/integration_helper'
6
5
  require_relative './support/shared_context_for_reducers'
7
6
 
8
7
  RSpec.configure do |config|
8
+
9
9
  config.mock_with :rspec
10
+
10
11
  include Wukong::SpecHelpers
11
- include Wukong::Local::IntegrationHelper
12
+ def root
13
+ @root ||= Pathname.new(File.expand_path('../..', __FILE__))
14
+ end
15
+
16
+ def local_runner *args
17
+ runner(Wukong::Local::LocalRunner, 'wu-local', *args)
18
+ end
19
+
20
+ def generic_runner *args
21
+ runner(Wukong::Runner, 'wu-generic', *args)
22
+ end
23
+
24
+ def wu_local *args
25
+ command('wu-local', *args)
26
+ end
27
+
28
+ # FIXME Why is this here?
12
29
  config.treat_symbols_as_metadata_keys_with_true_values = true
30
+
13
31
  end
@@ -0,0 +1,2 @@
1
+ require 'spec_helper'
2
+
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Local::LocalRunner do
4
+ before { EM.stub!(:run) }
5
+
6
+ describe "choosing a processor name" do
7
+
8
+ it "raises an error without any arguments" do
9
+ expect { local_runner() }.to raise_error(Wukong::Error, /must provide.*processor.*run.*argument/i)
10
+ end
11
+
12
+ it "raises an error when passed the name of a processor that isn't registered" do
13
+ expect { local_runner('some_proc_that_dont_exit') }.to raise_error(Wukong::Error, /no such processor.*some_proc.*/i)
14
+ end
15
+
16
+ it "accepts an explicit --run argument" do
17
+ local_runner('--run=identity').processor.should == 'identity'
18
+ end
19
+
20
+ it "accepts a registered processor name from the first argument" do
21
+ local_runner('identity').processor.should == 'identity'
22
+ end
23
+
24
+ it "accepts a registered processor name from the basename of the first file argument" do
25
+ local_runner(examples_dir('string_reverser.rb')).processor.should == 'string_reverser'
26
+ end
27
+ end
28
+
29
+ describe "uses a" do
30
+ it "StdioDriver by default" do
31
+ local_runner('identity').driver.should == Wukong::Local::StdioDriver
32
+ end
33
+
34
+ it "TCPDriver when given a --tcp_port argument" do
35
+ local_runner('identity','--tcp_port=6000').driver.should == Wukong::Local::TCPDriver
36
+ end
37
+
38
+ end
39
+
40
+ end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Local do
4
+ it_behaves_like 'a plugin'
5
+ end
6
+
@@ -0,0 +1,49 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Logging do
4
+
5
+ let(:loggable) { Class.new.class_eval { include Wukong::Logging } }
6
+ let(:model) { Class.new.class_eval { include Gorillib::Model } }
7
+
8
+ describe "a class including Wukong::Logging" do
9
+ subject { loggable }
10
+
11
+ it { should respond_to(:log) }
12
+
13
+ describe "has instances that" do
14
+ let(:loggable_instance) { loggable.new }
15
+ subject { loggable_instance }
16
+
17
+ it { should respond_to(:log) }
18
+ describe "have an instance method #log that" do
19
+ let(:log) { loggable_instance.log }
20
+ subject { log }
21
+
22
+ it { should respond_to(:debug) }
23
+ it { should respond_to(:info) }
24
+ it { should respond_to(:warn) }
25
+ end
26
+ end
27
+
28
+ describe "has subclasses" do
29
+ let(:child) { Class.new(loggable) }
30
+
31
+ describe "with instances that" do
32
+ subject { child.new }
33
+
34
+ it { should respond_to(:log) }
35
+ end
36
+ end
37
+ end
38
+
39
+ describe "a class including Gorillib::Model and then Wukong::Logging" do
40
+ let(:loggable_model) { model.class_eval { include Wukong::Logging } }
41
+ subject { loggable_model }
42
+
43
+ describe "has fields that" do
44
+ subject { loggable_model.fields }
45
+ it { should include(:log) }
46
+ end
47
+ end
48
+ end
49
+