wukong 3.0.0.pre3 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/Gemfile +1 -0
  2. data/README.md +689 -50
  3. data/bin/wu-local +1 -74
  4. data/diagrams/wu_local.dot +39 -0
  5. data/diagrams/wu_local.dot.png +0 -0
  6. data/examples/loadable.rb +2 -0
  7. data/examples/string_reverser.rb +7 -0
  8. data/lib/hanuman/stage.rb +2 -2
  9. data/lib/wukong.rb +21 -10
  10. data/lib/wukong/dataflow.rb +2 -5
  11. data/lib/wukong/doc_helpers.rb +14 -0
  12. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  13. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  14. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  15. data/lib/wukong/driver.rb +11 -1
  16. data/lib/wukong/local.rb +40 -0
  17. data/lib/wukong/local/event_machine_driver.rb +27 -0
  18. data/lib/wukong/local/runner.rb +98 -0
  19. data/lib/wukong/local/stdio_driver.rb +44 -0
  20. data/lib/wukong/local/tcp_driver.rb +47 -0
  21. data/lib/wukong/logger.rb +16 -7
  22. data/lib/wukong/plugin.rb +48 -0
  23. data/lib/wukong/processor.rb +57 -15
  24. data/lib/wukong/rake_helper.rb +6 -0
  25. data/lib/wukong/runner.rb +151 -128
  26. data/lib/wukong/runner/boot_sequence.rb +123 -0
  27. data/lib/wukong/runner/code_loader.rb +52 -0
  28. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  29. data/lib/wukong/runner/help_message.rb +42 -0
  30. data/lib/wukong/spec_helpers.rb +4 -12
  31. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  32. data/lib/wukong/spec_helpers/{integration_driver_matchers.rb → integration_tests/integration_test_matchers.rb} +28 -62
  33. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  34. data/lib/wukong/spec_helpers/shared_examples.rb +19 -10
  35. data/lib/wukong/spec_helpers/unit_tests.rb +134 -0
  36. data/lib/wukong/spec_helpers/{processor_methods.rb → unit_tests/unit_test_driver.rb} +42 -8
  37. data/lib/wukong/spec_helpers/{spec_driver_matchers.rb → unit_tests/unit_test_matchers.rb} +6 -32
  38. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +54 -0
  39. data/lib/wukong/version.rb +1 -1
  40. data/lib/wukong/widget/filters.rb +134 -8
  41. data/lib/wukong/widget/processors.rb +64 -5
  42. data/lib/wukong/widget/reducers/bin.rb +68 -18
  43. data/lib/wukong/widget/reducers/count.rb +12 -0
  44. data/lib/wukong/widget/reducers/group.rb +48 -5
  45. data/lib/wukong/widget/reducers/group_concat.rb +30 -2
  46. data/lib/wukong/widget/reducers/moments.rb +4 -4
  47. data/lib/wukong/widget/reducers/sort.rb +53 -3
  48. data/lib/wukong/widget/serializers.rb +37 -12
  49. data/lib/wukong/widget/utils.rb +1 -1
  50. data/spec/spec_helper.rb +20 -2
  51. data/spec/wukong/driver_spec.rb +2 -0
  52. data/spec/wukong/local/runner_spec.rb +40 -0
  53. data/spec/wukong/local_spec.rb +6 -0
  54. data/spec/wukong/logger_spec.rb +49 -0
  55. data/spec/wukong/processor_spec.rb +22 -0
  56. data/spec/wukong/runner_spec.rb +128 -8
  57. data/spec/wukong/widget/filters_spec.rb +28 -10
  58. data/spec/wukong/widget/processors_spec.rb +5 -5
  59. data/spec/wukong/widget/reducers/bin_spec.rb +14 -14
  60. data/spec/wukong/widget/reducers/count_spec.rb +1 -1
  61. data/spec/wukong/widget/reducers/group_spec.rb +7 -6
  62. data/spec/wukong/widget/reducers/moments_spec.rb +2 -2
  63. data/spec/wukong/widget/reducers/sort_spec.rb +1 -1
  64. data/spec/wukong/widget/serializers_spec.rb +84 -88
  65. data/spec/wukong/wu-local_spec.rb +109 -0
  66. metadata +43 -20
  67. data/bin/wu-server +0 -70
  68. data/lib/wukong/boot.rb +0 -96
  69. data/lib/wukong/configuration.rb +0 -8
  70. data/lib/wukong/emitter.rb +0 -22
  71. data/lib/wukong/server.rb +0 -119
  72. data/lib/wukong/spec_helpers/integration_driver.rb +0 -157
  73. data/lib/wukong/spec_helpers/processor_helpers.rb +0 -89
  74. data/lib/wukong/spec_helpers/spec_driver.rb +0 -28
  75. data/spec/wukong/local_runner_spec.rb +0 -31
  76. data/spec/wukong/wu_local_spec.rb +0 -125
@@ -21,7 +21,7 @@ module Wukong
21
21
  # banana
22
22
  # apple
23
23
  # ...
24
- # $ cat input | wu-local sort | wu-local group
24
+ # $ cat input | wu-local sort | wu-local group --to=tsv
25
25
  # apple 4
26
26
  # banana 2
27
27
  # cat 5
@@ -34,7 +34,7 @@ module Wukong
34
34
  # {"id": 2, "word": "cat" }
35
35
  # {"id": 3, "word": "banana"}
36
36
  # ...
37
- # $ cat input | wu-local sort --on==word | wu-local group --by=word
37
+ # $ cat input | wu-local sort --on=word | wu-local group --by=word --to=tsv
38
38
  # apple 4
39
39
  # banana 2
40
40
  # cat 5
@@ -46,14 +46,57 @@ module Wukong
46
46
  # @example Using a group at the end of a dataflow
47
47
  #
48
48
  # Wukong.dataflow(:makes_groups) do
49
- # ... | sort(on: 'field') | group(by: 'field')
49
+ # ... | sort(on: 'field') | group(by: 'field') | to_tsv
50
50
  # end
51
51
  #
52
52
  # @see Sort
53
53
  class Group < Count
54
54
 
55
+ description <<EOF
56
+ This processor groups consecutive input records that share the same
57
+ "group key". There are several ways to extract this group key from a
58
+ record.
59
+
60
+ NOTE: The input records must be previously sorted by the
61
+ same key used for grouping in order to ensure that groups are
62
+ not split up.
63
+
64
+ By default the input records themselves are used as their own group
65
+ keys, allowing you to count identical values, a la `uniq -c`:
66
+
67
+ $ cat input
68
+ apple
69
+ cat
70
+ banana
71
+ apple
72
+ ...
73
+
74
+ $ cat input | wu-local sort | wu-local group --to=tsv
75
+ apple 4
76
+ banana 2
77
+ cat 5
78
+ ...
79
+
80
+ You can also group by some part of an input record:
81
+
82
+ $ cat input
83
+ {"id": 1, "word": "apple" }
84
+ {"id": 2, "word": "cat" }
85
+ {"id": 3, "word": "banana"}
86
+ ...
87
+
88
+ $ cat input | wu-local sort --on=word | wu-local group --by=word --to=tsv
89
+ apple 4
90
+ banana 2
91
+ cat 5
92
+ ...
93
+
94
+ This processor will not produce any output for a given group until it
95
+ sees the last record of that group.
96
+ EOF
97
+
55
98
  include DynamicGet
56
- field :by, Whatever
99
+ field :by, Whatever, :doc => "Part of the record to group by"
57
100
 
58
101
  # Get the key which defines the group for this `record`.
59
102
  #
@@ -76,7 +119,7 @@ module Wukong
76
119
  # @yieldparam [Object] key the key defining the group
77
120
  # @yieldparam [Integer] size the size of the group
78
121
  def finalize
79
- yield [key, size].map(&:to_s).join("\t")
122
+ yield [key, size]
80
123
  end
81
124
 
82
125
  register
@@ -13,7 +13,7 @@ module Wukong
13
13
  # {"id": 2, "parent_id": 3}
14
14
  # {"id": 3, "parent_id": 3}
15
15
  # ...
16
- # $ cat input | wu-local group_concat --by=parent_id
16
+ # $ cat input | wu-local group_concat --by=parent_id --to=tsv
17
17
  # 4 1 {"id": 1, "parent_id": 4}
18
18
  # 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
19
19
  # ...
@@ -23,6 +23,34 @@ module Wukong
23
23
  # @see Group
24
24
  class GroupConcat < Group
25
25
 
26
+ description <<EOF
27
+ This processor concatenates records of a consecutive group of records
28
+ into a single record.
29
+
30
+ $ cat input
31
+ {"id": 1, "parent_id": 4}
32
+ {"id": 2, "parent_id": 3}
33
+ {"id": 3, "parent_id": 3}
34
+ ...
35
+
36
+ $ cat input | wu-local group_concat --by=parent_id --to=tsv
37
+ 4 1 {"id": 1, "parent_id": 4}
38
+ 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
39
+ ...
40
+
41
+ Each output record consists of tab-separated fields in the following
42
+ order:
43
+
44
+ 1) The key defining the group of input records in this output record
45
+ 2) The number of input records in the group
46
+ 3) Each input record in the group
47
+ ...
48
+
49
+ This processor will not produce any output for a given group until it
50
+ sees the last record of that group. See the documentation for the
51
+ 'group' processor for more information.
52
+ EOF
53
+
26
54
  # The members of the current group.
27
55
  attr_accessor :members
28
56
 
@@ -58,7 +86,7 @@ module Wukong
58
86
  def finalize
59
87
  group = [key, size]
60
88
  group.concat(members)
61
- yield group.map(&:to_s).join("\t")
89
+ yield group
62
90
  end
63
91
 
64
92
  register
@@ -4,12 +4,12 @@ module Wukong
4
4
  class Processor
5
5
  class Moments < Group
6
6
 
7
- field :group_by, Whatever
7
+ field :group_by, Whatever, :doc => "Part of the record to group by"
8
8
 
9
9
  attr_accessor :measurements
10
10
 
11
- field :of, Array, :default => []
12
- field :std_dev, :boolean, :default => true
11
+ field :of, Array, :default => [], :doc => "Parts of the record to measure moments of"
12
+ field :no_std_dev, :boolean, :doc => "Don't compute standard deviations"
13
13
 
14
14
  def get_key record
15
15
  super(record) unless (self.group_by || self.by)
@@ -52,7 +52,7 @@ module Wukong
52
52
 
53
53
  mean = values.inject(0.0) { |sum, value| sum += value } / count
54
54
  r[property][:mean] = mean
55
- if std_dev
55
+ unless no_std_dev
56
56
  variance = values.inject(0.0) { |sum, value| diff = (value - mean) ; sum += diff * diff } / count
57
57
  std = Math.sqrt(variance)
58
58
  r[property][:std_dev] = std
@@ -62,11 +62,61 @@ module Wukong
62
62
  # group
63
63
  # end
64
64
  class Sort < Accumulator
65
+
66
+ description <<EOF
67
+ This processor sorts input records alphabetically or numerically based
68
+ on their value or the value of one of their parts.
69
+
70
+ NOTE: For many use cases you're better off using native tools like
71
+ `/bin/sort` because they are faster and already do what you
72
+ need.
73
+
74
+ You can sort simple inputs
75
+
76
+ $ cat input
77
+ 1 apple
78
+ 2 banana
79
+ 3 cat
80
+ 4 banana
81
+ ...
82
+
83
+ $ cat input | wu-local sort --on=2
84
+ 1 apple
85
+ 2 banana
86
+ 4 banana
87
+ 3 cat
88
+ ...
89
+
90
+ as well as complicated ones
91
+
92
+ $ cat input
93
+ {"id": 1, "word": "apple" }
94
+ {"id": 2, "word": "cat" }
95
+ {"id": 3, "word": "banana"}
96
+ ...
97
+
98
+ $ cat input | wu-local sort --on=word
99
+ {"id": 1, "word": "apple" }
100
+ {"id": 3, "word": "banana"}
101
+ {"id": 2, "word": "cat" }
102
+ ...
103
+
104
+ You can also sort in --reverse or using --numeric order instead of
105
+ lexical.
106
+
107
+ The sort widget is useful for modeling Hadoop jobs, but don't
108
+ forget that [Hadoop does its own
109
+ sorting](http://hadoop.apache.org/docs/r0.20.2/mapred_tutorial.html#Sort),
110
+ so the sort widget doesn't belong in your map/reduce jobs.
111
+
112
+ This processor will not produce any output till it has received all
113
+ its input records.
114
+ EOF
65
115
 
66
116
  include DynamicGet
67
- field :on, Whatever
68
- field :reverse, :boolean, :default => false
69
- field :numeric, :boolean, :default => false
117
+ field :on, Whatever, :doc => "Part of the record to sort on"
118
+ field :reverse, :boolean, :default => false, :doc => "Sort in reverse order"
119
+ field :numeric, :boolean, :default => false, :doc => "Sort numerically instead of lexically"
70
120
 
71
121
  # Initializes the array of records that will hold all the values.
72
122
  def setup
@@ -1,14 +1,14 @@
1
1
  module Wukong
2
2
  class Processor
3
3
 
4
- SerializerError = Class.new(StandardError)
4
+ SerializerError = Class.new(Error)
5
5
 
6
6
  class Serializer < Processor
7
- field :on_error, String, default: 'log'
7
+ field :on_error, String, default: 'log', :doc => "Action to take upon an error, either 'log' or 'notify'"
8
8
 
9
9
  def handle_error(record, err)
10
10
  case on_error
11
- when 'log' then log.warn "Bad record: #{record}"
11
+ when 'log' then log.warn "#{err.class}: #{err.message}"
12
12
  when 'notify' then notify('error', record: record, error: err)
13
13
  end
14
14
  end
@@ -25,7 +25,14 @@ module Wukong
25
25
  #
26
26
  # @see FromJson
27
27
  class ToJson < Serializer
28
- field :pretty, :boolean, default: false
28
+
29
+ description <<EOF
30
+ Turns input records into JSON strings.
31
+
32
+ Pretty print input with the --pretty flag.
33
+ EOF
34
+
35
+ field :pretty, :boolean, default: false, :doc => "Pretty-print output"
29
36
 
30
37
  # Yields the input `record` serialized as JSON.
31
38
  #
@@ -56,6 +63,16 @@ module Wukong
56
63
  #
57
64
  # @see ToJson
58
65
  class FromJson < Serializer
66
+
67
+ description <<EOF
68
+ Parse JSON input records into native Ruby objects.
69
+
70
+ $ cat input.json
71
+ {"hi": "there"}
72
+ $ cat input.json | wu-local from_json
73
+ {"hi"=>"there"}
74
+ EOF
75
+
59
76
  # Yields the input `record` deserialized from JSON.
60
77
  #
61
78
  # @param [String] json
@@ -196,7 +213,7 @@ module Wukong
196
213
  #
197
214
  # @see FromDelimited
198
215
  class ToDelimited < Serializer
199
- field :delimiter, String, :default => "\t"
216
+ field :delimiter, String, :default => "\t", :doc => "Delimiter to use between fields in a record"
200
217
  # Yields the input `record` serialized in a delimited format.
201
218
  #
202
219
  # @param [Object] record
@@ -224,7 +241,7 @@ module Wukong
224
241
  #
225
242
  # @see ToDelimited
226
243
  class FromDelimited < Serializer
227
- field :delimiter, String, :default => "\t"
244
+ field :delimiter, String, :default => "\t", :doc => "Delimiter to use between fields in a record"
228
245
  # Yields the input `record` deserialized from a delimited format.
229
246
  #
230
247
  # @param [String] delimited
@@ -260,18 +277,26 @@ module Wukong
260
277
  end
261
278
  register
262
279
  end
263
-
280
+
281
+ # A widget for turning a record into an instance of some class.
282
+ # The class must provide a "class method" `receive` which accepts
283
+ # a Hash argument.
264
284
  class Recordize < Serializer
265
- field :model, Whatever
285
+ field :model, Whatever, :doc => "Model class to turn records into"
266
286
 
287
+ # Turn the given `record` into an instance of the class named
288
+ # with the `model` field.
289
+ #
290
+ # @param [Hash, #to_wire] record
291
+ # @return [Object]
267
292
  def process(record)
268
293
  wire_format = record.try(:to_wire) || record
269
- raise SerializerError.new("Record must be in hash format to be recordized") unless wire_format.is_a?(Hash)
294
+ raise SerializerError.new("Can only recordize a Hash-like record") unless wire_format.is_a?(Hash)
270
295
  yield model.receive(wire_format)
271
- rescue => e
272
- handle_error(record, e)
296
+ rescue => e
297
+ handle_error(record, e)
273
298
  end
274
299
  register
275
- end
300
+ end
276
301
  end
277
302
  end
@@ -7,7 +7,7 @@ module Wukong
7
7
 
8
8
  # :nodoc:
9
9
  def self.included klass
10
- klass.send(:field, :separator, String, :default => "\t")
10
+ klass.send(:field, :separator, String, :default => "\t", :doc => "The default separator between fields on a single line")
11
11
  end
12
12
 
13
13
  # :nodoc:
@@ -2,12 +2,30 @@ require 'wukong'
2
2
  require 'wukong/spec_helpers'
3
3
  require_relative './support/shared_examples_for_shortcuts'
4
4
  require_relative './support/shared_examples_for_builders'
5
- require_relative './support/integration_helper'
6
5
  require_relative './support/shared_context_for_reducers'
7
6
 
8
7
  RSpec.configure do |config|
8
+
9
9
  config.mock_with :rspec
10
+
10
11
  include Wukong::SpecHelpers
11
- include Wukong::Local::IntegrationHelper
12
+ def root
13
+ @root ||= Pathname.new(File.expand_path('../..', __FILE__))
14
+ end
15
+
16
+ def local_runner *args
17
+ runner(Wukong::Local::LocalRunner, 'wu-local', *args)
18
+ end
19
+
20
+ def generic_runner *args
21
+ runner(Wukong::Runner, 'wu-generic', *args)
22
+ end
23
+
24
+ def wu_local *args
25
+ command('wu-local', *args)
26
+ end
27
+
28
+ # FIXME Why is this here?
12
29
  config.treat_symbols_as_metadata_keys_with_true_values = true
30
+
13
31
  end
@@ -0,0 +1,2 @@
1
+ require 'spec_helper'
2
+
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Local::LocalRunner do
4
+ before { EM.stub!(:run) }
5
+
6
+ describe "choosing a processor name" do
7
+
8
+ it "raises an error without any arguments" do
9
+ expect { local_runner() }.to raise_error(Wukong::Error, /must provide.*processor.*run.*argument/i)
10
+ end
11
+
12
+ it "raises an error when passed the name of a processor that isn't registered" do
13
+ expect { local_runner('some_proc_that_dont_exit') }.to raise_error(Wukong::Error, /no such processor.*some_proc.*/i)
14
+ end
15
+
16
+ it "accepts an explicit --run argument" do
17
+ local_runner('--run=identity').processor.should == 'identity'
18
+ end
19
+
20
+ it "accepts a registered processor name from the first argument" do
21
+ local_runner('identity').processor.should == 'identity'
22
+ end
23
+
24
+ it "accepts a registered processor name from the basename of the first file argument" do
25
+ local_runner(examples_dir('string_reverser.rb')).processor.should == 'string_reverser'
26
+ end
27
+ end
28
+
29
+ describe "uses a" do
30
+ it "StdioDriver by default" do
31
+ local_runner('identity').driver.should == Wukong::Local::StdioDriver
32
+ end
33
+
34
+ it "TCPDriver when given a --tcp_port argument" do
35
+ local_runner('identity','--tcp_port=6000').driver.should == Wukong::Local::TCPDriver
36
+ end
37
+
38
+ end
39
+
40
+ end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Local do
4
+ it_behaves_like 'a plugin'
5
+ end
6
+
@@ -0,0 +1,49 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Logging do
4
+
5
+ let(:loggable) { Class.new.class_eval { include Wukong::Logging } }
6
+ let(:model) { Class.new.class_eval { include Gorillib::Model } }
7
+
8
+ describe "a class including Wukong::Logging" do
9
+ subject { loggable }
10
+
11
+ it { should respond_to(:log) }
12
+
13
+ describe "has instances that" do
14
+ let(:loggable_instance) { loggable.new }
15
+ subject { loggable_instance }
16
+
17
+ it { should respond_to(:log) }
18
+ describe "have an instance method #log that" do
19
+ let(:log) { loggable_instance.log }
20
+ subject { log }
21
+
22
+ it { should respond_to(:debug) }
23
+ it { should respond_to(:info) }
24
+ it { should respond_to(:warn) }
25
+ end
26
+ end
27
+
28
+ describe "has subclasses" do
29
+ let(:child) { Class.new(loggable) }
30
+
31
+ describe "with instances that" do
32
+ subject { child.new }
33
+
34
+ it { should respond_to(:log) }
35
+ end
36
+ end
37
+ end
38
+
39
+ describe "a class including Gorillib::Model and then Wukong::Logging" do
40
+ let(:loggable_model) { model.class_eval { include Wukong::Logging } }
41
+ subject { loggable_model }
42
+
43
+ describe "has fields that" do
44
+ subject { loggable_model.fields }
45
+ it { should include(:log) }
46
+ end
47
+ end
48
+ end
49
+