wukong 3.0.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -1,5 +1,4 @@
1
1
  require 'configliere'
2
- require 'vayacondios-client'
3
2
  require 'multi_json'
4
3
  require 'eventmachine'
5
4
  require 'log4r'
@@ -35,6 +34,10 @@ module Wukong
35
34
 
36
35
  add_shortcut_method_for(:processor, ProcessorBuilder)
37
36
  add_shortcut_method_for(:dataflow, DataflowBuilder)
37
+
38
+ def self.doc_helpers_path
39
+ File.expand_path('../wukong/doc_helpers.rb', __FILE__)
40
+ end
38
41
 
39
42
  end
40
43
 
@@ -45,5 +48,7 @@ require_relative 'wukong/widgets'
45
48
  require_relative 'wukong/local'
46
49
 
47
50
  module Wukong
51
+
52
+ # Built-in Wukong processors and dataflows.
48
53
  BUILTINS = Set.new(Wukong.registry.show.keys)
49
54
  end
@@ -1,5 +1,12 @@
1
1
  module Wukong
2
- class DataflowBuilder < Hanuman::GraphBuilder
2
+
3
+ class Dataflow < Hanuman::Tree
4
+ def self.configure(settings)
5
+ settings.description = builder.description if builder.description
6
+ end
7
+ end
8
+
9
+ class DataflowBuilder < Hanuman::TreeBuilder
3
10
 
4
11
  def description desc=nil
5
12
  @description = desc if desc
@@ -10,17 +17,20 @@ module Wukong
10
17
 
11
18
  def handle_dsl_arguments_for(stage, *args, &action)
12
19
  options = args.extract_options!
20
+ while stages.include?(stage.label)
21
+ parts = stage.label.to_s.split('_')
22
+ if parts.last.to_i > 0
23
+ parts[-1] = parts.last.to_i + 1
24
+ else
25
+ parts.push(1)
26
+ end
27
+ stage.label = parts.map(&:to_s).join('_').to_sym
28
+ end
13
29
  stage.merge!(options.merge(action: action).compact)
14
- stage
30
+ stage.graph = self
31
+ stage
15
32
  end
16
33
 
17
- def linkable_name(direction)
18
- case direction
19
- when :in then directed_sort.first
20
- when :out then directed_sort.last
21
- end
22
- end
23
-
24
34
  def method_missing(name, *args, &blk)
25
35
  if stages[name]
26
36
  handle_dsl_arguments_for(stages[name], *args, &blk)
@@ -30,43 +40,4 @@ module Wukong
30
40
  end
31
41
 
32
42
  end
33
-
34
- class Dataflow < Hanuman::Graph
35
-
36
- def self.description desc=nil
37
- @description = desc if desc
38
- @description
39
- end
40
-
41
- def has_input?(stage)
42
- links.any?{ |link| link.into == stage }
43
- end
44
-
45
- def has_output?(stage)
46
- links.any?{ |link| link.from == stage }
47
- end
48
-
49
- def connected?(stage)
50
- input = has_input?(stage) || stages[stage].is_a?(Wukong::Source)
51
- output = has_output?(stage) || stages[stage].is_a?(Wukong::Sink)
52
- input && output
53
- end
54
-
55
- def complete?
56
- stages.all?{ |(name, stage)| connected? name }
57
- end
58
-
59
- def setup
60
- directed_sort.each{ |name| stages[name].setup }
61
- end
62
-
63
- def run
64
- stages[directed_sort.first].run
65
- end
66
-
67
- def stop
68
- directed_sort.each{ |name| stages[name].stop }
69
- end
70
-
71
- end
72
43
  end
@@ -1,103 +1,214 @@
1
+ require_relative('driver/wiring')
2
+
1
3
  module Wukong
4
+
5
+ # A Driver is a class including the DriverMethods module which
6
+ # connects a Dataflow or Processor to the external world of inputs
7
+ # and outputs.
8
+ #
9
+ # @example Minimal Driver class
10
+ #
11
+ # class MinimalDriver
12
+ # include Wukong::DriverMethods
13
+ # def initialize(label, settings)
14
+ # construct_dataflow(label, settings)
15
+ # end
16
+ # def process record
17
+ # puts record
18
+ # end
19
+ # end
20
+ #
21
+ # The MinimalDriver#send_through_dataflow method can be called on an
22
+ # instance of MinimalDriver with any input record.
23
+ #
24
+ # This record will be passed through the dataflow, starting from its
25
+ # root, and each record yielded at the leaves of the dataflow will
26
+ # be passed to the driver's #process method.
27
+ #
28
+ # The #process method of an implementing driver should *not* yield,
29
+ # unlike the process method of a Processor class. Instead, it
30
+ # should treat its argument as an output of the dataflow and do
31
+ # something appropriate to the driver (write to file, database,
32
+ # terminal, &c.).
33
+ #
34
+ # Drivers are also responsible for implementing the lifecycle of
35
+ # processors and dataflows they drive. A more complete version of
36
+ # the above driver class would:
37
+ #
38
+ # * call the #setup_dataflow method when ready to trigger the
39
+ # Processor#setup method on each processor in the dataflow
40
+ #
41
+ # * call the #finalize_dataflow method when indicating that the
42
+ # dataflow should consider a batch of records complete
43
+ #
44
+ # * call the #finalize_and_stop_dataflow method to indicate the
45
+ # last batch of records and to trigger the Processor#stop method
46
+ # on each processor in the dataflow
47
+ #
48
+ # Driver instances are started by Runners which should delegate to
49
+ # the `start` method of the driver class itself.
50
+ #
51
+ # @see Wukong::Local::StdioDriver for a complete example of a driver.
52
+ # @see Wukong::Local::Runner for an example of how runners call drivers.
2
53
  module DriverMethods
3
54
 
4
- attr_accessor :dataflow
5
-
55
+ attr_accessor :label
6
56
  attr_accessor :settings
57
+ attr_accessor :dataflow
7
58
 
8
- def driver
9
- @driver ||= Driver.new(dataflow)
59
+ # Classes including DriverMethods should override this method with
60
+ # some way of handling the `output_record` that is appropriate for
61
+ # the driver.
62
+ #
63
+ # @param [Object] output_record
64
+ def process output_record
65
+ raise NotImplementedError.new("Define the #{self.class}#process method to handle output records from the dataflow")
10
66
  end
11
67
 
12
- def lookup(label)
13
- raise Wukong::Error.new("could not find definition for <#{label}>") unless Wukong.registry.registered?(label.to_sym)
14
- Wukong.registry.retrieve(label.to_sym)
68
+ # Construct a dataflow from the given `label` and `settings`.
69
+ #
70
+ # This method does **not** cause Processor#setup to be called on
71
+ # any of the processors in the dataflow. Call the #setup_dataflow
72
+ # method to explicitly have setup occur. This distinction is
73
+ # useful for drivers which themselves need to do complex
74
+ # initialization before letting processors in the dataflow
75
+ # initialize.
76
+ #
77
+ # @param [Symbol] label the name of the dataflow (or processor) to build
78
+ # @param [Hash] settings
79
+ # @param settings [String] :to Serialize all output via the named serializer (json, tsv)
80
+ # @param settings [String] :from Deserialize all input via the named deserializer (json, tsv)
81
+ # @param settings [String] :as Recordize each input as instances of the given class
82
+ #
83
+ # @see #setup_dataflow
84
+ def construct_dataflow(label, settings={})
85
+ self.label = label
86
+ self.settings = settings
87
+ prepend(:recordize) if settings[:as]
88
+ prepend("from_#{settings[:from]}".to_sym) if settings[:from]
89
+ append("to_#{settings[:to]}".to_sym) if settings[:to]
90
+ build_dataflow
15
91
  end
16
-
17
- def lookup_and_build(label, options = {})
18
- lookup(label).build(options)
19
- end
20
-
21
- def build_serializer(direction, label, options)
22
- lookup_and_build("#{direction}_#{label}", options)
92
+
93
+ # Set up this driver. Called before setting up any of the
94
+ # dataflow stages.
95
+ def setup
23
96
  end
24
97
 
25
- def add_serialization(dataflow, direction, label, options)
26
- case direction
27
- when :to then dataflow.push build_serializer(direction, label, options)
28
- when :from then dataflow.unshift build_serializer(direction, label, options)
98
+ # Walks the dataflow and calls Processor#setup on each of the
99
+ # processors.
100
+ def setup_dataflow
101
+ setup
102
+ dataflow.each_stage do |stage|
103
+ stage.setup
29
104
  end
30
105
  end
31
106
 
32
- def setup_dataflow
33
- dataflow.each(&:setup)
107
+ # Send the given `record` through the dataflow.
108
+ #
109
+ # @param [Object] record
110
+ def send_through_dataflow(record)
111
+ wiring.start_with(dataflow.root).call(record)
34
112
  end
35
113
 
114
+ # Perform finalization code for this driver. Runs after #setup
115
+ # and before #stop.
116
+ def finalize
117
+ end
118
+
119
+ # Indicate a full batch of records has already been sent through
120
+ # and any batch-oriented or accumulative operations should trigger
121
+ # (e.g. - counting).
122
+ #
123
+ # Walks the dataflow calling Processor#finalize on each processor.
124
+ #
125
+ # On the *last* batch, the #finalize_and_stop_dataflow method
126
+ # should be called instead.
127
+ #
128
+ # @see #finalize_and_stop_dataflow
36
129
  def finalize_dataflow
37
- dataflow.each do |stage|
38
- stage.finalize(&driver.advance(stage)) if stage.respond_to?(:finalize)
130
+ finalize
131
+ dataflow.each_stage do |stage|
132
+ stage.finalize(&wiring.advance(stage))
39
133
  end
40
134
  end
41
135
 
136
+ # Works similarly to #finalize_dataflow but calls Processor#stop
137
+ # after calling Processor#finalize on each processor.
42
138
  def finalize_and_stop_dataflow
43
- dataflow.each do |stage|
44
- stage.finalize(&driver.advance(stage)) if stage.respond_to?(:finalize)
139
+ finalize
140
+ dataflow.each_stage do |stage|
141
+ stage.finalize(&wiring.advance(stage))
45
142
  stage.stop
46
- end
143
+ end
144
+ stop
47
145
  end
48
146
 
49
- # So pretty...
50
- def construct_dataflow(label, options)
51
- dataflow = lookup_and_build(label, options)
52
- dataflow = dataflow.respond_to?(:stages) ? dataflow.directed_sort.map{ |name| dataflow.stages[name] } : [ dataflow ]
53
- expected_input_model = (options[:consumes].constantize rescue nil) || dataflow.first.expected_record_type(:consumes)
54
- dataflow.unshift lookup_and_build(:recordize, model: expected_input_model) if expected_input_model
55
- expected_output_model = (options[:produces].constantize rescue nil) || dataflow.first.expected_record_type(:produces)
56
- dataflow.push lookup_and_build(:recordize, model: expected_output_model) if expected_output_model
57
- expected_input_serialization = options[:from] || dataflow.last.expected_serialization(:from)
58
- add_serialization(dataflow, :from, expected_input_serialization, options) if expected_input_serialization
59
- expected_output_serialization = options[:to] || dataflow.last.expected_serialization(:to)
60
- add_serialization(dataflow, :to, expected_output_serialization, options) if expected_output_serialization
61
- dataflow.push self
62
- end
63
- end
147
+ # Perform shutdown code for this driver. Called after #finalize
148
+ # and after all stages have been finalized and stopped.
149
+ def stop
150
+ end
64
151
 
65
- class Driver
66
- attr_accessor :dataflow
152
+ protected
67
153
 
68
- def initialize(dataflow)
69
- @dataflow = dataflow
154
+ # The builder for this driver's `label`, either for a Processor or
155
+ # a Dataflow.
156
+ #
157
+ # @return [Wukong::ProcessorBuilder, Wukong::DataflowBuilder]
158
+ def builder
159
+ return @builder if @builder
160
+ raise Wukong::Error.new("could not find definition for <#{label}>") unless Wukong.registry.registered?(label.to_sym)
161
+ @builder = Wukong.registry.retrieve(label.to_sym)
70
162
  end
71
163
 
72
- def to_proc
73
- return @wiring if @wiring
74
- @wiring = Proc.new do |stage, record|
75
- stage.process(record, &advance(stage)) if stage
76
- end
164
+ # Return the builder for this driver's dataflow.
165
+ #
166
+ # Even if a Processor was originally named by this driver's
167
+ # `label`, a DataflowBuilder will be returned here. The
168
+ # DataflowBuilder is itself built from just the ProcessorBuilder
169
+ # alone.
170
+ #
171
+ # @return [Wukong::DataflowBuilder]
172
+ # @see #builder
173
+ def dataflow_builder
174
+ @dataflow_builder ||= (builder.is_a?(DataflowBuilder) ? builder : Wukong::DataflowBuilder.receive(for_class: Class.new(Wukong::Dataflow), stages: {label.to_sym => builder}))
77
175
  end
78
176
 
79
- def send_through_dataflow(record)
80
- start_with(dataflow.first).call(record)
81
- end
82
-
83
- def start_with(stage)
84
- to_proc.curry.call(stage)
177
+ # Build the dataflow using the #dataflow_builder and the supplied
178
+ # `settings`.
179
+ #
180
+ # @return [Wukong::Dataflow]
181
+ def build_dataflow
182
+ self.dataflow = dataflow_builder.build(settings)
85
183
  end
86
184
 
87
- def advance(stage)
88
- next_stage = stage_iterator(stage)
89
- start_with(next_stage)
185
+ # Add the processor with the given `new_label` in front of this
186
+ # driver's dataflow, making it into the new root of the dataflow.
187
+ #
188
+ # @param [Symbol] new_label
189
+ def prepend new_label
190
+ raise Wukong::Error.new("could not find processor <#{new_label}> to prepend") unless Wukong.registry.registered?(new_label)
191
+ dataflow_builder.prepend(Wukong.registry.retrieve(new_label))
90
192
  end
91
193
 
92
- # This should properly be defined on dataflow/builder
93
- def stage_iterator(stage)
94
- position = dataflow.find_index(stage)
95
- dataflow[position + 1]
96
- end
194
+ # Add the processor with the given `new_label` at the end of each
195
+ # of this driver's dataflow's leaves.
196
+ #
197
+ # @param [Symbol] new_label
198
+ def append new_label
199
+ raise Wukong::Error.new("could not find processor <#{new_label}> to append") unless Wukong.registry.registered?(new_label)
200
+ dataflow_builder.append(Wukong.registry.retrieve(new_label))
201
+ end
97
202
 
98
- def call(*args)
99
- to_proc.call(*args)
203
+ # Returns the underlying Wiring object that will coordinate
204
+ # transfer of records from the driver to the dataflow and back to
205
+ # the driver.
206
+ #
207
+ # @return [Wiring]
208
+ def wiring
209
+ @wiring ||= Wiring.new(self, dataflow)
100
210
  end
101
211
 
102
212
  end
213
+
103
214
  end
@@ -1,12 +1,7 @@
1
1
  module Wukong
2
-
3
- # A module which can be included by other drivers which lets them
4
- # use EventMachine under the hood.
5
2
  module EventMachineDriver
6
-
7
3
  include DriverMethods
8
4
 
9
- # :nodoc:
10
5
  def self.included klass
11
6
  klass.class_eval do
12
7
  def self.add_signal_traps
@@ -15,13 +10,6 @@ module Wukong
15
10
  end
16
11
  end
17
12
  end
18
-
19
- # :nodoc:
20
- def initialize(label, settings)
21
- super
22
- @settings = settings
23
- @dataflow = construct_dataflow(label, settings)
24
- end
25
-
13
+
26
14
  end
27
15
  end
@@ -0,0 +1,68 @@
1
+ module Wukong
2
+
3
+ # Provides a very Ruby-minded way of walking a dataflow connected to
4
+ # a driver.
5
+ class Wiring
6
+
7
+ # The driver instance that likely calls the #start_with method and
8
+ # provides a #process method to be called by this wiring.
9
+ attr_accessor :driver
10
+
11
+ # The dataflow being wired.
12
+ attr_accessor :dataflow
13
+
14
+ # Construct a new Wiring for the given `driver` and `dataflow`.
15
+ #
16
+ # @param [#process] driver
17
+ # @param [Wukong::Dataflow] dataflow
18
+ def initialize(driver, dataflow)
19
+ @driver = driver
20
+ @dataflow = dataflow
21
+ end
22
+
23
+ # Return a proc which, if called with a record, will process that
24
+ # record through each of the given `stages` as well as through the
25
+ # rest of the dataflow ahead of them.
26
+ #
27
+ # @param [Array<Wukong::Stage>] stages
28
+ # @return [Proc]
29
+ def start_with(*stages)
30
+ to_proc.curry.call(stages)
31
+ end
32
+
33
+ # Return a proc (the output of #start_with) which will process
34
+ # records through the stages that are ahead of the given stage.
35
+ #
36
+ # @param [Wukong::Stage] stage
37
+ # @return [Proc]
38
+ #
39
+ # @see #start_with
40
+ def advance(stage)
41
+ # This is where the tree of procs will terminate, but only after
42
+ # having passed all output records through the driver -- the
43
+ # last "stage".
44
+ return start_with() if stage.nil? || stage == driver
45
+
46
+ # Otherwise we're still in the middle of the tree...
47
+ descendents = dataflow.descendents(stage)
48
+ if descendents.empty?
49
+ # No descendents means we've reached a leaf of the tree so
50
+ # we'll run records through the driver to generate output.
51
+ start_with(driver)
52
+ else
53
+ # Otherwise continue down the tree of procs...
54
+ start_with(*descendents)
55
+ end
56
+ end
57
+
58
+ # :nodoc:
59
+ def to_proc
60
+ return @wiring if @wiring
61
+ @wiring = Proc.new do |stages, record|
62
+ stages.each do |stage|
63
+ stage.process(record, &advance(stage)) if stage
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end