wukong 3.0.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -1,5 +1,4 @@
1
1
  require 'configliere'
2
- require 'vayacondios-client'
3
2
  require 'multi_json'
4
3
  require 'eventmachine'
5
4
  require 'log4r'
@@ -35,6 +34,10 @@ module Wukong
35
34
 
36
35
  add_shortcut_method_for(:processor, ProcessorBuilder)
37
36
  add_shortcut_method_for(:dataflow, DataflowBuilder)
37
+
38
+ def self.doc_helpers_path
39
+ File.expand_path('../wukong/doc_helpers.rb', __FILE__)
40
+ end
38
41
 
39
42
  end
40
43
 
@@ -45,5 +48,7 @@ require_relative 'wukong/widgets'
45
48
  require_relative 'wukong/local'
46
49
 
47
50
  module Wukong
51
+
52
+ # Built-in Wukong processors and dataflows.
48
53
  BUILTINS = Set.new(Wukong.registry.show.keys)
49
54
  end
@@ -1,5 +1,12 @@
1
1
  module Wukong
2
- class DataflowBuilder < Hanuman::GraphBuilder
2
+
3
+ class Dataflow < Hanuman::Tree
4
+ def self.configure(settings)
5
+ settings.description = builder.description if builder.description
6
+ end
7
+ end
8
+
9
+ class DataflowBuilder < Hanuman::TreeBuilder
3
10
 
4
11
  def description desc=nil
5
12
  @description = desc if desc
@@ -10,17 +17,20 @@ module Wukong
10
17
 
11
18
  def handle_dsl_arguments_for(stage, *args, &action)
12
19
  options = args.extract_options!
20
+ while stages.include?(stage.label)
21
+ parts = stage.label.to_s.split('_')
22
+ if parts.last.to_i > 0
23
+ parts[-1] = parts.last.to_i + 1
24
+ else
25
+ parts.push(1)
26
+ end
27
+ stage.label = parts.map(&:to_s).join('_').to_sym
28
+ end
13
29
  stage.merge!(options.merge(action: action).compact)
14
- stage
30
+ stage.graph = self
31
+ stage
15
32
  end
16
33
 
17
- def linkable_name(direction)
18
- case direction
19
- when :in then directed_sort.first
20
- when :out then directed_sort.last
21
- end
22
- end
23
-
24
34
  def method_missing(name, *args, &blk)
25
35
  if stages[name]
26
36
  handle_dsl_arguments_for(stages[name], *args, &blk)
@@ -30,43 +40,4 @@ module Wukong
30
40
  end
31
41
 
32
42
  end
33
-
34
- class Dataflow < Hanuman::Graph
35
-
36
- def self.description desc=nil
37
- @description = desc if desc
38
- @description
39
- end
40
-
41
- def has_input?(stage)
42
- links.any?{ |link| link.into == stage }
43
- end
44
-
45
- def has_output?(stage)
46
- links.any?{ |link| link.from == stage }
47
- end
48
-
49
- def connected?(stage)
50
- input = has_input?(stage) || stages[stage].is_a?(Wukong::Source)
51
- output = has_output?(stage) || stages[stage].is_a?(Wukong::Sink)
52
- input && output
53
- end
54
-
55
- def complete?
56
- stages.all?{ |(name, stage)| connected? name }
57
- end
58
-
59
- def setup
60
- directed_sort.each{ |name| stages[name].setup }
61
- end
62
-
63
- def run
64
- stages[directed_sort.first].run
65
- end
66
-
67
- def stop
68
- directed_sort.each{ |name| stages[name].stop }
69
- end
70
-
71
- end
72
43
  end
@@ -1,103 +1,214 @@
1
+ require_relative('driver/wiring')
2
+
1
3
  module Wukong
4
+
5
+ # A Driver is a class including the DriverMethods module which
6
+ # connects a Dataflow or Processor to the external world of inputs
7
+ # and outputs.
8
+ #
9
+ # @example Minimal Driver class
10
+ #
11
+ # class MinimalDriver
12
+ # include Wukong::DriverMethods
13
+ # def initialize(label, settings)
14
+ # construct_dataflow(label, settings)
15
+ # end
16
+ # def process record
17
+ # puts record
18
+ # end
19
+ # end
20
+ #
21
+ # The MinimalDriver#send_through_dataflow method can be called on an
22
+ # instance of MinimalDriver with any input record.
23
+ #
24
+ # This record will be passed through the dataflow, starting from its
25
+ # root, and each record yielded at the leaves of the dataflow will
26
+ # be passed to the driver's #process method.
27
+ #
28
+ # The #process method of an implementing driver should *not* yield,
29
+ # unlike the process method of a Processor class. Instead, it
30
+ # should treat its argument as an output of the dataflow and do
31
+ # something appropriate to the driver (write to file, database,
32
+ # terminal, &c.).
33
+ #
34
+ # Drivers are also responsible for implementing the lifecycle of
35
+ # processors and dataflows they drive. A more complete version of
36
+ # the above driver class would:
37
+ #
38
+ # * call the #setup_dataflow method when ready to trigger the
39
+ # Processor#setup method on each processor in the dataflow
40
+ #
41
+ # * call the #finalize_dataflow method when indicating that the
42
+ # dataflow should consider a batch of records complete
43
+ #
44
+ # * call the #finalize_and_stop_dataflow method to indicate the
45
+ # last batch of records and to trigger the Processor#stop method
46
+ # on each processor in the dataflow
47
+ #
48
+ # Driver instances are started by Runners which should delegate to
49
+ # the `start` method of the driver class itself.
50
+ #
51
+ # @see Wukong::Local::StdioDriver for a complete example of a driver.
52
+ # @see Wukong::Local::Runner for an example of how runners call drivers.
2
53
  module DriverMethods
3
54
 
4
- attr_accessor :dataflow
5
-
55
+ attr_accessor :label
6
56
  attr_accessor :settings
57
+ attr_accessor :dataflow
7
58
 
8
- def driver
9
- @driver ||= Driver.new(dataflow)
59
+ # Classes including DriverMethods should override this method with
60
+ # some way of handling the `output_record` that is appropriate for
61
+ # the driver.
62
+ #
63
+ # @param [Object] output_record
64
+ def process output_record
65
+ raise NotImplementedError.new("Define the #{self.class}#process method to handle output records from the dataflow")
10
66
  end
11
67
 
12
- def lookup(label)
13
- raise Wukong::Error.new("could not find definition for <#{label}>") unless Wukong.registry.registered?(label.to_sym)
14
- Wukong.registry.retrieve(label.to_sym)
68
+ # Construct a dataflow from the given `label` and `settings`.
69
+ #
70
+ # This method does **not** cause Processor#setup to be called on
71
+ # any of the processors in the dataflow. Call the #setup_dataflow
72
+ # method to explicitly have setup occur. This distinction is
73
+ # useful for drivers which themselves need to do complex
74
+ # initialization before letting processors in the dataflow
75
+ # initialize.
76
+ #
77
+ # @param [Symbol] label the name of the dataflow (or processor) to build
78
+ # @param [Hash] settings
79
+ # @option settings [String] :to Serialize all output via the named serializer (json, tsv)
80
+ # @option settings [String] :from Deserialize all input via the named deserializer (json, tsv)
81
+ # @option settings [String] :as Recordize each input as instances of the given class
82
+ #
83
+ # @see #setup_dataflow
84
+ def construct_dataflow(label, settings={})
85
+ self.label = label
86
+ self.settings = settings
87
+ prepend(:recordize) if settings[:as]
88
+ prepend("from_#{settings[:from]}".to_sym) if settings[:from]
89
+ append("to_#{settings[:to]}".to_sym) if settings[:to]
90
+ build_dataflow
15
91
  end
16
-
17
- def lookup_and_build(label, options = {})
18
- lookup(label).build(options)
19
- end
20
-
21
- def build_serializer(direction, label, options)
22
- lookup_and_build("#{direction}_#{label}", options)
92
+
93
+ # Set up this driver. Called before setting up any of the
94
+ # dataflow stages.
95
+ def setup
23
96
  end
24
97
 
25
- def add_serialization(dataflow, direction, label, options)
26
- case direction
27
- when :to then dataflow.push build_serializer(direction, label, options)
28
- when :from then dataflow.unshift build_serializer(direction, label, options)
98
+ # Walks the dataflow and calls Processor#setup on each of the
99
+ # processors.
100
+ def setup_dataflow
101
+ setup
102
+ dataflow.each_stage do |stage|
103
+ stage.setup
29
104
  end
30
105
  end
31
106
 
32
- def setup_dataflow
33
- dataflow.each(&:setup)
107
+ # Send the given `record` through the dataflow.
108
+ #
109
+ # @param [Object] record
110
+ def send_through_dataflow(record)
111
+ wiring.start_with(dataflow.root).call(record)
34
112
  end
35
113
 
114
+ # Perform finalization code for this driver. Runs after #setup
115
+ # and before #stop.
116
+ def finalize
117
+ end
118
+
119
+ # Indicate a full batch of records has already been sent through
120
+ # and any batch-oriented or accumulative operations should trigger
121
+ # (e.g., counting).
122
+ #
123
+ # Walks the dataflow calling Processor#finalize on each processor.
124
+ #
125
+ # On the *last* batch, the #finalize_and_stop_dataflow method
126
+ # should be called instead.
127
+ #
128
+ # @see #finalize_and_stop_dataflow
36
129
  def finalize_dataflow
37
- dataflow.each do |stage|
38
- stage.finalize(&driver.advance(stage)) if stage.respond_to?(:finalize)
130
+ finalize
131
+ dataflow.each_stage do |stage|
132
+ stage.finalize(&wiring.advance(stage))
39
133
  end
40
134
  end
41
135
 
136
+ # Works similarly to #finalize_dataflow but calls Processor#stop
137
+ # after calling Processor#finalize on each processor.
42
138
  def finalize_and_stop_dataflow
43
- dataflow.each do |stage|
44
- stage.finalize(&driver.advance(stage)) if stage.respond_to?(:finalize)
139
+ finalize
140
+ dataflow.each_stage do |stage|
141
+ stage.finalize(&wiring.advance(stage))
45
142
  stage.stop
46
- end
143
+ end
144
+ stop
47
145
  end
48
146
 
49
- # So pretty...
50
- def construct_dataflow(label, options)
51
- dataflow = lookup_and_build(label, options)
52
- dataflow = dataflow.respond_to?(:stages) ? dataflow.directed_sort.map{ |name| dataflow.stages[name] } : [ dataflow ]
53
- expected_input_model = (options[:consumes].constantize rescue nil) || dataflow.first.expected_record_type(:consumes)
54
- dataflow.unshift lookup_and_build(:recordize, model: expected_input_model) if expected_input_model
55
- expected_output_model = (options[:produces].constantize rescue nil) || dataflow.first.expected_record_type(:produces)
56
- dataflow.push lookup_and_build(:recordize, model: expected_output_model) if expected_output_model
57
- expected_input_serialization = options[:from] || dataflow.last.expected_serialization(:from)
58
- add_serialization(dataflow, :from, expected_input_serialization, options) if expected_input_serialization
59
- expected_output_serialization = options[:to] || dataflow.last.expected_serialization(:to)
60
- add_serialization(dataflow, :to, expected_output_serialization, options) if expected_output_serialization
61
- dataflow.push self
62
- end
63
- end
147
+ # Perform shutdown code for this driver. Called after #finalize
148
+ # and after all stages have been finalized and stopped.
149
+ def stop
150
+ end
64
151
 
65
- class Driver
66
- attr_accessor :dataflow
152
+ protected
67
153
 
68
- def initialize(dataflow)
69
- @dataflow = dataflow
154
+ # The builder for this driver's `label`, either for a Processor or
155
+ # a Dataflow.
156
+ #
157
+ # @return [Wukong::ProcessorBuilder, Wukong::DataflowBuilder]
158
+ def builder
159
+ return @builder if @builder
160
+ raise Wukong::Error.new("could not find definition for <#{label}>") unless Wukong.registry.registered?(label.to_sym)
161
+ @builder = Wukong.registry.retrieve(label.to_sym)
70
162
  end
71
163
 
72
- def to_proc
73
- return @wiring if @wiring
74
- @wiring = Proc.new do |stage, record|
75
- stage.process(record, &advance(stage)) if stage
76
- end
164
+ # Return the builder for this driver's dataflow.
165
+ #
166
+ # Even if a Processor was originally named by this driver's
167
+ # `label`, a DataflowBuilder will be returned here. The
168
+ # DataflowBuilder is itself built from just the ProcessorBuilder
169
+ # alone.
170
+ #
171
+ # @return [Wukong::DataflowBuilder]
172
+ # @see #builder
173
+ def dataflow_builder
174
+ @dataflow_builder ||= (builder.is_a?(DataflowBuilder) ? builder : Wukong::DataflowBuilder.receive(for_class: Class.new(Wukong::Dataflow), stages: {label.to_sym => builder}))
77
175
  end
78
176
 
79
- def send_through_dataflow(record)
80
- start_with(dataflow.first).call(record)
81
- end
82
-
83
- def start_with(stage)
84
- to_proc.curry.call(stage)
177
+ # Build the dataflow using the #dataflow_builder and the supplied
178
+ # `settings`.
179
+ #
180
+ # @return [Wukong::Dataflow]
181
+ def build_dataflow
182
+ self.dataflow = dataflow_builder.build(settings)
85
183
  end
86
184
 
87
- def advance(stage)
88
- next_stage = stage_iterator(stage)
89
- start_with(next_stage)
185
+ # Add the processor with the given `new_label` in front of this
186
+ # driver's dataflow, making it into the new root of the dataflow.
187
+ #
188
+ # @param [Symbol] new_label
189
+ def prepend new_label
190
+ raise Wukong::Error.new("could not find processor <#{new_label}> to prepend") unless Wukong.registry.registered?(new_label)
191
+ dataflow_builder.prepend(Wukong.registry.retrieve(new_label))
90
192
  end
91
193
 
92
- # This should properly be defined on dataflow/builder
93
- def stage_iterator(stage)
94
- position = dataflow.find_index(stage)
95
- dataflow[position + 1]
96
- end
194
+ # Add the processor with the given `new_label` at the end of each
195
+ # of this driver's dataflow's leaves.
196
+ #
197
+ # @param [Symbol] new_label
198
+ def append new_label
199
+ raise Wukong::Error.new("could not find processor <#{new_label}> to append") unless Wukong.registry.registered?(new_label)
200
+ dataflow_builder.append(Wukong.registry.retrieve(new_label))
201
+ end
97
202
 
98
- def call(*args)
99
- to_proc.call(*args)
203
+ # Returns the underlying Wiring object that will coordinate
204
+ # transfer of records from the driver to the dataflow and back to
205
+ # the driver.
206
+ #
207
+ # @return [Wiring]
208
+ def wiring
209
+ @wiring ||= Wiring.new(self, dataflow)
100
210
  end
101
211
 
102
212
  end
213
+
103
214
  end
@@ -1,12 +1,7 @@
1
1
  module Wukong
2
-
3
- # A module which can be included by other drivers which lets them
4
- # use EventMachine under the hood.
5
2
  module EventMachineDriver
6
-
7
3
  include DriverMethods
8
4
 
9
- # :nodoc:
10
5
  def self.included klass
11
6
  klass.class_eval do
12
7
  def self.add_signal_traps
@@ -15,13 +10,6 @@ module Wukong
15
10
  end
16
11
  end
17
12
  end
18
-
19
- # :nodoc:
20
- def initialize(label, settings)
21
- super
22
- @settings = settings
23
- @dataflow = construct_dataflow(label, settings)
24
- end
25
-
13
+
26
14
  end
27
15
  end
@@ -0,0 +1,68 @@
1
+ module Wukong
2
+
3
+ # Provides a very Ruby-minded way of walking a dataflow connected to
4
+ # a driver.
5
+ class Wiring
6
+
7
+ # The driver instance that likely calls the #start_with method and
8
+ # provides a #process method to be called by this wiring.
9
+ attr_accessor :driver
10
+
11
+ # The dataflow being wired.
12
+ attr_accessor :dataflow
13
+
14
+ # Construct a new Wiring for the given `driver` and `dataflow`.
15
+ #
16
+ # @param [#process] driver
17
+ # @param [Wukong::Dataflow] dataflow
18
+ def initialize(driver, dataflow)
19
+ @driver = driver
20
+ @dataflow = dataflow
21
+ end
22
+
23
+ # Return a proc which, if called with a record, will process that
24
+ # record through each of the given `stages` as well as through the
25
+ # rest of the dataflow ahead of them.
26
+ #
27
+ # @param [Array<Wukong::Stage>] stages
28
+ # @return [Proc]
29
+ def start_with(*stages)
30
+ to_proc.curry.call(stages)
31
+ end
32
+
33
+ # Return a proc (the output of #start_with) which will process
34
+ # records through the stages that are ahead of the given stage.
35
+ #
36
+ # @param [Wukong::Stage] stage
37
+ # @return [Proc]
38
+ #
39
+ # @see #start_with
40
+ def advance(stage)
41
+ # This is where the tree of procs will terminate, but only after
42
+ # having passed all output records through the driver -- the
43
+ # last "stage".
44
+ return start_with() if stage.nil? || stage == driver
45
+
46
+ # Otherwise we're still in the middle of the tree...
47
+ descendents = dataflow.descendents(stage)
48
+ if descendents.empty?
49
+ # No descendents means we've reached a leaf of the tree, so
50
+ # we'll run records through the driver to generate output.
51
+ start_with(driver)
52
+ else
53
+ # Otherwise continue down the tree of procs...
54
+ start_with(*descendents)
55
+ end
56
+ end
57
+
58
+ # :nodoc:
59
+ def to_proc
60
+ return @wiring if @wiring
61
+ @wiring = Proc.new do |stages, record|
62
+ stages.each do |stage|
63
+ stage.process(record, &advance(stage)) if stage
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end