wukong 3.0.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -6,7 +6,6 @@ shared_examples_for 'a processor' do |options = {}|
6
6
  it{ processor(options[:named]).processor.should respond_to(:process) }
7
7
  it{ processor(options[:named]).processor.should respond_to(:finalize) }
8
8
  it{ processor(options[:named]).processor.should respond_to(:stop) }
9
- it{ processor(options[:named]).processor.should respond_to(:notify) }
10
9
  end
11
10
 
12
11
  shared_examples_for 'a plugin' do |options = {}|
@@ -99,13 +99,14 @@ module Wukong
99
99
  # let(:french_tokenizer) { processor(:complex_tokenizer, stemming: true, language: 'fr') }
100
100
  # ...
101
101
  # end
102
- def unit_test_runner *args
102
+ def unit_test_runner *args, &block
103
103
  settings = args.extract_options!
104
104
  name = (args.first || self.class.description)
105
- runner = UnitTestRunner.new(name, settings)
106
- yield runner.driver.processor if block_given?
107
- runner.boot!
108
- runner.driver
105
+ UnitTestRunner.new(name, settings).tap do |the_runner|
106
+ the_runner.program_name = 'wu-local'
107
+ yield the_runner.driver.processor if block_given?
108
+ the_runner.boot!(settings)
109
+ end.driver
109
110
  end
110
111
  alias_method :processor, :unit_test_runner
111
112
 
@@ -4,22 +4,12 @@ module Wukong
4
4
 
5
5
  include Wukong::DriverMethods
6
6
 
7
- def initialize label, settings
7
+ def initialize label, settings={}
8
8
  super()
9
- @settings = settings
10
- @dataflow = construct_dataflow(label, settings)
9
+ construct_dataflow(label, settings)
11
10
  setup_dataflow
12
11
  end
13
12
 
14
- def setup
15
- end
16
-
17
- def finalize
18
- end
19
-
20
- def stop
21
- end
22
-
23
13
  def process output
24
14
  self << output
25
15
  end
@@ -27,14 +17,14 @@ module Wukong
27
17
  def run
28
18
  return false unless dataflow
29
19
  given_records.each do |input|
30
- driver.send_through_dataflow(input)
20
+ send_through_dataflow(input)
31
21
  end
32
22
  finalize_and_stop_dataflow
33
23
  self
34
24
  end
35
25
 
36
26
  def processor
37
- dataflow.first
27
+ dataflow.root
38
28
  end
39
29
 
40
30
  # An array of accumulated records to process come match-time.
@@ -19,29 +19,28 @@ module Wukong
19
19
  # of the unit test back into the test suite
20
20
  class UnitTestRunner < Wukong::Local::LocalRunner
21
21
 
22
- # The processor this runner will create in the same way as
23
- # `wu-local`.
24
- attr_accessor :processor
25
-
26
22
  # Initialize a new UnitTestRunner for the processor with the
27
23
  # given `label` and `settings`.
28
24
  #
29
25
  # @param [Symbol] label
30
26
  # @param [Hash] settings
31
- def initialize label, settings
32
- self.processor = label
27
+ def initialize label, settings={}
28
+ @dataflow = label
33
29
  params = Configliere::Param.new
34
- params.use(:commandline)
35
30
  params.merge!(settings)
36
31
  super(params)
37
32
  end
38
33
 
34
+ def dataflow
35
+ @dataflow
36
+ end
37
+
39
38
  # Override the LocalDriver with the UnitTestDriver so we can
40
39
  # more easily pass in and retrieve processed records.
41
40
  #
42
41
  # @return [UnitTestDriver]
43
42
  def driver
44
- @driver ||= UnitTestDriver.new(processor, settings)
43
+ @driver ||= UnitTestDriver.new(dataflow, settings)
45
44
  end
46
45
 
47
46
  # No need to load commandline arguments when we are testing
@@ -1,3 +1,3 @@
1
1
  module Wukong
2
- VERSION = '3.0.1'
2
+ VERSION = '4.0.0'
3
3
  end
@@ -0,0 +1,55 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A widget that yields whatever you instantiate it with.
5
+ #
6
+ # This is most useful when you have a small but predictable input
7
+ # that you don't want or can't pass via usual input channels like
8
+ # STDIN.
9
+ #
10
+ # @example Works just like you think on the command line
11
+ #
12
+ # $ echo something else | wu-local echo --input=hello
13
+ # hello
14
+ #
15
+ # @example Pass some fixed input to your downstream code.
16
+ #
17
+ # # my_flow.rb
18
+ # Wukong.dataflow(:my_flow) do
19
+ # echo(input: {key: 'value'}) | my_proc | ...
20
+ # end
21
+ #
22
+ # This differs from the `:identity` processor because it
23
+ # doesn't pass on what it receives but what you instantiate it
24
+ # with.
25
+ #
26
+ # @see Identity
27
+ class Echo < Processor
28
+
29
+ description <<EOF
30
+ A widget that yields whatever you instantiate it with.
31
+
32
+ This is most useful when you have a small but predictable input
33
+ that you don't want or can't pass via usual input channels like
34
+ STDIN.
35
+
36
+ Works just like you think on the command line (the process won't terminate)
37
+
38
+ $ echo something else | wu-local echo --input=hello
39
+ hello
40
+ EOF
41
+
42
+ field :input, Whatever, :default => nil, :doc => "The record to echo"
43
+
44
+ # Yields the `input` no matter what you pass it.
45
+ #
46
+ # @param [Object] _ the new input record which is ignored
47
+ # @yield [input]
48
+ # @yieldparam [Object] input the original input
49
+ def process _
50
+ yield input
51
+ end
52
+ register
53
+ end
54
+ end
55
+ end
@@ -3,58 +3,6 @@ require_relative('utils')
3
3
  module Wukong
4
4
  class Processor
5
5
 
6
- # A widget that will log all incoming records.
7
- #
8
- # @example Logging records from the command line
9
- #
10
- # $ cat input
11
- # 1
12
- # 2
13
- # 3
14
- # $ cat input | wu-local logger
15
- # 2012-11-28 18:20:46 [INFO] Logger: 1
16
- # 2012-11-28 18:20:46 [INFO] Logger: 2
17
- # 2012-11-28 18:20:46 [INFO] Logger: 3
18
- #
19
- # @example Logging records within a dataflow
20
- #
21
- # Wukong.dataflow(:uses_logger) do
22
- # ... | logger
23
- # end
24
- class Logger < Processor
25
- field :level, Symbol, :default => :info, :doc => "Log level priority"
26
-
27
- description <<EOF
28
- This processor passes all input records unmodified, making a log
29
- statement on each one.
30
-
31
- $ cat input
32
- 1
33
- 2
34
- 3
35
- $ cat input | wu-local logger
36
- INFO 2013-01-04 17:10:59 [Logger ] -- 1
37
- INFO 2013-01-04 17:10:59 [Logger ] -- 2
38
- INFO 2013-01-04 17:10:59 [Logger ] -- 3
39
-
40
- You can set the priority level of the log messages with the --level
41
- flag.
42
-
43
- $ cat input | wu-local logger --level=debug
44
- DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
45
- DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
46
- DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
47
- EOF
48
-
49
- # Process a given `record` by logging it.
50
- #
51
- # @param [Object] record
52
- def process(record)
53
- log.send(level, record)
54
- end
55
- register
56
- end
57
-
58
6
  # A widget that extracts parts of incoming records.
59
7
  #
60
8
  # This widget can extract part of the following kinds of objects:
@@ -170,59 +118,5 @@ EOF
170
118
  end
171
119
  register
172
120
  end
173
-
174
- class Topic < Processor
175
-
176
- field :topic, Symbol, :doc => "Topic to label the record with"
177
-
178
- def process(record)
179
- yield perform_action(record)
180
- end
181
-
182
- def perform_action(record)
183
- assign_topic(record, topic)
184
- end
185
-
186
- def assign_topic(record, topic_name)
187
- record.define_singleton_method(:topic){ topic_name }
188
- record
189
- end
190
- register
191
- end
192
-
193
- # Until further notice, this processor is unusable due to the invocation of yield
194
- # class Foreach < Processor
195
- # def process(record, &blk)
196
- # perform_action(record, &blk)
197
- # end
198
- # register
199
- # end
200
-
201
- class Map < Processor
202
- def process(record)
203
- yield perform_action(record)
204
- end
205
- register
206
- end
207
-
208
- class Flatten < Processor
209
- def process(records)
210
- records.respond_to?(:each) ? records.each{ |record| yield(record) } : yield(records)
211
- end
212
- register
213
- end
214
-
215
- # Mixin processor behavior
216
- module BufferedProcessor
217
- def setup() ; end
218
- def process(record) @buffer << record ; end
219
- def stop() ; end
220
- end
221
-
222
- module StdoutProcessor
223
- def setup() $stdout.sync ; end
224
- def process(record) $stdout.puts record ; end
225
- def stop() ; end
226
- end
227
121
  end
228
122
  end
@@ -432,6 +432,21 @@ EOF
432
432
  end
433
433
  register
434
434
  end
435
+
436
+ # Select a record only if it is non-nil.
437
+ #
438
+ # @see Filter
439
+ class Compact < Filter
440
+
441
+ # Select a record only if it is non-nil.
442
+ #
443
+ # @param [Object] record
444
+ # @return [true, false]
445
+ def select?(record)
446
+ ! record.nil?
447
+ end
448
+ register
449
+ end
435
450
 
436
451
  end
437
452
  end
@@ -0,0 +1,56 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A widget that will log all incoming records.
5
+ #
6
+ # @example Logging records from the command line
7
+ #
8
+ # $ cat input
9
+ # 1
10
+ # 2
11
+ # 3
12
+ # $ cat input | wu-local logger
13
+ # 2012-11-28 18:20:46 [INFO] Logger: 1
14
+ # 2012-11-28 18:20:46 [INFO] Logger: 2
15
+ # 2012-11-28 18:20:46 [INFO] Logger: 3
16
+ #
17
+ # @example Logging records within a dataflow
18
+ #
19
+ # Wukong.dataflow(:uses_logger) do
20
+ # ... | logger
21
+ # end
22
+ class Logger < Processor
23
+ field :level, Symbol, :default => :info, :doc => "Log level priority"
24
+
25
+ description <<EOF
26
+ This processor passes all input records unmodified, making a log
27
+ statement on each one.
28
+
29
+ $ cat input
30
+ 1
31
+ 2
32
+ 3
33
+ $ cat input | wu-local logger
34
+ INFO 2013-01-04 17:10:59 [Logger ] -- 1
35
+ INFO 2013-01-04 17:10:59 [Logger ] -- 2
36
+ INFO 2013-01-04 17:10:59 [Logger ] -- 3
37
+
38
+ You can set the priority level of the log messages with the --level
39
+ flag.
40
+
41
+ $ cat input | wu-local logger --level=debug
42
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
43
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
44
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
45
+ EOF
46
+
47
+ # Process a given `record` by logging it.
48
+ #
49
+ # @param [Object] record
50
+ def process(record)
51
+ log.send(level, record)
52
+ end
53
+ register
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,82 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # Yield the result of this processor's action for each input
5
+ # record.
6
+ #
7
+ # @example Apply a function (like a parser) to each record
8
+ #
9
+ # Wukong.dataflow(:parser) do
10
+ # ... | map { |string| MyParser.parse(string) } | ...
11
+ # end
12
+ #
13
+ # @example Succinctly map between objects
14
+ #
15
+ # Wukong.dataflow(:converter) do
16
+ # ... | my_book_parser | map(&:author) | author_processor | ...
17
+ # end
18
+ #
19
+ # Can also be called with the :compact option which will check if
20
+ # the result of the action is non falsy before yielding.
21
+ #
22
+ # @example Mapping but only if it exists
23
+ #
24
+ # Wukong.dataflow(:converter_and_trimmer) do
25
+ # ... | my_book_parser | map(compact: true, &:author) | processor_that_needs_an_author | ...
26
+ # end
27
+ class Map < Processor
28
+
29
+ field :compact, :boolean, default: false
30
+
31
+ # Call #perform_action on the input_record and yield the
32
+ # returned output record.
33
+ #
34
+ # If #compact then only yield the output record if it is not
35
+ # falsy.
36
+ #
37
+ # @param [Object] input_record
38
+ # @yield [output_record] if compact, then only yield if it is not falsy
39
+ # @yieldparam [Object] output_record the result of #perform_action
40
+ #
41
+ # @see Flatten
42
+ def process(input_record)
43
+ output_record = perform_action(input_record)
44
+ if compact
45
+ yield output_record if output_record
46
+ else
47
+ yield output_record
48
+ end
49
+ end
50
+ register
51
+ end
52
+
53
+ # If an input record defines the #each method then yield each of
54
+ # its records. Otherwise yield the input record.
55
+ #
56
+ # @example Turning one record into many
57
+ #
58
+ # Wukong.dataflow(:authors_to_books) do
59
+ # ... | author_parser | map(&:books) | flatten | book_processor | ...
60
+ # end
61
+ #
62
+ # @see Map
63
+ class Flatten < Processor
64
+
65
+ # If input_record responds to #each then yield each of these as
66
+ # an output record. Else, just yield the input_record.
67
+ #
68
+ # @param [Object] input_record
69
+ # @yield [output_record]
70
+ # @yieldparam [Object] output_record
71
+ def process(input_record)
72
+ if input_record.respond_to?(:each)
73
+ input_record.each{ |output_record| yield(output_record) }
74
+ else
75
+ yield(input_record)
76
+ end
77
+ end
78
+ register
79
+ end
80
+
81
+ end
82
+ end