wukong 3.0.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -6,7 +6,6 @@ shared_examples_for 'a processor' do |options = {}|
6
6
  it{ processor(options[:named]).processor.should respond_to(:process) }
7
7
  it{ processor(options[:named]).processor.should respond_to(:finalize) }
8
8
  it{ processor(options[:named]).processor.should respond_to(:stop) }
9
- it{ processor(options[:named]).processor.should respond_to(:notify) }
10
9
  end
11
10
 
12
11
  shared_examples_for 'a plugin' do |options = {}|
@@ -99,13 +99,14 @@ module Wukong
99
99
  # let(:french_tokenizer) { processor(:complex_tokenizer, stemming: true, language: 'fr') }
100
100
  # ...
101
101
  # end
102
- def unit_test_runner *args
102
+ def unit_test_runner *args, &block
103
103
  settings = args.extract_options!
104
104
  name = (args.first || self.class.description)
105
- runner = UnitTestRunner.new(name, settings)
106
- yield runner.driver.processor if block_given?
107
- runner.boot!
108
- runner.driver
105
+ UnitTestRunner.new(name, settings).tap do |the_runner|
106
+ the_runner.program_name = 'wu-local'
107
+ yield the_runner.driver.processor if block_given?
108
+ the_runner.boot!(settings)
109
+ end.driver
109
110
  end
110
111
  alias_method :processor, :unit_test_runner
111
112
 
@@ -4,22 +4,12 @@ module Wukong
4
4
 
5
5
  include Wukong::DriverMethods
6
6
 
7
- def initialize label, settings
7
+ def initialize label, settings={}
8
8
  super()
9
- @settings = settings
10
- @dataflow = construct_dataflow(label, settings)
9
+ construct_dataflow(label, settings)
11
10
  setup_dataflow
12
11
  end
13
12
 
14
- def setup
15
- end
16
-
17
- def finalize
18
- end
19
-
20
- def stop
21
- end
22
-
23
13
  def process output
24
14
  self << output
25
15
  end
@@ -27,14 +17,14 @@ module Wukong
27
17
  def run
28
18
  return false unless dataflow
29
19
  given_records.each do |input|
30
- driver.send_through_dataflow(input)
20
+ send_through_dataflow(input)
31
21
  end
32
22
  finalize_and_stop_dataflow
33
23
  self
34
24
  end
35
25
 
36
26
  def processor
37
- dataflow.first
27
+ dataflow.root
38
28
  end
39
29
 
40
30
  # An array of accumulated records to process come match-time.
@@ -19,29 +19,28 @@ module Wukong
19
19
  # of the unit test back into the test suite
20
20
  class UnitTestRunner < Wukong::Local::LocalRunner
21
21
 
22
- # The processor this runner will create in the same way as
23
- # `wu-local`.
24
- attr_accessor :processor
25
-
26
22
  # Initialize a new UnitTestRunner for the processor with the
27
23
  # given `label` and `settings`.
28
24
  #
29
25
  # @param [Symbol] label
30
26
  # @param [Hash] settings
31
- def initialize label, settings
32
- self.processor = label
27
+ def initialize label, settings={}
28
+ @dataflow = label
33
29
  params = Configliere::Param.new
34
- params.use(:commandline)
35
30
  params.merge!(settings)
36
31
  super(params)
37
32
  end
38
33
 
34
+ def dataflow
35
+ @dataflow
36
+ end
37
+
39
38
  # Override the LocalDriver with the UnitTestDriver so we can
40
39
  # more easily pass in and retrieve processed records.
41
40
  #
42
41
  # @return [UnitTestDriver]
43
42
  def driver
44
- @driver ||= UnitTestDriver.new(processor, settings)
43
+ @driver ||= UnitTestDriver.new(dataflow, settings)
45
44
  end
46
45
 
47
46
  # No need to load commandline arguments when we are testing
@@ -1,3 +1,3 @@
1
1
  module Wukong
2
- VERSION = '3.0.1'
2
+ VERSION = '4.0.0'
3
3
  end
@@ -0,0 +1,55 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A widget that yields whatever you instantiate it with.
5
+ #
6
+ # This is most useful when you have a small but predictable input
7
+ # that you don't want or can't pass via usual input channels like
8
+ # STDIN.
9
+ #
10
+ # @example Works just like you think on the command line
11
+ #
12
+ # $ echo something else | wu-local echo --input=hello
13
+ # hello
14
+ #
15
+ # @example Pass some fixed input to your downstream code.
16
+ #
17
+ # # my_flow.rb
18
+ # Wukong.dataflow(:my_flow) do
19
+ # echo(input: {key: 'value'}) | my_proc | ...
20
+ # end
21
+ #
22
+ # This differs from the `:identity` processor because it
23
+ # doesn't pass on what it receives but what you instantiate it
24
+ # with.
25
+ #
26
+ # @see Identity
27
+ class Echo < Processor
28
+
29
+ description <<EOF
30
+ A widget that yields whatever you instantiate it with.
31
+
32
+ This is most useful when you have a small but predictable input
33
+ that you don't want or can't pass via usual input channels like
34
+ STDIN.
35
+
36
+ Works just like you think on the command line (the process won't terminate)
37
+
38
+ $ echo something else | wu-local echo --input=hello
39
+ hello
40
+ EOF
41
+
42
+ field :input, Whatever, :default => nil, :doc => "The record to echo"
43
+
44
+ # Yields the `input` no matter what you pass it.
45
+ #
46
+ # @param [Object] _ the new input record which is ignored
47
+ # @yield [input]
48
+ # @yieldparam [Object] input the original input
49
+ def process _
50
+ yield input
51
+ end
52
+ register
53
+ end
54
+ end
55
+ end
@@ -3,58 +3,6 @@ require_relative('utils')
3
3
  module Wukong
4
4
  class Processor
5
5
 
6
- # A widget that will log all incoming records.
7
- #
8
- # @example Logging records from the command line
9
- #
10
- # $ cat input
11
- # 1
12
- # 2
13
- # 3
14
- # $ cat input | wu-local logger
15
- # 2012-11-28 18:20:46 [INFO] Logger: 1
16
- # 2012-11-28 18:20:46 [INFO] Logger: 2
17
- # 2012-11-28 18:20:46 [INFO] Logger: 3
18
- #
19
- # @example Logging records within a dataflow
20
- #
21
- # Wukong.dataflow(:uses_logger) do
22
- # ... | logger
23
- # end
24
- class Logger < Processor
25
- field :level, Symbol, :default => :info, :doc => "Log level priority"
26
-
27
- description <<EOF
28
- This processor passes all input records unmodified, making a log
29
- statement on each one.
30
-
31
- $ cat input
32
- 1
33
- 2
34
- 3
35
- $ cat input | wu-local logger
36
- INFO 2013-01-04 17:10:59 [Logger ] -- 1
37
- INFO 2013-01-04 17:10:59 [Logger ] -- 2
38
- INFO 2013-01-04 17:10:59 [Logger ] -- 3
39
-
40
- You can set the priority level of the log messages with the --level
41
- flag.
42
-
43
- $ cat input | wu-local logger --level=debug
44
- DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
45
- DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
46
- DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
47
- EOF
48
-
49
- # Process a given `record` by logging it.
50
- #
51
- # @param [Object] record
52
- def process(record)
53
- log.send(level, record)
54
- end
55
- register
56
- end
57
-
58
6
  # A widget that extracts parts of incoming records.
59
7
  #
60
8
  # This widget can extract part of the following kinds of objects:
@@ -170,59 +118,5 @@ EOF
170
118
  end
171
119
  register
172
120
  end
173
-
174
- class Topic < Processor
175
-
176
- field :topic, Symbol, :doc => "Topic to label the record with"
177
-
178
- def process(record)
179
- yield perform_action(record)
180
- end
181
-
182
- def perform_action(record)
183
- assign_topic(record, topic)
184
- end
185
-
186
- def assign_topic(record, topic_name)
187
- record.define_singleton_method(:topic){ topic_name }
188
- record
189
- end
190
- register
191
- end
192
-
193
- # Until further notice, this processor is unusable due to the invocation of yield
194
- # class Foreach < Processor
195
- # def process(record, &blk)
196
- # perform_action(record, &blk)
197
- # end
198
- # register
199
- # end
200
-
201
- class Map < Processor
202
- def process(record)
203
- yield perform_action(record)
204
- end
205
- register
206
- end
207
-
208
- class Flatten < Processor
209
- def process(records)
210
- records.respond_to?(:each) ? records.each{ |record| yield(record) } : yield(records)
211
- end
212
- register
213
- end
214
-
215
- # Mixin processor behavior
216
- module BufferedProcessor
217
- def setup() ; end
218
- def process(record) @buffer << record ; end
219
- def stop() ; end
220
- end
221
-
222
- module StdoutProcessor
223
- def setup() $stdout.sync ; end
224
- def process(record) $stdout.puts record ; end
225
- def stop() ; end
226
- end
227
121
  end
228
122
  end
@@ -432,6 +432,21 @@ EOF
432
432
  end
433
433
  register
434
434
  end
435
+
436
+ # Select a record only if it is non-nil.
437
+ #
438
+ # @see Filter
439
+ class Compact < Filter
440
+
441
+ # Select a record only if it is non-nil.
442
+ #
443
+ # @param [Object] record
444
+ # @return [true, false]
445
+ def select?(record)
446
+ ! record.nil?
447
+ end
448
+ register
449
+ end
435
450
 
436
451
  end
437
452
  end
@@ -0,0 +1,56 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A widget that will log all incoming records.
5
+ #
6
+ # @example Logging records from the command line
7
+ #
8
+ # $ cat input
9
+ # 1
10
+ # 2
11
+ # 3
12
+ # $ cat input | wu-local logger
13
+ # 2012-11-28 18:20:46 [INFO] Logger: 1
14
+ # 2012-11-28 18:20:46 [INFO] Logger: 2
15
+ # 2012-11-28 18:20:46 [INFO] Logger: 3
16
+ #
17
+ # @example Logging records within a dataflow
18
+ #
19
+ # Wukong.dataflow(:uses_logger) do
20
+ # ... | logger
21
+ # end
22
+ class Logger < Processor
23
+ field :level, Symbol, :default => :info, :doc => "Log level priority"
24
+
25
+ description <<EOF
26
+ This processor passes all input records unmodified, making a log
27
+ statement on each one.
28
+
29
+ $ cat input
30
+ 1
31
+ 2
32
+ 3
33
+ $ cat input | wu-local logger
34
+ INFO 2013-01-04 17:10:59 [Logger ] -- 1
35
+ INFO 2013-01-04 17:10:59 [Logger ] -- 2
36
+ INFO 2013-01-04 17:10:59 [Logger ] -- 3
37
+
38
+ You can set the priority level of the log messages with the --level
39
+ flag.
40
+
41
+ $ cat input | wu-local logger --level=debug
42
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
43
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
44
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
45
+ EOF
46
+
47
+ # Process a given `record` by logging it.
48
+ #
49
+ # @param [Object] record
50
+ def process(record)
51
+ log.send(level, record)
52
+ end
53
+ register
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,82 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # Yield the result of this processor's action for each input
5
+ # record.
6
+ #
7
+ # @example Apply a function (like a parser) to each record
8
+ #
9
+ # Wukong.dataflow(:parser) do
10
+ # ... | map { |string| MyParser.parse(string) } | ...
11
+ # end
12
+ #
13
+ # @example Succinctly map between objects
14
+ #
15
+ # Wukong.dataflow(:converter) do
16
+ # ... | my_book_parser | map(&:author) | author_processor | ...
17
+ # end
18
+ #
19
+ # Can also be called with the :compact option which will check if
20
+ # the result of the action is non falsy before yielding.
21
+ #
22
+ # @example Mapping but only if it exists
23
+ #
24
+ # Wukong.dataflow(:converter_and_trimmer) do
25
+ # ... | my_book_parser | map(compact: true, &:author) | processor_that_needs_an_author | ...
26
+ # end
27
+ class Map < Processor
28
+
29
+ field :compact, :boolean, default: false
30
+
31
+ # Call #perform_action on the input_record and yield the
32
+ # returned output record.
33
+ #
34
+ # If #compact then only yield the output record if it is not
35
+ # falsy.
36
+ #
37
+ # @param [Object] input_record
38
+ # @yield [output_record] if compact, then only yield if it is not falsy
39
+ # @yieldparam [Object] output_record the result of #perform_action
40
+ #
41
+ # @see Flatten
42
+ def process(input_record)
43
+ output_record = perform_action(input_record)
44
+ if compact
45
+ yield output_record if output_record
46
+ else
47
+ yield output_record
48
+ end
49
+ end
50
+ register
51
+ end
52
+
53
+ # If an input record defines the #each method then yield each of
54
+ # its records. Otherwise yield the input record.
55
+ #
56
+ # @example Turning one record into many
57
+ #
58
+ # Wukong.dataflow(:authors_to_books) do
59
+ # ... | author_parser | map(&:books) | flatten | book_processor | ...
60
+ # end
61
+ #
62
+ # @see Map
63
+ class Flatten < Processor
64
+
65
+ # If input_record responds to #each then yield each of these as
66
+ # an output record. Else, just yield the input_record.
67
+ #
68
+ # @param [Object] input_record
69
+ # @yield [output_record]
70
+ # @yieldparam [Object] output_record
71
+ def process(input_record)
72
+ if input_record.respond_to?(:each)
73
+ input_record.each{ |output_record| yield(output_record) }
74
+ else
75
+ yield(input_record)
76
+ end
77
+ end
78
+ register
79
+ end
80
+
81
+ end
82
+ end