wukong 3.0.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +1 -1
- data/README.md +253 -45
- data/bin/wu +34 -0
- data/bin/wu-source +5 -0
- data/examples/Gemfile +0 -1
- data/examples/deploy_pack/Gemfile +0 -1
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/ruby_project/Gemfile +0 -1
- data/examples/splitter.rb +94 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +1 -1
- data/lib/hanuman/graph.rb +39 -22
- data/lib/hanuman/stage.rb +46 -13
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wukong.rb +6 -1
- data/lib/wukong/dataflow.rb +19 -48
- data/lib/wukong/driver.rb +176 -65
- data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +6 -4
- data/lib/wukong/local/runner.rb +14 -16
- data/lib/wukong/local/stdio_driver.rb +72 -12
- data/lib/wukong/processor.rb +1 -30
- data/lib/wukong/runner.rb +2 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
- data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
- data/lib/wukong/version.rb +1 -1
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
- data/lib/wukong/widget/filters.rb +15 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +2 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/serializers.rb +21 -6
- data/lib/wukong/widgets.rb +6 -3
- data/spec/hanuman/graph_spec.rb +73 -10
- data/spec/hanuman/stage_spec.rb +15 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec_helper.rb +13 -1
- data/spec/support/example_test_helpers.rb +0 -1
- data/spec/support/model_test_helpers.rb +1 -1
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_examples_for_builders.rb +8 -15
- data/spec/wukong/driver_spec.rb +152 -0
- data/spec/wukong/local/runner_spec.rb +1 -12
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/processor_spec.rb +0 -1
- data/spec/wukong/runner_spec.rb +2 -2
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/wukong.gemspec +1 -2
- metadata +45 -28
- data/lib/wukong/local/tcp_driver.rb +0 -47
- data/spec/wu/geo/geolocated_spec.rb +0 -247
- data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -6,7 +6,6 @@ shared_examples_for 'a processor' do |options = {}|
|
|
6
6
|
it{ processor(options[:named]).processor.should respond_to(:process) }
|
7
7
|
it{ processor(options[:named]).processor.should respond_to(:finalize) }
|
8
8
|
it{ processor(options[:named]).processor.should respond_to(:stop) }
|
9
|
-
it{ processor(options[:named]).processor.should respond_to(:notify) }
|
10
9
|
end
|
11
10
|
|
12
11
|
shared_examples_for 'a plugin' do |options = {}|
|
@@ -99,13 +99,14 @@ module Wukong
|
|
99
99
|
# let(:french_tokenizer) { processor(:complex_tokenizer, stemming: true, language: 'fr') }
|
100
100
|
# ...
|
101
101
|
# end
|
102
|
-
def unit_test_runner *args
|
102
|
+
def unit_test_runner *args, &block
|
103
103
|
settings = args.extract_options!
|
104
104
|
name = (args.first || self.class.description)
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
105
|
+
UnitTestRunner.new(name, settings).tap do |the_runner|
|
106
|
+
the_runner.program_name = 'wu-local'
|
107
|
+
yield the_runner.driver.processor if block_given?
|
108
|
+
the_runner.boot!(settings)
|
109
|
+
end.driver
|
109
110
|
end
|
110
111
|
alias_method :processor, :unit_test_runner
|
111
112
|
|
@@ -4,22 +4,12 @@ module Wukong
|
|
4
4
|
|
5
5
|
include Wukong::DriverMethods
|
6
6
|
|
7
|
-
def initialize label, settings
|
7
|
+
def initialize label, settings={}
|
8
8
|
super()
|
9
|
-
|
10
|
-
@dataflow = construct_dataflow(label, settings)
|
9
|
+
construct_dataflow(label, settings)
|
11
10
|
setup_dataflow
|
12
11
|
end
|
13
12
|
|
14
|
-
def setup
|
15
|
-
end
|
16
|
-
|
17
|
-
def finalize
|
18
|
-
end
|
19
|
-
|
20
|
-
def stop
|
21
|
-
end
|
22
|
-
|
23
13
|
def process output
|
24
14
|
self << output
|
25
15
|
end
|
@@ -27,14 +17,14 @@ module Wukong
|
|
27
17
|
def run
|
28
18
|
return false unless dataflow
|
29
19
|
given_records.each do |input|
|
30
|
-
|
20
|
+
send_through_dataflow(input)
|
31
21
|
end
|
32
22
|
finalize_and_stop_dataflow
|
33
23
|
self
|
34
24
|
end
|
35
25
|
|
36
26
|
def processor
|
37
|
-
dataflow.
|
27
|
+
dataflow.root
|
38
28
|
end
|
39
29
|
|
40
30
|
# An array of accumulated records to process come match-time.
|
@@ -19,29 +19,28 @@ module Wukong
|
|
19
19
|
# of the unit test back into the test suite
|
20
20
|
class UnitTestRunner < Wukong::Local::LocalRunner
|
21
21
|
|
22
|
-
# The processor this runner will create in the same way as
|
23
|
-
# `wu-local`.
|
24
|
-
attr_accessor :processor
|
25
|
-
|
26
22
|
# Initialize a new UnitTestRunner for the processor with the
|
27
23
|
# given `label` and `settings`.
|
28
24
|
#
|
29
25
|
# @param [Symbol] label
|
30
26
|
# @param [Hash] settings
|
31
|
-
def initialize label, settings
|
32
|
-
|
27
|
+
def initialize label, settings={}
|
28
|
+
@dataflow = label
|
33
29
|
params = Configliere::Param.new
|
34
|
-
params.use(:commandline)
|
35
30
|
params.merge!(settings)
|
36
31
|
super(params)
|
37
32
|
end
|
38
33
|
|
34
|
+
def dataflow
|
35
|
+
@dataflow
|
36
|
+
end
|
37
|
+
|
39
38
|
# Override the LocalDriver with the UnitTestDriver so we can
|
40
39
|
# more easily pass in and retrieve processed records.
|
41
40
|
#
|
42
41
|
# @return [UnitTestDriver]
|
43
42
|
def driver
|
44
|
-
@driver ||= UnitTestDriver.new(
|
43
|
+
@driver ||= UnitTestDriver.new(dataflow, settings)
|
45
44
|
end
|
46
45
|
|
47
46
|
# No need to load commandline arguments when we are testing
|
data/lib/wukong/version.rb
CHANGED
@@ -0,0 +1,55 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# A widget that yields whatever you instantiate it with.
|
5
|
+
#
|
6
|
+
# This is most useful when you have a small but predictable input
|
7
|
+
# that you don't want or can't pass via usual input channels like
|
8
|
+
# STDIN.
|
9
|
+
#
|
10
|
+
# @example Works just like you think on the command line
|
11
|
+
#
|
12
|
+
# $ echo something else | wu-local echo --input=hello
|
13
|
+
# hello
|
14
|
+
#
|
15
|
+
# @example Pass some fixed input to your downstream code.
|
16
|
+
#
|
17
|
+
# # my_flow.rb
|
18
|
+
# Wukong.dataflow(:my_flow) do
|
19
|
+
# echo(input: {key: 'value'}) | my_proc | ...
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# This differs from the `:identity` processor because it
|
23
|
+
# doesn't pass on what it receives but what you instantiate it
|
24
|
+
# with.
|
25
|
+
#
|
26
|
+
# @see Identity
|
27
|
+
class Echo < Processor
|
28
|
+
|
29
|
+
description <<EOF
|
30
|
+
A widget that yields whatever you instantiate it with.
|
31
|
+
|
32
|
+
This is most useful when you have a small but predictable input
|
33
|
+
that you don't want or can't pass via usual input channels like
|
34
|
+
STDIN.
|
35
|
+
|
36
|
+
Works just like you think on the command line (the process won't terminate)
|
37
|
+
|
38
|
+
$ echo something else | wu-local echo --input=hello
|
39
|
+
hello
|
40
|
+
EOF
|
41
|
+
|
42
|
+
field :input, Whatever, :default => nil, :doc => "The record to echo"
|
43
|
+
|
44
|
+
# Yields the `input` no matter what you pass it.
|
45
|
+
#
|
46
|
+
# @param [Object] _ the new input record which is ignored
|
47
|
+
# @yield [input]
|
48
|
+
# @yieldparam [Object] input the original input
|
49
|
+
def process _
|
50
|
+
yield input
|
51
|
+
end
|
52
|
+
register
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -3,58 +3,6 @@ require_relative('utils')
|
|
3
3
|
module Wukong
|
4
4
|
class Processor
|
5
5
|
|
6
|
-
# A widget that will log all incoming records.
|
7
|
-
#
|
8
|
-
# @example Logging records from the command line
|
9
|
-
#
|
10
|
-
# $ cat input
|
11
|
-
# 1
|
12
|
-
# 2
|
13
|
-
# 3
|
14
|
-
# $ cat input | wu-local logger
|
15
|
-
# 2012-11-28 18:20:46 [INFO] Logger: 1
|
16
|
-
# 2012-11-28 18:20:46 [INFO] Logger: 2
|
17
|
-
# 2012-11-28 18:20:46 [INFO] Logger: 3
|
18
|
-
#
|
19
|
-
# @example Logging records within a dataflow
|
20
|
-
#
|
21
|
-
# Wukong.dataflow(:uses_logger) do
|
22
|
-
# ... | logger
|
23
|
-
# end
|
24
|
-
class Logger < Processor
|
25
|
-
field :level, Symbol, :default => :info, :doc => "Log level priority"
|
26
|
-
|
27
|
-
description <<EOF
|
28
|
-
This processor passes all input records unmodified, making a log
|
29
|
-
statement on each one.
|
30
|
-
|
31
|
-
$ cat input
|
32
|
-
1
|
33
|
-
2
|
34
|
-
3
|
35
|
-
$ cat input | wu-local logger
|
36
|
-
INFO 2013-01-04 17:10:59 [Logger ] -- 1
|
37
|
-
INFO 2013-01-04 17:10:59 [Logger ] -- 2
|
38
|
-
INFO 2013-01-04 17:10:59 [Logger ] -- 3
|
39
|
-
|
40
|
-
You can set the priority level of the log messages with the --level
|
41
|
-
flag.
|
42
|
-
|
43
|
-
$ cat input | wu-local logger --level=debug
|
44
|
-
DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
|
45
|
-
DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
|
46
|
-
DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
|
47
|
-
EOF
|
48
|
-
|
49
|
-
# Process a given `record` by logging it.
|
50
|
-
#
|
51
|
-
# @param [Object] record
|
52
|
-
def process(record)
|
53
|
-
log.send(level, record)
|
54
|
-
end
|
55
|
-
register
|
56
|
-
end
|
57
|
-
|
58
6
|
# A widget that extracts parts of incoming records.
|
59
7
|
#
|
60
8
|
# This widget can extract part of the following kinds of objects:
|
@@ -170,59 +118,5 @@ EOF
|
|
170
118
|
end
|
171
119
|
register
|
172
120
|
end
|
173
|
-
|
174
|
-
class Topic < Processor
|
175
|
-
|
176
|
-
field :topic, Symbol, :doc => "Topic to label the record with"
|
177
|
-
|
178
|
-
def process(record)
|
179
|
-
yield perform_action(record)
|
180
|
-
end
|
181
|
-
|
182
|
-
def perform_action(record)
|
183
|
-
assign_topic(record, topic)
|
184
|
-
end
|
185
|
-
|
186
|
-
def assign_topic(record, topic_name)
|
187
|
-
record.define_singleton_method(:topic){ topic_name }
|
188
|
-
record
|
189
|
-
end
|
190
|
-
register
|
191
|
-
end
|
192
|
-
|
193
|
-
# Until further notice, this processor is unusable due to the invocation of yield
|
194
|
-
# class Foreach < Processor
|
195
|
-
# def process(record, &blk)
|
196
|
-
# perform_action(record, &blk)
|
197
|
-
# end
|
198
|
-
# register
|
199
|
-
# end
|
200
|
-
|
201
|
-
class Map < Processor
|
202
|
-
def process(record)
|
203
|
-
yield perform_action(record)
|
204
|
-
end
|
205
|
-
register
|
206
|
-
end
|
207
|
-
|
208
|
-
class Flatten < Processor
|
209
|
-
def process(records)
|
210
|
-
records.respond_to?(:each) ? records.each{ |record| yield(record) } : yield(records)
|
211
|
-
end
|
212
|
-
register
|
213
|
-
end
|
214
|
-
|
215
|
-
# Mixin processor behavior
|
216
|
-
module BufferedProcessor
|
217
|
-
def setup() ; end
|
218
|
-
def process(record) @buffer << record ; end
|
219
|
-
def stop() ; end
|
220
|
-
end
|
221
|
-
|
222
|
-
module StdoutProcessor
|
223
|
-
def setup() $stdout.sync ; end
|
224
|
-
def process(record) $stdout.puts record ; end
|
225
|
-
def stop() ; end
|
226
|
-
end
|
227
121
|
end
|
228
122
|
end
|
@@ -432,6 +432,21 @@ EOF
|
|
432
432
|
end
|
433
433
|
register
|
434
434
|
end
|
435
|
+
|
436
|
+
# Select a record only if it is non-nil.
|
437
|
+
#
|
438
|
+
# @see Filter
|
439
|
+
class Compact < Filter
|
440
|
+
|
441
|
+
# Select a record only if it is non-nil.
|
442
|
+
#
|
443
|
+
# @param [Object] record
|
444
|
+
# @return [true, false]
|
445
|
+
def select?(record)
|
446
|
+
! record.nil?
|
447
|
+
end
|
448
|
+
register
|
449
|
+
end
|
435
450
|
|
436
451
|
end
|
437
452
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# A widget that will log all incoming records.
|
5
|
+
#
|
6
|
+
# @example Logging records from the command line
|
7
|
+
#
|
8
|
+
# $ cat input
|
9
|
+
# 1
|
10
|
+
# 2
|
11
|
+
# 3
|
12
|
+
# $ cat input | wu-local logger
|
13
|
+
# 2012-11-28 18:20:46 [INFO] Logger: 1
|
14
|
+
# 2012-11-28 18:20:46 [INFO] Logger: 2
|
15
|
+
# 2012-11-28 18:20:46 [INFO] Logger: 3
|
16
|
+
#
|
17
|
+
# @example Logging records within a dataflow
|
18
|
+
#
|
19
|
+
# Wukong.dataflow(:uses_logger) do
|
20
|
+
# ... | logger
|
21
|
+
# end
|
22
|
+
class Logger < Processor
|
23
|
+
field :level, Symbol, :default => :info, :doc => "Log level priority"
|
24
|
+
|
25
|
+
description <<EOF
|
26
|
+
This processor passes all input records unmodified, making a log
|
27
|
+
statement on each one.
|
28
|
+
|
29
|
+
$ cat input
|
30
|
+
1
|
31
|
+
2
|
32
|
+
3
|
33
|
+
$ cat input | wu-local logger
|
34
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 1
|
35
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 2
|
36
|
+
INFO 2013-01-04 17:10:59 [Logger ] -- 3
|
37
|
+
|
38
|
+
You can set the priority level of the log messages with the --level
|
39
|
+
flag.
|
40
|
+
|
41
|
+
$ cat input | wu-local logger --level=debug
|
42
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
|
43
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
|
44
|
+
DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
|
45
|
+
EOF
|
46
|
+
|
47
|
+
# Process a given `record` by logging it.
|
48
|
+
#
|
49
|
+
# @param [Object] record
|
50
|
+
def process(record)
|
51
|
+
log.send(level, record)
|
52
|
+
end
|
53
|
+
register
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# Yield the result of this processor's action for each input
|
5
|
+
# record.
|
6
|
+
#
|
7
|
+
# @example Apply a function (like a parser) to each record
|
8
|
+
#
|
9
|
+
# Wukong.dataflow(:parser) do
|
10
|
+
# ... | map { |string| MyParser.parse(string) } | ...
|
11
|
+
# end
|
12
|
+
#
|
13
|
+
# @example Succinctly map between objects
|
14
|
+
#
|
15
|
+
# Wukong.dataflow(:converter) do
|
16
|
+
# ... | my_book_parser | map(&:author) | author_processor | ...
|
17
|
+
# end
|
18
|
+
#
|
19
|
+
# Can also be called with the :compact option which will check if
|
20
|
+
# the result of the action is non falsy before yielding.
|
21
|
+
#
|
22
|
+
# @example Mapping but only if it exists
|
23
|
+
#
|
24
|
+
# Wukong.dataflow(:converter_and_trimmer) do
|
25
|
+
# ... | my_book_parser | map(compact: true, &:author) | processor_that_needs_an_author | ...
|
26
|
+
# end
|
27
|
+
class Map < Processor
|
28
|
+
|
29
|
+
field :compact, :boolean, default: false
|
30
|
+
|
31
|
+
# Call #perform_action on the input_record and yield the
|
32
|
+
# returned output record.
|
33
|
+
#
|
34
|
+
# If #compact then only yield the output record if it is not
|
35
|
+
# falsy.
|
36
|
+
#
|
37
|
+
# @param [Object] input_record
|
38
|
+
# @yield [output_record] if compact, then only yield if it is not falsy
|
39
|
+
# @yieldparam [Object] output_record the result of #perform_action
|
40
|
+
#
|
41
|
+
# @see Flatten
|
42
|
+
def process(input_record)
|
43
|
+
output_record = perform_action(input_record)
|
44
|
+
if compact
|
45
|
+
yield output_record if output_record
|
46
|
+
else
|
47
|
+
yield output_record
|
48
|
+
end
|
49
|
+
end
|
50
|
+
register
|
51
|
+
end
|
52
|
+
|
53
|
+
# If an input record defines the #each method then yield each of
|
54
|
+
# its records. Otherwise yield the input record.
|
55
|
+
#
|
56
|
+
# @example Turning one record into many
|
57
|
+
#
|
58
|
+
# Wukong.dataflow(:authors_to_books) do
|
59
|
+
# ... | author_parser | map(&:books) | flatten | book_processor | ...
|
60
|
+
# end
|
61
|
+
#
|
62
|
+
# @see Map
|
63
|
+
class Flatten < Processor
|
64
|
+
|
65
|
+
# If input_record responds to #each then yield each of these as
|
66
|
+
# an output record. Else, just yield the input_record.
|
67
|
+
#
|
68
|
+
# @param [Object] input_record
|
69
|
+
# @yield [output_record]
|
70
|
+
# @yieldparam [Object] output_record
|
71
|
+
def process(input_record)
|
72
|
+
if input_record.respond_to?(:each)
|
73
|
+
input_record.each{ |output_record| yield(output_record) }
|
74
|
+
else
|
75
|
+
yield(input_record)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
register
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
82
|
+
end
|