wukong 3.0.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +1 -1
- data/README.md +253 -45
- data/bin/wu +34 -0
- data/bin/wu-source +5 -0
- data/examples/Gemfile +0 -1
- data/examples/deploy_pack/Gemfile +0 -1
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/ruby_project/Gemfile +0 -1
- data/examples/splitter.rb +94 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +1 -1
- data/lib/hanuman/graph.rb +39 -22
- data/lib/hanuman/stage.rb +46 -13
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wukong.rb +6 -1
- data/lib/wukong/dataflow.rb +19 -48
- data/lib/wukong/driver.rb +176 -65
- data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +6 -4
- data/lib/wukong/local/runner.rb +14 -16
- data/lib/wukong/local/stdio_driver.rb +72 -12
- data/lib/wukong/processor.rb +1 -30
- data/lib/wukong/runner.rb +2 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
- data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
- data/lib/wukong/version.rb +1 -1
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
- data/lib/wukong/widget/filters.rb +15 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +2 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/serializers.rb +21 -6
- data/lib/wukong/widgets.rb +6 -3
- data/spec/hanuman/graph_spec.rb +73 -10
- data/spec/hanuman/stage_spec.rb +15 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec_helper.rb +13 -1
- data/spec/support/example_test_helpers.rb +0 -1
- data/spec/support/model_test_helpers.rb +1 -1
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_examples_for_builders.rb +8 -15
- data/spec/wukong/driver_spec.rb +152 -0
- data/spec/wukong/local/runner_spec.rb +1 -12
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/processor_spec.rb +0 -1
- data/spec/wukong/runner_spec.rb +2 -2
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/wukong.gemspec +1 -2
- metadata +45 -28
- data/lib/wukong/local/tcp_driver.rb +0 -47
- data/spec/wu/geo/geolocated_spec.rb +0 -247
- data/spec/wukong/widget/processors_spec.rb +0 -125
data/bin/wu-source
ADDED
data/examples/Gemfile
CHANGED
@@ -0,0 +1,73 @@
|
|
1
|
+
|
2
|
+
#
|
3
|
+
# Stupidly simple class for holding a twitter user's tweet summary
|
4
|
+
#
|
5
|
+
class UserTweetSummary
|
6
|
+
attr_accessor :fields
|
7
|
+
|
8
|
+
def initialize fields
|
9
|
+
@fields = fields
|
10
|
+
end
|
11
|
+
|
12
|
+
def update tweets
|
13
|
+
deletes_this_batch = 0
|
14
|
+
tweets_this_batch = 0
|
15
|
+
tweets.each do |tweet|
|
16
|
+
deletes_this_batch += (tweet.is_deleted ? 1 : 0)
|
17
|
+
tweets_this_batch += 1
|
18
|
+
end
|
19
|
+
fields['deletes'] += deletes_this_batch
|
20
|
+
fields['tweets'] += (tweets_this_batch - deletes_this_batch)
|
21
|
+
self
|
22
|
+
end
|
23
|
+
|
24
|
+
def to_json *args
|
25
|
+
fields.to_json
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
#
|
30
|
+
# Holds a tweet and whether
|
31
|
+
# or not it's been deleted
|
32
|
+
#
|
33
|
+
class Tweet
|
34
|
+
attr_accessor :is_deleted
|
35
|
+
def initialize hash
|
36
|
+
@is_deleted = hash.keys.include?("delete")
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
#
|
42
|
+
# Improver processor
|
43
|
+
#
|
44
|
+
class TweetSummarizer < Wukong::Processor::Improver
|
45
|
+
attr_accessor :user_id
|
46
|
+
|
47
|
+
def zero
|
48
|
+
super
|
49
|
+
{
|
50
|
+
'tweets' => 0,
|
51
|
+
'deletes' => 0
|
52
|
+
}
|
53
|
+
end
|
54
|
+
|
55
|
+
def accumulate record
|
56
|
+
@user_id = record[0]
|
57
|
+
json = record[1]
|
58
|
+
self.group << Tweet.new(JSON.parse(json))
|
59
|
+
end
|
60
|
+
|
61
|
+
def improve summary, deltas
|
62
|
+
UserTweetSummary.new(JSON.parse(summary)).update(deltas)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
#
|
67
|
+
# Is this necessary?
|
68
|
+
#
|
69
|
+
TweetSummarizer.register(:tweet_summarizer)
|
70
|
+
|
71
|
+
Wukong.dataflow(:summarize_tweets) do
|
72
|
+
tweet_summarizer | to_json
|
73
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'wukong'
|
2
|
+
|
3
|
+
module Verbose
|
4
|
+
|
5
|
+
def verbose?
|
6
|
+
end
|
7
|
+
|
8
|
+
def setup
|
9
|
+
# log.info("Setting up #{label}")
|
10
|
+
end
|
11
|
+
|
12
|
+
def finalize
|
13
|
+
# log.info("Finalizing #{label}")
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
Wukong.processor(:upcaser) do
|
18
|
+
include Verbose
|
19
|
+
def process(string)
|
20
|
+
# log.info("#process #{string}")
|
21
|
+
yield string.upcase
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
Wukong.processor(:downcaser) do
|
26
|
+
include Verbose
|
27
|
+
def process(string)
|
28
|
+
# log.info("#process #{string}")
|
29
|
+
yield string.downcase
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
Wukong.processor(:tokenizer) do
|
34
|
+
include Verbose
|
35
|
+
def process string
|
36
|
+
# log.info("#process #{string}")
|
37
|
+
string.split.each { |token| yield token }
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
Wukong.processor(:stripper) do
|
42
|
+
include Verbose
|
43
|
+
def process(string)
|
44
|
+
# log.info("#process #{string}")
|
45
|
+
yield string.gsub(/[^\w\s]/,'')
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
Wukong.processor(:devoweler) do
|
50
|
+
include Verbose
|
51
|
+
def process(string)
|
52
|
+
# log.info("#process #{string}")
|
53
|
+
yield string.gsub(/[aeiou]/i,'')
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# stripper = Wukong.registry.retrieve(:stripper)
|
58
|
+
# tokenizer = Wukong.registry.retrieve(:tokenizer)
|
59
|
+
# upcaser = Wukong.registry.retrieve(:upcaser)
|
60
|
+
# downcaser = Wukong.registry.retrieve(:downcaser)
|
61
|
+
# devoweler = Wukong.registry.retrieve(:devoweler)
|
62
|
+
|
63
|
+
# Splitter = Class.new(Wukong::Dataflow)
|
64
|
+
# builder = Wukong::DataflowBuilder.receive({label: :splitter,
|
65
|
+
# for_class: Splitter,
|
66
|
+
# stages: {
|
67
|
+
# stripper: stripper,
|
68
|
+
# tokenizer: tokenizer,
|
69
|
+
# upcaser: upcaser,
|
70
|
+
# downcaser: downcaser,
|
71
|
+
# devoweler: devoweler,
|
72
|
+
# },
|
73
|
+
# links: [
|
74
|
+
# Hanuman::LinkFactory.connect(:simple, :stripper, :tokenizer),
|
75
|
+
# Hanuman::LinkFactory.connect(:simple, :tokenizer, :upcaser),
|
76
|
+
# Hanuman::LinkFactory.connect(:simple, :tokenizer, :downcaser),
|
77
|
+
# Hanuman::LinkFactory.connect(:simple, :upcaser, :devoweler),
|
78
|
+
# ]})
|
79
|
+
|
80
|
+
# builder.extract_links!
|
81
|
+
# Splitter.set_builder(builder)
|
82
|
+
# Splitter.register
|
83
|
+
|
84
|
+
Wukong.dataflow(:splitter) do
|
85
|
+
stripper | tokenizer |
|
86
|
+
[
|
87
|
+
upcaser | devoweler |
|
88
|
+
[
|
89
|
+
regexp | count,
|
90
|
+
identity
|
91
|
+
],
|
92
|
+
downcaser | reject { |word| word == 'hell' }
|
93
|
+
]
|
94
|
+
end
|
data/examples/twitter.rb
ADDED
data/lib/hanuman.rb
CHANGED
data/lib/hanuman/graph.rb
CHANGED
@@ -1,29 +1,53 @@
|
|
1
1
|
module Hanuman
|
2
|
-
class Graph < Stage
|
3
|
-
include TSort
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
def tsort_each_node(&blk)
|
9
|
-
stages.keys.each(&blk)
|
3
|
+
module GraphInstanceMethods
|
4
|
+
def each_stage &block
|
5
|
+
stages.values.each(&block)
|
10
6
|
end
|
11
7
|
|
12
|
-
def
|
13
|
-
links.
|
8
|
+
def descendents stage=nil
|
9
|
+
links.find_all do |link|
|
10
|
+
stage ? link.from == stage.label : true
|
11
|
+
end.map(&:into).uniq.map { |label| stages[label] }.compact
|
14
12
|
end
|
15
13
|
|
16
|
-
def
|
14
|
+
def ancestors stage=nil
|
15
|
+
links.find_all do |link|
|
16
|
+
stage ? link.into == stage.label : true
|
17
|
+
end.map(&:from).uniq.map { |label| stages[label] }.compact
|
18
|
+
end
|
19
|
+
|
20
|
+
def add_stage stage
|
21
|
+
stages[stage.label] = stage
|
22
|
+
end
|
23
|
+
|
24
|
+
def has_link? from, into
|
25
|
+
links.detect { |link| link.from == from.label && link.into == into.label } ? true : false
|
26
|
+
end
|
27
|
+
|
28
|
+
def add_link type, from, into
|
29
|
+
add_stage(from)
|
30
|
+
add_stage(into)
|
31
|
+
self.links << Hanuman::LinkFactory.connect(type, from.linkable_name(:in), into.linkable_name(:out))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
class Graph < Stage
|
36
|
+
include GraphInstanceMethods
|
37
|
+
|
38
|
+
field :stages, Hash, :default => {}
|
39
|
+
field :links, Array, :default => []
|
17
40
|
end
|
18
41
|
|
19
42
|
class GraphBuilder < StageBuilder
|
20
|
-
include TSort
|
21
43
|
|
44
|
+
include GraphInstanceMethods
|
45
|
+
|
22
46
|
field :stages, Hash, :default => {}
|
23
47
|
field :links, Array, :default => []
|
24
48
|
|
25
49
|
def define(&blk)
|
26
|
-
graph = for_class || define_class(label)
|
50
|
+
graph = for_class || define_class(label)
|
27
51
|
self.instance_eval(&blk) if block_given?
|
28
52
|
extract_links!
|
29
53
|
graph.register
|
@@ -49,6 +73,7 @@ module Hanuman
|
|
49
73
|
end
|
50
74
|
|
51
75
|
def extract_links!
|
76
|
+
self.links.replace([])
|
52
77
|
stages.each_pair{ |name, builder| links << builder.links }
|
53
78
|
links.flatten!
|
54
79
|
end
|
@@ -60,21 +85,13 @@ module Hanuman
|
|
60
85
|
attrs.merge(args)
|
61
86
|
end
|
62
87
|
|
63
|
-
def tsort_each_node(&blk)
|
64
|
-
stages.keys.each(&blk)
|
65
|
-
end
|
66
|
-
|
67
|
-
def tsort_each_child(node, &blk)
|
68
|
-
links.select{ |link| link.into == node }.map(&:from).each(&blk)
|
69
|
-
end
|
70
|
-
|
71
|
-
def directed_sort() self.tsort ; end
|
72
|
-
|
73
88
|
def clone
|
74
89
|
cloned_attrs = Hash[ serialize.select{ |key, val| key != :stages }.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
|
75
90
|
cloned_links = links.map{ |link| link.dup }
|
76
91
|
cloned_stages = Hash[ stages.map{ |stage| stage.clone } ]
|
77
92
|
self.class.receive(cloned_attrs.merge(links: cloned_links).merge(stages: cloned_stages).merge(for_class: for_class))
|
78
93
|
end
|
94
|
+
|
79
95
|
end
|
96
|
+
|
80
97
|
end
|
data/lib/hanuman/stage.rb
CHANGED
@@ -1,4 +1,22 @@
|
|
1
1
|
module Hanuman
|
2
|
+
|
3
|
+
module StageInstanceMethods
|
4
|
+
|
5
|
+
attr_accessor :graph
|
6
|
+
|
7
|
+
def linkable_name(direction) ; self.label ; end
|
8
|
+
|
9
|
+
def add_link(type, other_stage)
|
10
|
+
self.links << Hanuman::LinkFactory.connect(type, linkable_name(:in), other_stage.linkable_name(:out))
|
11
|
+
graph.add_link(type, self, other_stage) if graph
|
12
|
+
end
|
13
|
+
|
14
|
+
def root
|
15
|
+
graph ? graph.root(self) : self
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
19
|
+
|
2
20
|
module StageClassMethods
|
3
21
|
|
4
22
|
def label() self.to_s.demodulize.underscore.to_sym ; end
|
@@ -23,13 +41,23 @@ module Hanuman
|
|
23
41
|
|
24
42
|
class Stage
|
25
43
|
include Gorillib::Model
|
44
|
+
include StageInstanceMethods
|
26
45
|
extend StageClassMethods
|
27
46
|
|
28
47
|
field :label, Symbol, :doc => false
|
29
|
-
|
48
|
+
field :links, Array, :default => [], doc: false
|
30
49
|
|
50
|
+
def clone
|
51
|
+
cloned_attrs = Hash[ attributes.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
|
52
|
+
cloned_links = links.map{ |link| link.dup }
|
53
|
+
self.class.receive(cloned_attrs.merge(links: cloned_links))
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
31
57
|
class StageBuilder
|
58
|
+
|
32
59
|
include Gorillib::Model
|
60
|
+
include StageInstanceMethods
|
33
61
|
|
34
62
|
field :args, Hash, :default => {}
|
35
63
|
field :for_class, Class
|
@@ -65,18 +93,6 @@ module Hanuman
|
|
65
93
|
klass
|
66
94
|
end
|
67
95
|
|
68
|
-
def linkable_name(direction) self.label ; end
|
69
|
-
|
70
|
-
def add_link(level, from, into)
|
71
|
-
links << Hanuman::LinkFactory.connect(level, from, into)
|
72
|
-
end
|
73
|
-
|
74
|
-
def into(other_stage)
|
75
|
-
self.add_link(:simple, self.linkable_name(:in), other_stage.linkable_name(:out))
|
76
|
-
other_stage
|
77
|
-
end
|
78
|
-
alias_method :|, :into
|
79
|
-
|
80
96
|
def serialize()
|
81
97
|
attrs = attributes
|
82
98
|
args = attrs.delete(:args)
|
@@ -91,5 +107,22 @@ module Hanuman
|
|
91
107
|
self.class.receive(cloned_attrs.merge(links: cloned_links).merge(for_class: for_class))
|
92
108
|
end
|
93
109
|
|
110
|
+
def into(stage_or_stages)
|
111
|
+
return stage_or_stages if stage_or_stages.nil?
|
112
|
+
if stage_or_stages.is_a?(Array)
|
113
|
+
stage_or_stages.each do |other_stage_or_stages|
|
114
|
+
while other_stage_or_stages.is_a?(Array)
|
115
|
+
other_stage_or_stages = other_stage_or_stages.first
|
116
|
+
end
|
117
|
+
other_stage = other_stage_or_stages
|
118
|
+
self.into(other_stage)
|
119
|
+
end
|
120
|
+
else
|
121
|
+
self.add_link(:simple, stage_or_stages.root)
|
122
|
+
end
|
123
|
+
stage_or_stages
|
124
|
+
end
|
125
|
+
alias_method :|, :into
|
126
|
+
|
94
127
|
end
|
95
128
|
end
|
data/lib/hanuman/tree.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'tsort'
|
2
|
+
module Hanuman
|
3
|
+
|
4
|
+
module TreeInstanceMethods
|
5
|
+
include TSort
|
6
|
+
|
7
|
+
MultipleRoots = Class.new(TSort::Cyclic)
|
8
|
+
|
9
|
+
def tsort_each_node(&blk)
|
10
|
+
stages.keys.each(&blk)
|
11
|
+
end
|
12
|
+
|
13
|
+
def tsort_each_child(label, &blk)
|
14
|
+
links.select { |link| link.into == label }.map(&:from).each(&blk)
|
15
|
+
end
|
16
|
+
|
17
|
+
def directed_sort() self.tsort ; end
|
18
|
+
|
19
|
+
def each_stage &blk
|
20
|
+
directed_sort.map { |label| stages[label]}.compact.each(&blk)
|
21
|
+
end
|
22
|
+
|
23
|
+
def root stage=nil
|
24
|
+
return stages[directed_sort.first] unless stage
|
25
|
+
return stage unless ancestor(stage)
|
26
|
+
self.root(ancestor(stage))
|
27
|
+
end
|
28
|
+
|
29
|
+
def ancestor(stage)
|
30
|
+
ancestors(stage).first
|
31
|
+
end
|
32
|
+
|
33
|
+
def leaves
|
34
|
+
the_leaves = (descendents - ancestors)
|
35
|
+
the_leaves.empty? ? [root] : the_leaves
|
36
|
+
end
|
37
|
+
|
38
|
+
def add_link type, from, into
|
39
|
+
return if has_link?(from, into)
|
40
|
+
raise TSort::Cyclic.new("Cannot link from a stage <#{from.label}> to itself") if into == from
|
41
|
+
raise MultipleRoots.new("Cannot link from <#{from.label}> to <#{into.label}> because <#{into.label}> aleady has an ancestor <#{ancestor(into).label}>") if ancestor(into)
|
42
|
+
raise TSort::Cyclic.new("Cannot link from leaf <#{from.label}> to the root <#{into.label}>") if into == root && leaves.include?(from)
|
43
|
+
super(type, from, into)
|
44
|
+
end
|
45
|
+
|
46
|
+
def prepend stage
|
47
|
+
add_link(:simple, stage, root)
|
48
|
+
end
|
49
|
+
|
50
|
+
def append stage
|
51
|
+
leaves.each do |leaf|
|
52
|
+
stage_for_leaf = stage.clone
|
53
|
+
stage_for_leaf.label = "#{stage_for_leaf.label}_for_#{leaf.label}".to_sym
|
54
|
+
add_link(:simple, leaf, stage_for_leaf)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
class Tree < Graph
|
61
|
+
include TreeInstanceMethods
|
62
|
+
end
|
63
|
+
|
64
|
+
class TreeBuilder < GraphBuilder
|
65
|
+
include TreeInstanceMethods
|
66
|
+
end
|
67
|
+
end
|