wukong 3.0.1 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +1 -1
- data/README.md +253 -45
- data/bin/wu +34 -0
- data/bin/wu-source +5 -0
- data/examples/Gemfile +0 -1
- data/examples/deploy_pack/Gemfile +0 -1
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/ruby_project/Gemfile +0 -1
- data/examples/splitter.rb +94 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +1 -1
- data/lib/hanuman/graph.rb +39 -22
- data/lib/hanuman/stage.rb +46 -13
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wukong.rb +6 -1
- data/lib/wukong/dataflow.rb +19 -48
- data/lib/wukong/driver.rb +176 -65
- data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +6 -4
- data/lib/wukong/local/runner.rb +14 -16
- data/lib/wukong/local/stdio_driver.rb +72 -12
- data/lib/wukong/processor.rb +1 -30
- data/lib/wukong/runner.rb +2 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
- data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
- data/lib/wukong/version.rb +1 -1
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
- data/lib/wukong/widget/filters.rb +15 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +2 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/serializers.rb +21 -6
- data/lib/wukong/widgets.rb +6 -3
- data/spec/hanuman/graph_spec.rb +73 -10
- data/spec/hanuman/stage_spec.rb +15 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec_helper.rb +13 -1
- data/spec/support/example_test_helpers.rb +0 -1
- data/spec/support/model_test_helpers.rb +1 -1
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_examples_for_builders.rb +8 -15
- data/spec/wukong/driver_spec.rb +152 -0
- data/spec/wukong/local/runner_spec.rb +1 -12
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/processor_spec.rb +0 -1
- data/spec/wukong/runner_spec.rb +2 -2
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/wukong.gemspec +1 -2
- metadata +45 -28
- data/lib/wukong/local/tcp_driver.rb +0 -47
- data/spec/wu/geo/geolocated_spec.rb +0 -247
- data/spec/wukong/widget/processors_spec.rb +0 -125
data/bin/wu-source
ADDED
data/examples/Gemfile
CHANGED
@@ -0,0 +1,73 @@
|
|
1
|
+
|
2
|
+
#
|
3
|
+
# Stupidly simple class for holding a twitter user's tweet summary
|
4
|
+
#
|
5
|
+
class UserTweetSummary
|
6
|
+
attr_accessor :fields
|
7
|
+
|
8
|
+
def initialize fields
|
9
|
+
@fields = fields
|
10
|
+
end
|
11
|
+
|
12
|
+
def update tweets
|
13
|
+
deletes_this_batch = 0
|
14
|
+
tweets_this_batch = 0
|
15
|
+
tweets.each do |tweet|
|
16
|
+
deletes_this_batch += (tweet.is_deleted ? 1 : 0)
|
17
|
+
tweets_this_batch += 1
|
18
|
+
end
|
19
|
+
fields['deletes'] += deletes_this_batch
|
20
|
+
fields['tweets'] += (tweets_this_batch - deletes_this_batch)
|
21
|
+
self
|
22
|
+
end
|
23
|
+
|
24
|
+
def to_json *args
|
25
|
+
fields.to_json
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
#
|
30
|
+
# Holds a tweet and whether
|
31
|
+
# or not it's been deleted
|
32
|
+
#
|
33
|
+
class Tweet
|
34
|
+
attr_accessor :is_deleted
|
35
|
+
def initialize hash
|
36
|
+
@is_deleted = hash.keys.include?("delete")
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
#
|
42
|
+
# Improver processor
|
43
|
+
#
|
44
|
+
class TweetSummarizer < Wukong::Processor::Improver
|
45
|
+
attr_accessor :user_id
|
46
|
+
|
47
|
+
def zero
|
48
|
+
super
|
49
|
+
{
|
50
|
+
'tweets' => 0,
|
51
|
+
'deletes' => 0
|
52
|
+
}
|
53
|
+
end
|
54
|
+
|
55
|
+
def accumulate record
|
56
|
+
@user_id = record[0]
|
57
|
+
json = record[1]
|
58
|
+
self.group << Tweet.new(JSON.parse(json))
|
59
|
+
end
|
60
|
+
|
61
|
+
def improve summary, deltas
|
62
|
+
UserTweetSummary.new(JSON.parse(summary)).update(deltas)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
#
|
67
|
+
# Is this necessary?
|
68
|
+
#
|
69
|
+
TweetSummarizer.register(:tweet_summarizer)
|
70
|
+
|
71
|
+
Wukong.dataflow(:summarize_tweets) do
|
72
|
+
tweet_summarizer | to_json
|
73
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'wukong'
|
2
|
+
|
3
|
+
module Verbose
|
4
|
+
|
5
|
+
def verbose?
|
6
|
+
end
|
7
|
+
|
8
|
+
def setup
|
9
|
+
# log.info("Setting up #{label}")
|
10
|
+
end
|
11
|
+
|
12
|
+
def finalize
|
13
|
+
# log.info("Finalizing #{label}")
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
Wukong.processor(:upcaser) do
|
18
|
+
include Verbose
|
19
|
+
def process(string)
|
20
|
+
# log.info("#process #{string}")
|
21
|
+
yield string.upcase
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
Wukong.processor(:downcaser) do
|
26
|
+
include Verbose
|
27
|
+
def process(string)
|
28
|
+
# log.info("#process #{string}")
|
29
|
+
yield string.downcase
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
Wukong.processor(:tokenizer) do
|
34
|
+
include Verbose
|
35
|
+
def process string
|
36
|
+
# log.info("#process #{string}")
|
37
|
+
string.split.each { |token| yield token }
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
Wukong.processor(:stripper) do
|
42
|
+
include Verbose
|
43
|
+
def process(string)
|
44
|
+
# log.info("#process #{string}")
|
45
|
+
yield string.gsub(/[^\w\s]/,'')
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
Wukong.processor(:devoweler) do
|
50
|
+
include Verbose
|
51
|
+
def process(string)
|
52
|
+
# log.info("#process #{string}")
|
53
|
+
yield string.gsub(/[aeiou]/i,'')
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# stripper = Wukong.registry.retrieve(:stripper)
|
58
|
+
# tokenizer = Wukong.registry.retrieve(:tokenizer)
|
59
|
+
# upcaser = Wukong.registry.retrieve(:upcaser)
|
60
|
+
# downcaser = Wukong.registry.retrieve(:downcaser)
|
61
|
+
# devoweler = Wukong.registry.retrieve(:devoweler)
|
62
|
+
|
63
|
+
# Splitter = Class.new(Wukong::Dataflow)
|
64
|
+
# builder = Wukong::DataflowBuilder.receive({label: :splitter,
|
65
|
+
# for_class: Splitter,
|
66
|
+
# stages: {
|
67
|
+
# stripper: stripper,
|
68
|
+
# tokenizer: tokenizer,
|
69
|
+
# upcaser: upcaser,
|
70
|
+
# downcaser: downcaser,
|
71
|
+
# devoweler: devoweler,
|
72
|
+
# },
|
73
|
+
# links: [
|
74
|
+
# Hanuman::LinkFactory.connect(:simple, :stripper, :tokenizer),
|
75
|
+
# Hanuman::LinkFactory.connect(:simple, :tokenizer, :upcaser),
|
76
|
+
# Hanuman::LinkFactory.connect(:simple, :tokenizer, :downcaser),
|
77
|
+
# Hanuman::LinkFactory.connect(:simple, :upcaser, :devoweler),
|
78
|
+
# ]})
|
79
|
+
|
80
|
+
# builder.extract_links!
|
81
|
+
# Splitter.set_builder(builder)
|
82
|
+
# Splitter.register
|
83
|
+
|
84
|
+
Wukong.dataflow(:splitter) do
|
85
|
+
stripper | tokenizer |
|
86
|
+
[
|
87
|
+
upcaser | devoweler |
|
88
|
+
[
|
89
|
+
regexp | count,
|
90
|
+
identity
|
91
|
+
],
|
92
|
+
downcaser | reject { |word| word == 'hell' }
|
93
|
+
]
|
94
|
+
end
|
data/examples/twitter.rb
ADDED
data/lib/hanuman.rb
CHANGED
data/lib/hanuman/graph.rb
CHANGED
@@ -1,29 +1,53 @@
|
|
1
1
|
module Hanuman
|
2
|
-
class Graph < Stage
|
3
|
-
include TSort
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
def tsort_each_node(&blk)
|
9
|
-
stages.keys.each(&blk)
|
3
|
+
module GraphInstanceMethods
|
4
|
+
def each_stage &block
|
5
|
+
stages.values.each(&block)
|
10
6
|
end
|
11
7
|
|
12
|
-
def
|
13
|
-
links.
|
8
|
+
def descendents stage=nil
|
9
|
+
links.find_all do |link|
|
10
|
+
stage ? link.from == stage.label : true
|
11
|
+
end.map(&:into).uniq.map { |label| stages[label] }.compact
|
14
12
|
end
|
15
13
|
|
16
|
-
def
|
14
|
+
def ancestors stage=nil
|
15
|
+
links.find_all do |link|
|
16
|
+
stage ? link.into == stage.label : true
|
17
|
+
end.map(&:from).uniq.map { |label| stages[label] }.compact
|
18
|
+
end
|
19
|
+
|
20
|
+
def add_stage stage
|
21
|
+
stages[stage.label] = stage
|
22
|
+
end
|
23
|
+
|
24
|
+
def has_link? from, into
|
25
|
+
links.detect { |link| link.from == from.label && link.into == into.label } ? true : false
|
26
|
+
end
|
27
|
+
|
28
|
+
def add_link type, from, into
|
29
|
+
add_stage(from)
|
30
|
+
add_stage(into)
|
31
|
+
self.links << Hanuman::LinkFactory.connect(type, from.linkable_name(:in), into.linkable_name(:out))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
class Graph < Stage
|
36
|
+
include GraphInstanceMethods
|
37
|
+
|
38
|
+
field :stages, Hash, :default => {}
|
39
|
+
field :links, Array, :default => []
|
17
40
|
end
|
18
41
|
|
19
42
|
class GraphBuilder < StageBuilder
|
20
|
-
include TSort
|
21
43
|
|
44
|
+
include GraphInstanceMethods
|
45
|
+
|
22
46
|
field :stages, Hash, :default => {}
|
23
47
|
field :links, Array, :default => []
|
24
48
|
|
25
49
|
def define(&blk)
|
26
|
-
graph = for_class || define_class(label)
|
50
|
+
graph = for_class || define_class(label)
|
27
51
|
self.instance_eval(&blk) if block_given?
|
28
52
|
extract_links!
|
29
53
|
graph.register
|
@@ -49,6 +73,7 @@ module Hanuman
|
|
49
73
|
end
|
50
74
|
|
51
75
|
def extract_links!
|
76
|
+
self.links.replace([])
|
52
77
|
stages.each_pair{ |name, builder| links << builder.links }
|
53
78
|
links.flatten!
|
54
79
|
end
|
@@ -60,21 +85,13 @@ module Hanuman
|
|
60
85
|
attrs.merge(args)
|
61
86
|
end
|
62
87
|
|
63
|
-
def tsort_each_node(&blk)
|
64
|
-
stages.keys.each(&blk)
|
65
|
-
end
|
66
|
-
|
67
|
-
def tsort_each_child(node, &blk)
|
68
|
-
links.select{ |link| link.into == node }.map(&:from).each(&blk)
|
69
|
-
end
|
70
|
-
|
71
|
-
def directed_sort() self.tsort ; end
|
72
|
-
|
73
88
|
def clone
|
74
89
|
cloned_attrs = Hash[ serialize.select{ |key, val| key != :stages }.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
|
75
90
|
cloned_links = links.map{ |link| link.dup }
|
76
91
|
cloned_stages = Hash[ stages.map{ |stage| stage.clone } ]
|
77
92
|
self.class.receive(cloned_attrs.merge(links: cloned_links).merge(stages: cloned_stages).merge(for_class: for_class))
|
78
93
|
end
|
94
|
+
|
79
95
|
end
|
96
|
+
|
80
97
|
end
|
data/lib/hanuman/stage.rb
CHANGED
@@ -1,4 +1,22 @@
|
|
1
1
|
module Hanuman
|
2
|
+
|
3
|
+
module StageInstanceMethods
|
4
|
+
|
5
|
+
attr_accessor :graph
|
6
|
+
|
7
|
+
def linkable_name(direction) ; self.label ; end
|
8
|
+
|
9
|
+
def add_link(type, other_stage)
|
10
|
+
self.links << Hanuman::LinkFactory.connect(type, linkable_name(:in), other_stage.linkable_name(:out))
|
11
|
+
graph.add_link(type, self, other_stage) if graph
|
12
|
+
end
|
13
|
+
|
14
|
+
def root
|
15
|
+
graph ? graph.root(self) : self
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
19
|
+
|
2
20
|
module StageClassMethods
|
3
21
|
|
4
22
|
def label() self.to_s.demodulize.underscore.to_sym ; end
|
@@ -23,13 +41,23 @@ module Hanuman
|
|
23
41
|
|
24
42
|
class Stage
|
25
43
|
include Gorillib::Model
|
44
|
+
include StageInstanceMethods
|
26
45
|
extend StageClassMethods
|
27
46
|
|
28
47
|
field :label, Symbol, :doc => false
|
29
|
-
|
48
|
+
field :links, Array, :default => [], doc: false
|
30
49
|
|
50
|
+
def clone
|
51
|
+
cloned_attrs = Hash[ attributes.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
|
52
|
+
cloned_links = links.map{ |link| link.dup }
|
53
|
+
self.class.receive(cloned_attrs.merge(links: cloned_links))
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
31
57
|
class StageBuilder
|
58
|
+
|
32
59
|
include Gorillib::Model
|
60
|
+
include StageInstanceMethods
|
33
61
|
|
34
62
|
field :args, Hash, :default => {}
|
35
63
|
field :for_class, Class
|
@@ -65,18 +93,6 @@ module Hanuman
|
|
65
93
|
klass
|
66
94
|
end
|
67
95
|
|
68
|
-
def linkable_name(direction) self.label ; end
|
69
|
-
|
70
|
-
def add_link(level, from, into)
|
71
|
-
links << Hanuman::LinkFactory.connect(level, from, into)
|
72
|
-
end
|
73
|
-
|
74
|
-
def into(other_stage)
|
75
|
-
self.add_link(:simple, self.linkable_name(:in), other_stage.linkable_name(:out))
|
76
|
-
other_stage
|
77
|
-
end
|
78
|
-
alias_method :|, :into
|
79
|
-
|
80
96
|
def serialize()
|
81
97
|
attrs = attributes
|
82
98
|
args = attrs.delete(:args)
|
@@ -91,5 +107,22 @@ module Hanuman
|
|
91
107
|
self.class.receive(cloned_attrs.merge(links: cloned_links).merge(for_class: for_class))
|
92
108
|
end
|
93
109
|
|
110
|
+
def into(stage_or_stages)
|
111
|
+
return stage_or_stages if stage_or_stages.nil?
|
112
|
+
if stage_or_stages.is_a?(Array)
|
113
|
+
stage_or_stages.each do |other_stage_or_stages|
|
114
|
+
while other_stage_or_stages.is_a?(Array)
|
115
|
+
other_stage_or_stages = other_stage_or_stages.first
|
116
|
+
end
|
117
|
+
other_stage = other_stage_or_stages
|
118
|
+
self.into(other_stage)
|
119
|
+
end
|
120
|
+
else
|
121
|
+
self.add_link(:simple, stage_or_stages.root)
|
122
|
+
end
|
123
|
+
stage_or_stages
|
124
|
+
end
|
125
|
+
alias_method :|, :into
|
126
|
+
|
94
127
|
end
|
95
128
|
end
|
data/lib/hanuman/tree.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'tsort'
|
2
|
+
module Hanuman
|
3
|
+
|
4
|
+
module TreeInstanceMethods
|
5
|
+
include TSort
|
6
|
+
|
7
|
+
MultipleRoots = Class.new(TSort::Cyclic)
|
8
|
+
|
9
|
+
def tsort_each_node(&blk)
|
10
|
+
stages.keys.each(&blk)
|
11
|
+
end
|
12
|
+
|
13
|
+
def tsort_each_child(label, &blk)
|
14
|
+
links.select { |link| link.into == label }.map(&:from).each(&blk)
|
15
|
+
end
|
16
|
+
|
17
|
+
def directed_sort() self.tsort ; end
|
18
|
+
|
19
|
+
def each_stage &blk
|
20
|
+
directed_sort.map { |label| stages[label]}.compact.each(&blk)
|
21
|
+
end
|
22
|
+
|
23
|
+
def root stage=nil
|
24
|
+
return stages[directed_sort.first] unless stage
|
25
|
+
return stage unless ancestor(stage)
|
26
|
+
self.root(ancestor(stage))
|
27
|
+
end
|
28
|
+
|
29
|
+
def ancestor(stage)
|
30
|
+
ancestors(stage).first
|
31
|
+
end
|
32
|
+
|
33
|
+
def leaves
|
34
|
+
the_leaves = (descendents - ancestors)
|
35
|
+
the_leaves.empty? ? [root] : the_leaves
|
36
|
+
end
|
37
|
+
|
38
|
+
def add_link type, from, into
|
39
|
+
return if has_link?(from, into)
|
40
|
+
raise TSort::Cyclic.new("Cannot link from a stage <#{from.label}> to itself") if into == from
|
41
|
+
raise MultipleRoots.new("Cannot link from <#{from.label}> to <#{into.label}> because <#{into.label}> aleady has an ancestor <#{ancestor(into).label}>") if ancestor(into)
|
42
|
+
raise TSort::Cyclic.new("Cannot link from leaf <#{from.label}> to the root <#{into.label}>") if into == root && leaves.include?(from)
|
43
|
+
super(type, from, into)
|
44
|
+
end
|
45
|
+
|
46
|
+
def prepend stage
|
47
|
+
add_link(:simple, stage, root)
|
48
|
+
end
|
49
|
+
|
50
|
+
def append stage
|
51
|
+
leaves.each do |leaf|
|
52
|
+
stage_for_leaf = stage.clone
|
53
|
+
stage_for_leaf.label = "#{stage_for_leaf.label}_for_#{leaf.label}".to_sym
|
54
|
+
add_link(:simple, leaf, stage_for_leaf)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
class Tree < Graph
|
61
|
+
include TreeInstanceMethods
|
62
|
+
end
|
63
|
+
|
64
|
+
class TreeBuilder < GraphBuilder
|
65
|
+
include TreeInstanceMethods
|
66
|
+
end
|
67
|
+
end
|