wukong 3.0.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'wukong'
4
+ require 'wukong/source'
5
+ Wukong::Source::SourceRunner.run
@@ -2,7 +2,6 @@ source :rubygems
2
2
 
3
3
  gem "configliere", '~> 0.4'
4
4
  gem "multi_json", '>= 1.3.6'
5
- gem "vayacondios-client", '>= 0.0.3'
6
5
  gem "gorillib", '>= 0.4.2'
7
6
  gem "uuidtools"
8
7
  gem "eventmachine"
@@ -2,6 +2,5 @@
2
2
  gem 'configliere'
3
3
  gem 'gorillib'
4
4
  gem 'multi_json'
5
- gem 'vayacondios-client'
6
5
  gem 'log4r'
7
6
  gem 'eventmachine'
@@ -0,0 +1,73 @@
1
+
2
#
# Simple holder for a Twitter user's running tweet summary.
#
# +fields+ is a Hash expected to carry the Integer counters
# 'tweets' (tweets not flagged as deleted) and 'deletes' (tweets
# flagged as deleted).  A counter that is absent or nil is treated as 0.
#
class UserTweetSummary
  attr_accessor :fields

  # @param fields [Hash] the summary counters, keyed by String
  def initialize(fields)
    @fields = fields
  end

  # Folds a batch of tweets into the counters.
  #
  # @param tweets [Enumerable] objects responding to +is_deleted+
  # @return [UserTweetSummary] self, so calls can be chained
  def update(tweets)
    # One pass: truthy is_deleted on the left, everything else on the right.
    deleted, live = tweets.partition(&:is_deleted)
    fields['deletes'] = (fields['deletes'] || 0) + deleted.size
    fields['tweets']  = (fields['tweets']  || 0) + live.size
    self
  end

  # Serializes the counters as JSON.
  #
  # The arguments are forwarded to Hash#to_json so that generator state
  # handed in by JSON.generate / JSON.pretty_generate is honoured when a
  # summary is nested inside another structure.
  #
  # @return [String] JSON text for +fields+
  def to_json(*args)
    fields.to_json(*args)
  end
end
28
+
29
#
# Holds whether or not a tweet has been deleted.
#
# A tweet counts as deleted when its parsed payload has a top-level
# "delete" key, whatever the value under that key is.
#
class Tweet
  attr_accessor :is_deleted

  # @param hash [Hash] the parsed tweet payload, keyed by String
  def initialize(hash)
    # Hash#key? answers directly instead of building an array of keys.
    @is_deleted = hash.key?("delete")
  end
end
39
+
40
+
41
#
# Improver processor that folds a group of tweets into an existing
# per-user summary.
#
# NOTE(review): the grouping contract (+group+, when +zero+ and
# +improve+ are invoked) comes from Wukong::Processor::Improver, which
# is not visible here -- confirm against that class.
#
class TweetSummarizer < Wukong::Processor::Improver
  attr_accessor :user_id

  # Invokes the parent's +zero+ and then returns a fresh pair of counters.
  def zero
    super
    { 'tweets' => 0, 'deletes' => 0 }
  end

  # Remembers the first element of +record+ as the user id and adds a
  # Tweet built from the JSON text in the second element to the group.
  def accumulate(record)
    @user_id = record[0]
    tweet = Tweet.new(JSON.parse(record[1]))
    group << tweet
  end

  # Parses +summary+ (JSON text) into a UserTweetSummary and applies
  # +deltas+ to it; returns the updated summary.
  def improve(summary, deltas)
    parsed = JSON.parse(summary)
    UserTweetSummary.new(parsed).update(deltas)
  end
end
65
+
66
+ #
67
+ # Is this necessary?
68
+ #
69
+ TweetSummarizer.register(:tweet_summarizer)
70
+
71
+ Wukong.dataflow(:summarize_tweets) do
72
+ tweet_summarizer | to_json
73
+ end
@@ -2,6 +2,5 @@
2
2
  gem 'configliere'
3
3
  gem 'gorillib'
4
4
  gem 'multi_json'
5
- gem 'vayacondios-client'
6
5
  gem 'log4r'
7
6
  gem 'eventmachine'
@@ -0,0 +1,94 @@
1
+ require 'wukong'
2
+
3
# Mixin giving the example processors a quiet default for the
# verbosity predicate plus no-op lifecycle hooks.  The logging calls
# are left commented out, as in the example.
module Verbose
  # @return [Boolean] always false; a predicate should answer with a
  #   real boolean rather than the nil an empty method body yields.
  def verbose?
    false
  end

  # Lifecycle hook; intentionally does nothing.
  def setup
    # log.info("Setting up #{label}")
  end

  # Lifecycle hook; intentionally does nothing.
  def finalize
    # log.info("Finalizing #{label}")
  end
end
16
+
17
+ Wukong.processor(:upcaser) do
18
+ include Verbose
19
+ def process(string)
20
+ # log.info("#process #{string}")
21
+ yield string.upcase
22
+ end
23
+ end
24
+
25
+ Wukong.processor(:downcaser) do
26
+ include Verbose
27
+ def process(string)
28
+ # log.info("#process #{string}")
29
+ yield string.downcase
30
+ end
31
+ end
32
+
33
+ Wukong.processor(:tokenizer) do
34
+ include Verbose
35
+ def process string
36
+ # log.info("#process #{string}")
37
+ string.split.each { |token| yield token }
38
+ end
39
+ end
40
+
41
+ Wukong.processor(:stripper) do
42
+ include Verbose
43
+ def process(string)
44
+ # log.info("#process #{string}")
45
+ yield string.gsub(/[^\w\s]/,'')
46
+ end
47
+ end
48
+
49
+ Wukong.processor(:devoweler) do
50
+ include Verbose
51
+ def process(string)
52
+ # log.info("#process #{string}")
53
+ yield string.gsub(/[aeiou]/i,'')
54
+ end
55
+ end
56
+
57
+ # stripper = Wukong.registry.retrieve(:stripper)
58
+ # tokenizer = Wukong.registry.retrieve(:tokenizer)
59
+ # upcaser = Wukong.registry.retrieve(:upcaser)
60
+ # downcaser = Wukong.registry.retrieve(:downcaser)
61
+ # devoweler = Wukong.registry.retrieve(:devoweler)
62
+
63
+ # Splitter = Class.new(Wukong::Dataflow)
64
+ # builder = Wukong::DataflowBuilder.receive({label: :splitter,
65
+ # for_class: Splitter,
66
+ # stages: {
67
+ # stripper: stripper,
68
+ # tokenizer: tokenizer,
69
+ # upcaser: upcaser,
70
+ # downcaser: downcaser,
71
+ # devoweler: devoweler,
72
+ # },
73
+ # links: [
74
+ # Hanuman::LinkFactory.connect(:simple, :stripper, :tokenizer),
75
+ # Hanuman::LinkFactory.connect(:simple, :tokenizer, :upcaser),
76
+ # Hanuman::LinkFactory.connect(:simple, :tokenizer, :downcaser),
77
+ # Hanuman::LinkFactory.connect(:simple, :upcaser, :devoweler),
78
+ # ]})
79
+
80
+ # builder.extract_links!
81
+ # Splitter.set_builder(builder)
82
+ # Splitter.register
83
+
84
+ Wukong.dataflow(:splitter) do
85
+ stripper | tokenizer |
86
+ [
87
+ upcaser | devoweler |
88
+ [
89
+ regexp | count,
90
+ identity
91
+ ],
92
+ downcaser | reject { |word| word == 'hell' }
93
+ ]
94
+ end
@@ -0,0 +1,5 @@
1
# Parses each record as JSON and drops records that carry a truthy
# "delete" entry.
#
# TODO: no downstream branches are defined for this flow yet; when they
# are, fan out with `| [ branch_a, branch_b ]` and make sure the array
# literal is closed.
Wukong.dataflow(:twitter) do
  from_json | reject { |obj| obj["delete"] }
end
@@ -1,11 +1,11 @@
1
1
  require 'gorillib/some'
2
2
  require 'gorillib/model'
3
- require 'tsort'
4
3
 
5
4
  require 'hanuman/registry'
6
5
  require 'hanuman/link'
7
6
  require 'hanuman/stage'
8
7
  require 'hanuman/graph'
8
+ require 'hanuman/tree'
9
9
 
10
10
  module Hanuman
11
11
  module Shortcuts
@@ -1,29 +1,53 @@
1
1
  module Hanuman
2
- class Graph < Stage
3
- include TSort
4
2
 
5
- field :stages, Hash, :default => {}
6
- field :links, Array, :default => []
7
-
8
- def tsort_each_node(&blk)
9
- stages.keys.each(&blk)
3
+ module GraphInstanceMethods
4
+ def each_stage &block
5
+ stages.values.each(&block)
10
6
  end
11
7
 
12
- def tsort_each_child(node, &blk)
13
- links.select{ |link| link.into == node }.map(&:from).each(&blk)
8
+ def descendents stage=nil
9
+ links.find_all do |link|
10
+ stage ? link.from == stage.label : true
11
+ end.map(&:into).uniq.map { |label| stages[label] }.compact
14
12
  end
15
13
 
16
- def directed_sort() self.tsort ; end
14
+ def ancestors stage=nil
15
+ links.find_all do |link|
16
+ stage ? link.into == stage.label : true
17
+ end.map(&:from).uniq.map { |label| stages[label] }.compact
18
+ end
19
+
20
+ def add_stage stage
21
+ stages[stage.label] = stage
22
+ end
23
+
24
+ def has_link? from, into
25
+ links.detect { |link| link.from == from.label && link.into == into.label } ? true : false
26
+ end
27
+
28
+ def add_link type, from, into
29
+ add_stage(from)
30
+ add_stage(into)
31
+ self.links << Hanuman::LinkFactory.connect(type, from.linkable_name(:in), into.linkable_name(:out))
32
+ end
33
+ end
34
+
35
+ class Graph < Stage
36
+ include GraphInstanceMethods
37
+
38
+ field :stages, Hash, :default => {}
39
+ field :links, Array, :default => []
17
40
  end
18
41
 
19
42
  class GraphBuilder < StageBuilder
20
- include TSort
21
43
 
44
+ include GraphInstanceMethods
45
+
22
46
  field :stages, Hash, :default => {}
23
47
  field :links, Array, :default => []
24
48
 
25
49
  def define(&blk)
26
- graph = for_class || define_class(label)
50
+ graph = for_class || define_class(label)
27
51
  self.instance_eval(&blk) if block_given?
28
52
  extract_links!
29
53
  graph.register
@@ -49,6 +73,7 @@ module Hanuman
49
73
  end
50
74
 
51
75
  def extract_links!
76
+ self.links.replace([])
52
77
  stages.each_pair{ |name, builder| links << builder.links }
53
78
  links.flatten!
54
79
  end
@@ -60,21 +85,13 @@ module Hanuman
60
85
  attrs.merge(args)
61
86
  end
62
87
 
63
- def tsort_each_node(&blk)
64
- stages.keys.each(&blk)
65
- end
66
-
67
- def tsort_each_child(node, &blk)
68
- links.select{ |link| link.into == node }.map(&:from).each(&blk)
69
- end
70
-
71
- def directed_sort() self.tsort ; end
72
-
73
88
  def clone
74
89
  cloned_attrs = Hash[ serialize.select{ |key, val| key != :stages }.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
75
90
  cloned_links = links.map{ |link| link.dup }
76
91
  cloned_stages = Hash[ stages.map{ |stage| stage.clone } ]
77
92
  self.class.receive(cloned_attrs.merge(links: cloned_links).merge(stages: cloned_stages).merge(for_class: for_class))
78
93
  end
94
+
79
95
  end
96
+
80
97
  end
@@ -1,4 +1,22 @@
1
1
  module Hanuman
2
+
3
+ module StageInstanceMethods
4
+
5
+ attr_accessor :graph
6
+
7
+ def linkable_name(direction) ; self.label ; end
8
+
9
+ def add_link(type, other_stage)
10
+ self.links << Hanuman::LinkFactory.connect(type, linkable_name(:in), other_stage.linkable_name(:out))
11
+ graph.add_link(type, self, other_stage) if graph
12
+ end
13
+
14
+ def root
15
+ graph ? graph.root(self) : self
16
+ end
17
+
18
+ end
19
+
2
20
  module StageClassMethods
3
21
 
4
22
  def label() self.to_s.demodulize.underscore.to_sym ; end
@@ -23,13 +41,23 @@ module Hanuman
23
41
 
24
42
  class Stage
25
43
  include Gorillib::Model
44
+ include StageInstanceMethods
26
45
  extend StageClassMethods
27
46
 
28
47
  field :label, Symbol, :doc => false
29
- end
48
+ field :links, Array, :default => [], doc: false
30
49
 
50
+ def clone
51
+ cloned_attrs = Hash[ attributes.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
52
+ cloned_links = links.map{ |link| link.dup }
53
+ self.class.receive(cloned_attrs.merge(links: cloned_links))
54
+ end
55
+ end
56
+
31
57
  class StageBuilder
58
+
32
59
  include Gorillib::Model
60
+ include StageInstanceMethods
33
61
 
34
62
  field :args, Hash, :default => {}
35
63
  field :for_class, Class
@@ -65,18 +93,6 @@ module Hanuman
65
93
  klass
66
94
  end
67
95
 
68
- def linkable_name(direction) self.label ; end
69
-
70
- def add_link(level, from, into)
71
- links << Hanuman::LinkFactory.connect(level, from, into)
72
- end
73
-
74
- def into(other_stage)
75
- self.add_link(:simple, self.linkable_name(:in), other_stage.linkable_name(:out))
76
- other_stage
77
- end
78
- alias_method :|, :into
79
-
80
96
  def serialize()
81
97
  attrs = attributes
82
98
  args = attrs.delete(:args)
@@ -91,5 +107,22 @@ module Hanuman
91
107
  self.class.receive(cloned_attrs.merge(links: cloned_links).merge(for_class: for_class))
92
108
  end
93
109
 
110
+ def into(stage_or_stages)
111
+ return stage_or_stages if stage_or_stages.nil?
112
+ if stage_or_stages.is_a?(Array)
113
+ stage_or_stages.each do |other_stage_or_stages|
114
+ while other_stage_or_stages.is_a?(Array)
115
+ other_stage_or_stages = other_stage_or_stages.first
116
+ end
117
+ other_stage = other_stage_or_stages
118
+ self.into(other_stage)
119
+ end
120
+ else
121
+ self.add_link(:simple, stage_or_stages.root)
122
+ end
123
+ stage_or_stages
124
+ end
125
+ alias_method :|, :into
126
+
94
127
  end
95
128
  end
@@ -0,0 +1,67 @@
1
+ require 'tsort'
2
+ module Hanuman
3
+
4
# Behaviour that restricts a graph of stages to a tree: every stage has
# at most one ancestor and a leaf cannot link back to the root.
#
# The including class must provide +stages+ (label => stage), +links+
# (objects responding to +from+ and +into+ with stage labels),
# +ancestors+, +descendents+, +has_link?+ and an +add_link+
# implementation reachable through +super+.
module TreeInstanceMethods
  include TSort

  # Raised when a link would give a stage a second ancestor.
  MultipleRoots = Class.new(TSort::Cyclic)

  # TSort hook: the nodes are the stage labels.
  def tsort_each_node(&blk)
    stages.keys.each(&blk)
  end

  # TSort hook: the "children" of a label are the labels linking into
  # it, so the sort yields upstream stages before downstream ones.
  def tsort_each_child(label, &blk)
    links.select { |link| link.into == label }.map(&:from).each(&blk)
  end

  # @return [Array] stage labels in topological order
  # @raise [TSort::Cyclic] if the links contain a cycle
  def directed_sort
    tsort
  end

  # Yields each stage in topological order, skipping labels that have
  # no registered stage.
  def each_stage(&blk)
    directed_sort.map { |label| stages[label] }.compact.each(&blk)
  end

  # With no argument, returns the stage whose label sorts first.
  # With a stage, walks up through its ancestors until it reaches a
  # stage that has none and returns that stage.
  def root(stage = nil)
    return stages[directed_sort.first] unless stage

    parent = ancestor(stage)
    parent ? root(parent) : stage
  end

  # @return the first ancestor of +stage+, or nil when it has none
  def ancestor(stage)
    ancestors(stage).first
  end

  # Stages that are linked into but never linked from.  When there are
  # none, the root is returned as the only leaf.
  def leaves
    the_leaves = descendents - ancestors
    the_leaves.empty? ? [root] : the_leaves
  end

  # Adds a link after checking that it keeps the structure a tree.
  # Returns nil without changes when the link already exists.
  #
  # @raise [TSort::Cyclic] when linking a stage to itself, or a leaf
  #   to the root
  # @raise [MultipleRoots] when +into+ already has an ancestor
  def add_link(type, from, into)
    return if has_link?(from, into)
    raise TSort::Cyclic, "Cannot link from a stage <#{from.label}> to itself" if into == from

    existing = ancestor(into)
    raise MultipleRoots, "Cannot link from <#{from.label}> to <#{into.label}> because <#{into.label}> already has an ancestor <#{existing.label}>" if existing
    raise TSort::Cyclic, "Cannot link from leaf <#{from.label}> to the root <#{into.label}>" if into == root && leaves.include?(from)

    super(type, from, into)
  end

  # Links +stage+ in front of the current root.
  def prepend(stage)
    add_link(:simple, stage, root)
  end

  # Links a relabelled clone of +stage+ after every current leaf; each
  # clone is labelled "<stage label>_for_<leaf label>".
  def append(stage)
    leaves.each do |leaf|
      stage_for_leaf = stage.clone
      stage_for_leaf.label = "#{stage_for_leaf.label}_for_#{leaf.label}".to_sym
      add_link(:simple, leaf, stage_for_leaf)
    end
  end
end
59
+
60
+ class Tree < Graph
61
+ include TreeInstanceMethods
62
+ end
63
+
64
+ class TreeBuilder < GraphBuilder
65
+ include TreeInstanceMethods
66
+ end
67
+ end