wukong 3.0.1 → 4.0.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (69)
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
File: data/bin/wu-source
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'wukong'
4
+ require 'wukong/source'
5
+ Wukong::Source::SourceRunner.run
File: data/examples/Gemfile
@@ -2,7 +2,6 @@ source :rubygems
2
2
 
3
3
  gem "configliere", '~> 0.4'
4
4
  gem "multi_json", '>= 1.3.6'
5
- gem "vayacondios-client", '>= 0.0.3'
6
5
  gem "gorillib", '>= 0.4.2'
7
6
  gem "uuidtools"
8
7
  gem "eventmachine"
File: data/examples/deploy_pack/Gemfile
@@ -2,6 +2,5 @@
2
2
  gem 'configliere'
3
3
  gem 'gorillib'
4
4
  gem 'multi_json'
5
- gem 'vayacondios-client'
6
5
  gem 'log4r'
7
6
  gem 'eventmachine'
File: data/examples/improver/tweet_summary.rb
@@ -0,0 +1,73 @@
1
+
2
+ #
3
+ # Stupidly simple class for holding a twitter user's tweet summary
4
+ #
5
+ class UserTweetSummary
6
+ attr_accessor :fields
7
+
8
+ def initialize fields
9
+ @fields = fields
10
+ end
11
+
12
+ def update tweets
13
+ deletes_this_batch = 0
14
+ tweets_this_batch = 0
15
+ tweets.each do |tweet|
16
+ deletes_this_batch += (tweet.is_deleted ? 1 : 0)
17
+ tweets_this_batch += 1
18
+ end
19
+ fields['deletes'] += deletes_this_batch
20
+ fields['tweets'] += (tweets_this_batch - deletes_this_batch)
21
+ self
22
+ end
23
+
24
+ def to_json *args
25
+ fields.to_json
26
+ end
27
+ end
28
+
29
+ #
30
+ # Holds a tweet and whether
31
+ # or not it's been deleted
32
+ #
33
+ class Tweet
34
+ attr_accessor :is_deleted
35
+ def initialize hash
36
+ @is_deleted = hash.keys.include?("delete")
37
+ end
38
+ end
39
+
40
+
41
+ #
42
+ # Improver processor
43
+ #
44
+ class TweetSummarizer < Wukong::Processor::Improver
45
+ attr_accessor :user_id
46
+
47
+ def zero
48
+ super
49
+ {
50
+ 'tweets' => 0,
51
+ 'deletes' => 0
52
+ }
53
+ end
54
+
55
+ def accumulate record
56
+ @user_id = record[0]
57
+ json = record[1]
58
+ self.group << Tweet.new(JSON.parse(json))
59
+ end
60
+
61
+ def improve summary, deltas
62
+ UserTweetSummary.new(JSON.parse(summary)).update(deltas)
63
+ end
64
+ end
65
+
66
+ #
67
+ # Is this necessary?
68
+ #
69
+ TweetSummarizer.register(:tweet_summarizer)
70
+
71
+ Wukong.dataflow(:summarize_tweets) do
72
+ tweet_summarizer | to_json
73
+ end
File: data/examples/ruby_project/Gemfile
@@ -2,6 +2,5 @@
2
2
  gem 'configliere'
3
3
  gem 'gorillib'
4
4
  gem 'multi_json'
5
- gem 'vayacondios-client'
6
5
  gem 'log4r'
7
6
  gem 'eventmachine'
File: data/examples/splitter.rb
@@ -0,0 +1,94 @@
1
+ require 'wukong'
2
+
3
+ module Verbose
4
+
5
+ def verbose?
6
+ end
7
+
8
+ def setup
9
+ # log.info("Setting up #{label}")
10
+ end
11
+
12
+ def finalize
13
+ # log.info("Finalizing #{label}")
14
+ end
15
+ end
16
+
17
+ Wukong.processor(:upcaser) do
18
+ include Verbose
19
+ def process(string)
20
+ # log.info("#process #{string}")
21
+ yield string.upcase
22
+ end
23
+ end
24
+
25
+ Wukong.processor(:downcaser) do
26
+ include Verbose
27
+ def process(string)
28
+ # log.info("#process #{string}")
29
+ yield string.downcase
30
+ end
31
+ end
32
+
33
+ Wukong.processor(:tokenizer) do
34
+ include Verbose
35
+ def process string
36
+ # log.info("#process #{string}")
37
+ string.split.each { |token| yield token }
38
+ end
39
+ end
40
+
41
+ Wukong.processor(:stripper) do
42
+ include Verbose
43
+ def process(string)
44
+ # log.info("#process #{string}")
45
+ yield string.gsub(/[^\w\s]/,'')
46
+ end
47
+ end
48
+
49
+ Wukong.processor(:devoweler) do
50
+ include Verbose
51
+ def process(string)
52
+ # log.info("#process #{string}")
53
+ yield string.gsub(/[aeiou]/i,'')
54
+ end
55
+ end
56
+
57
+ # stripper = Wukong.registry.retrieve(:stripper)
58
+ # tokenizer = Wukong.registry.retrieve(:tokenizer)
59
+ # upcaser = Wukong.registry.retrieve(:upcaser)
60
+ # downcaser = Wukong.registry.retrieve(:downcaser)
61
+ # devoweler = Wukong.registry.retrieve(:devoweler)
62
+
63
+ # Splitter = Class.new(Wukong::Dataflow)
64
+ # builder = Wukong::DataflowBuilder.receive({label: :splitter,
65
+ # for_class: Splitter,
66
+ # stages: {
67
+ # stripper: stripper,
68
+ # tokenizer: tokenizer,
69
+ # upcaser: upcaser,
70
+ # downcaser: downcaser,
71
+ # devoweler: devoweler,
72
+ # },
73
+ # links: [
74
+ # Hanuman::LinkFactory.connect(:simple, :stripper, :tokenizer),
75
+ # Hanuman::LinkFactory.connect(:simple, :tokenizer, :upcaser),
76
+ # Hanuman::LinkFactory.connect(:simple, :tokenizer, :downcaser),
77
+ # Hanuman::LinkFactory.connect(:simple, :upcaser, :devoweler),
78
+ # ]})
79
+
80
+ # builder.extract_links!
81
+ # Splitter.set_builder(builder)
82
+ # Splitter.register
83
+
84
+ Wukong.dataflow(:splitter) do
85
+ stripper | tokenizer |
86
+ [
87
+ upcaser | devoweler |
88
+ [
89
+ regexp | count,
90
+ identity
91
+ ],
92
+ downcaser | reject { |word| word == 'hell' }
93
+ ]
94
+ end
File: data/examples/twitter.rb
@@ -0,0 +1,5 @@
1
+ Wukong.dataflow(:twitter) do
2
+ from_json | reject { |obj| obj["delete"] } |
3
+ [
4
+
5
+ end
File: data/lib/hanuman.rb
@@ -1,11 +1,11 @@
1
1
  require 'gorillib/some'
2
2
  require 'gorillib/model'
3
- require 'tsort'
4
3
 
5
4
  require 'hanuman/registry'
6
5
  require 'hanuman/link'
7
6
  require 'hanuman/stage'
8
7
  require 'hanuman/graph'
8
+ require 'hanuman/tree'
9
9
 
10
10
  module Hanuman
11
11
  module Shortcuts
File: data/lib/hanuman/graph.rb
@@ -1,29 +1,53 @@
1
1
  module Hanuman
2
- class Graph < Stage
3
- include TSort
4
2
 
5
- field :stages, Hash, :default => {}
6
- field :links, Array, :default => []
7
-
8
- def tsort_each_node(&blk)
9
- stages.keys.each(&blk)
3
+ module GraphInstanceMethods
4
+ def each_stage &block
5
+ stages.values.each(&block)
10
6
  end
11
7
 
12
- def tsort_each_child(node, &blk)
13
- links.select{ |link| link.into == node }.map(&:from).each(&blk)
8
+ def descendents stage=nil
9
+ links.find_all do |link|
10
+ stage ? link.from == stage.label : true
11
+ end.map(&:into).uniq.map { |label| stages[label] }.compact
14
12
  end
15
13
 
16
- def directed_sort() self.tsort ; end
14
+ def ancestors stage=nil
15
+ links.find_all do |link|
16
+ stage ? link.into == stage.label : true
17
+ end.map(&:from).uniq.map { |label| stages[label] }.compact
18
+ end
19
+
20
+ def add_stage stage
21
+ stages[stage.label] = stage
22
+ end
23
+
24
+ def has_link? from, into
25
+ links.detect { |link| link.from == from.label && link.into == into.label } ? true : false
26
+ end
27
+
28
+ def add_link type, from, into
29
+ add_stage(from)
30
+ add_stage(into)
31
+ self.links << Hanuman::LinkFactory.connect(type, from.linkable_name(:in), into.linkable_name(:out))
32
+ end
33
+ end
34
+
35
+ class Graph < Stage
36
+ include GraphInstanceMethods
37
+
38
+ field :stages, Hash, :default => {}
39
+ field :links, Array, :default => []
17
40
  end
18
41
 
19
42
  class GraphBuilder < StageBuilder
20
- include TSort
21
43
 
44
+ include GraphInstanceMethods
45
+
22
46
  field :stages, Hash, :default => {}
23
47
  field :links, Array, :default => []
24
48
 
25
49
  def define(&blk)
26
- graph = for_class || define_class(label)
50
+ graph = for_class || define_class(label)
27
51
  self.instance_eval(&blk) if block_given?
28
52
  extract_links!
29
53
  graph.register
@@ -49,6 +73,7 @@ module Hanuman
49
73
  end
50
74
 
51
75
  def extract_links!
76
+ self.links.replace([])
52
77
  stages.each_pair{ |name, builder| links << builder.links }
53
78
  links.flatten!
54
79
  end
@@ -60,21 +85,13 @@ module Hanuman
60
85
  attrs.merge(args)
61
86
  end
62
87
 
63
- def tsort_each_node(&blk)
64
- stages.keys.each(&blk)
65
- end
66
-
67
- def tsort_each_child(node, &blk)
68
- links.select{ |link| link.into == node }.map(&:from).each(&blk)
69
- end
70
-
71
- def directed_sort() self.tsort ; end
72
-
73
88
  def clone
74
89
  cloned_attrs = Hash[ serialize.select{ |key, val| key != :stages }.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
75
90
  cloned_links = links.map{ |link| link.dup }
76
91
  cloned_stages = Hash[ stages.map{ |stage| stage.clone } ]
77
92
  self.class.receive(cloned_attrs.merge(links: cloned_links).merge(stages: cloned_stages).merge(for_class: for_class))
78
93
  end
94
+
79
95
  end
96
+
80
97
  end
File: data/lib/hanuman/stage.rb
@@ -1,4 +1,22 @@
1
1
  module Hanuman
2
+
3
+ module StageInstanceMethods
4
+
5
+ attr_accessor :graph
6
+
7
+ def linkable_name(direction) ; self.label ; end
8
+
9
+ def add_link(type, other_stage)
10
+ self.links << Hanuman::LinkFactory.connect(type, linkable_name(:in), other_stage.linkable_name(:out))
11
+ graph.add_link(type, self, other_stage) if graph
12
+ end
13
+
14
+ def root
15
+ graph ? graph.root(self) : self
16
+ end
17
+
18
+ end
19
+
2
20
  module StageClassMethods
3
21
 
4
22
  def label() self.to_s.demodulize.underscore.to_sym ; end
@@ -23,13 +41,23 @@ module Hanuman
23
41
 
24
42
  class Stage
25
43
  include Gorillib::Model
44
+ include StageInstanceMethods
26
45
  extend StageClassMethods
27
46
 
28
47
  field :label, Symbol, :doc => false
29
- end
48
+ field :links, Array, :default => [], doc: false
30
49
 
50
+ def clone
51
+ cloned_attrs = Hash[ attributes.map{ |key, val| dup_key = key.dup rescue key ; dup_val = val.dup rescue val ; [ dup_key, dup_val ] } ]
52
+ cloned_links = links.map{ |link| link.dup }
53
+ self.class.receive(cloned_attrs.merge(links: cloned_links))
54
+ end
55
+ end
56
+
31
57
  class StageBuilder
58
+
32
59
  include Gorillib::Model
60
+ include StageInstanceMethods
33
61
 
34
62
  field :args, Hash, :default => {}
35
63
  field :for_class, Class
@@ -65,18 +93,6 @@ module Hanuman
65
93
  klass
66
94
  end
67
95
 
68
- def linkable_name(direction) self.label ; end
69
-
70
- def add_link(level, from, into)
71
- links << Hanuman::LinkFactory.connect(level, from, into)
72
- end
73
-
74
- def into(other_stage)
75
- self.add_link(:simple, self.linkable_name(:in), other_stage.linkable_name(:out))
76
- other_stage
77
- end
78
- alias_method :|, :into
79
-
80
96
  def serialize()
81
97
  attrs = attributes
82
98
  args = attrs.delete(:args)
@@ -91,5 +107,22 @@ module Hanuman
91
107
  self.class.receive(cloned_attrs.merge(links: cloned_links).merge(for_class: for_class))
92
108
  end
93
109
 
110
+ def into(stage_or_stages)
111
+ return stage_or_stages if stage_or_stages.nil?
112
+ if stage_or_stages.is_a?(Array)
113
+ stage_or_stages.each do |other_stage_or_stages|
114
+ while other_stage_or_stages.is_a?(Array)
115
+ other_stage_or_stages = other_stage_or_stages.first
116
+ end
117
+ other_stage = other_stage_or_stages
118
+ self.into(other_stage)
119
+ end
120
+ else
121
+ self.add_link(:simple, stage_or_stages.root)
122
+ end
123
+ stage_or_stages
124
+ end
125
+ alias_method :|, :into
126
+
94
127
  end
95
128
  end
File: data/lib/hanuman/tree.rb
@@ -0,0 +1,67 @@
1
+ require 'tsort'
2
+ module Hanuman
3
+
4
+ module TreeInstanceMethods
5
+ include TSort
6
+
7
+ MultipleRoots = Class.new(TSort::Cyclic)
8
+
9
+ def tsort_each_node(&blk)
10
+ stages.keys.each(&blk)
11
+ end
12
+
13
+ def tsort_each_child(label, &blk)
14
+ links.select { |link| link.into == label }.map(&:from).each(&blk)
15
+ end
16
+
17
+ def directed_sort() self.tsort ; end
18
+
19
+ def each_stage &blk
20
+ directed_sort.map { |label| stages[label]}.compact.each(&blk)
21
+ end
22
+
23
+ def root stage=nil
24
+ return stages[directed_sort.first] unless stage
25
+ return stage unless ancestor(stage)
26
+ self.root(ancestor(stage))
27
+ end
28
+
29
+ def ancestor(stage)
30
+ ancestors(stage).first
31
+ end
32
+
33
+ def leaves
34
+ the_leaves = (descendents - ancestors)
35
+ the_leaves.empty? ? [root] : the_leaves
36
+ end
37
+
38
+ def add_link type, from, into
39
+ return if has_link?(from, into)
40
+ raise TSort::Cyclic.new("Cannot link from a stage <#{from.label}> to itself") if into == from
41
+ raise MultipleRoots.new("Cannot link from <#{from.label}> to <#{into.label}> because <#{into.label}> aleady has an ancestor <#{ancestor(into).label}>") if ancestor(into)
42
+ raise TSort::Cyclic.new("Cannot link from leaf <#{from.label}> to the root <#{into.label}>") if into == root && leaves.include?(from)
43
+ super(type, from, into)
44
+ end
45
+
46
+ def prepend stage
47
+ add_link(:simple, stage, root)
48
+ end
49
+
50
+ def append stage
51
+ leaves.each do |leaf|
52
+ stage_for_leaf = stage.clone
53
+ stage_for_leaf.label = "#{stage_for_leaf.label}_for_#{leaf.label}".to_sym
54
+ add_link(:simple, leaf, stage_for_leaf)
55
+ end
56
+ end
57
+
58
+ end
59
+
60
+ class Tree < Graph
61
+ include TreeInstanceMethods
62
+ end
63
+
64
+ class TreeBuilder < GraphBuilder
65
+ include TreeInstanceMethods
66
+ end
67
+ end