wukong 3.0.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -16,13 +16,15 @@ module Wukong
16
16
  def self.configure settings, program
17
17
  case program
18
18
  when 'wu-local'
19
- settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of the given path.", flag: 'r'
20
- settings.define :tcp_port, description: "Consume TCP requests on the given port instead of lines over STDIN", type: Integer, flag: 't'
19
+ settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of first argument", flag: 'r'
21
20
 
22
21
  settings.define :from, description: "Parse input from given data format (json, tsv, &c.) before processing"
23
22
  settings.define :to, description: "Convert input to given data format (json, tsv, &c.) before emitting"
24
-
25
- settings.define :consumes, description: "Parse input as instances of given model class before processing", type: Class
23
+ settings.define :as, description: "Call Class.receive on each input (will run after --from)", type: Class
24
+ when 'wu-source'
25
+ settings.define :per_sec, description: "Number of events produced per second", type: Float
26
+ settings.define :period, description: "Number of seconds between events (overrides --per_sec)", type: Float
27
+ settings.define :batch_size, description: "Trigger a finalize across the dataflow each time this many records are processed", type: Integer
26
28
  end
27
29
  end
28
30
 
@@ -1,5 +1,4 @@
1
1
  require_relative 'stdio_driver'
2
- require_relative 'tcp_driver'
3
2
 
4
3
  module Wukong
5
4
  module Local
@@ -44,10 +43,10 @@ module Wukong
44
43
  clever
45
44
  EOF
46
45
 
47
- # Returns the name of the processor we're going to run.
46
+ # Returns the name of the dataflow we're going to run.
48
47
  #
49
48
  # @return [String]
50
- def processor
49
+ def dataflow
51
50
  arg = args.first
52
51
  basename = File.basename(arg.to_s, '.rb')
53
52
 
@@ -57,14 +56,15 @@ module Wukong
57
56
  else arg
58
57
  end
59
58
  end
59
+ alias_method :processor, :dataflow
60
60
 
61
61
  # Validates the chosen processor.
62
62
  #
63
63
  # @raise [Wukong::Error] if it finds a problem
64
64
  # @return [true]
65
65
  def validate
66
- raise Error.new("Must provide a processor or dataflow to run, via either the --run option or as the first argument") if processor.nil? || processor.empty?
67
- raise Error.new("No such processor or dataflow <#{processor}>") unless registered?(processor)
66
+ raise Error.new("Must provide a processor or dataflow to run, via either the --run option or as the first argument") if dataflow.nil? || dataflow.empty?
67
+ raise Error.new("No such processor or dataflow <#{dataflow}>") unless registered?(dataflow)
68
68
  true
69
69
  end
70
70
 
@@ -72,25 +72,23 @@ module Wukong
72
72
  # # itself.
73
73
  def setup
74
74
  super()
75
- dataflow_class_for(processor).configure(settings) if processor?(processor)
75
+ dataflow_class_for(dataflow).configure(settings) if registered?(dataflow)
76
76
  end
77
77
 
78
- # Runs either the StdioDriver or the TCPDriver, depending on
79
- # what settings were passed.
78
+ # Starts up the driver with the right dataflow and settings.
79
+ #
80
+ # Starts the EventMachine reactor before starting the driver.
80
81
  def run
81
- EM.run do
82
- driver.start(processor, settings)
82
+ EM.run do
83
+ driver.start(dataflow, settings)
83
84
  end
84
85
  end
85
86
 
86
- # The driver this Runner will use.
87
- #
88
- # Defaults to the Wukong::Local::StdioDriver, but will use the
89
- # TcpDriver if it has a :port setting defined.
87
+ # The driver class used to run the dataflow.
90
88
  #
91
- # @return [Wukong::Local::TCPDriver, Wukong::Local::StdioDriver]
89
+ # @return [Class, #start]
92
90
  def driver
93
- (settings[:tcp_port] ? TCPDriver : StdioDriver)
91
+ StdioDriver
94
92
  end
95
93
 
96
94
  end
@@ -1,33 +1,74 @@
1
- require_relative('event_machine_driver')
2
1
  module Wukong
3
2
  module Local
4
3
 
5
4
  # A class for driving processors over the STDIN/STDOUT protocol.
5
+ #
6
+ # Relies on EventMachine's [LineAndTextProtocol](http://eventmachine.rubyforge.org/EventMachine/Protocols/LineText2.html).
6
7
  class StdioDriver < EM::P::LineAndTextProtocol
7
- include EventMachineDriver
8
- include Processor::StdoutProcessor
9
- include Logging
10
8
 
9
+ include DriverMethods
10
+ include Logging
11
+
12
+ #
13
+ # == Startup ==
14
+ #
15
+
16
+ # Start a new StdioDriver.
17
+ #
18
+ # @param [Symbol] label the name of the processor or dataflow to drive
19
+ # @param [Configliere::Param] settings the settings to use
11
20
  def self.start(label, settings = {})
12
21
  EM.attach($stdin, self, label, settings)
13
22
  end
14
23
 
24
+ # :nodoc: Runs the parent initializer with the same arguments, then calls #construct_dataflow with +label+ and +settings+.
25
+ def initialize(label, settings)
26
+ super
27
+ construct_dataflow(label, settings)
28
+ end
29
+
30
# Ensures that $stdout is synced, so each record is written out
# immediately instead of waiting in Ruby's output buffer.
#
# Note that `IO#sync` merely *reads* the current mode; it is the
# assignment form `IO#sync=` that actually turns sync mode on.
#
# @return [true]
def setup
  $stdout.sync = true
end
34
+
35
+ # Adds signal traps for SIGINT and SIGTERM to ensure we capture
36
+ # C-c and friends: each trap logs the signal received and stops the EventMachine reactor.
37
+ def self.add_signal_traps
38
+ Signal.trap('INT') { log.info 'Received SIGINT. Stopping.' ; EM.stop }
39
+ Signal.trap('TERM') { log.info 'Received SIGTERM. Stopping.' ; EM.stop }
40
+ end
41
+
42
+ # Called by EventMachine framework after successfully attaching
43
+ # to $stdin.
44
+ #
45
+ # Adds signal handlers and calls the #setup_dataflow method.
15
46
  def post_init
16
47
  self.class.add_signal_traps
17
48
  setup_dataflow
18
49
  end
19
-
50
+
51
+ #
52
+ # == Reading Input ==
53
+ #
54
+
55
+ # Called by EventMachine framework after successfully reading a
56
+ # line from $stdin.
57
+ #
58
+ # @param [String] line
20
59
  def receive_line line
21
- driver.send_through_dataflow(line)
60
+ send_through_dataflow(line)
22
61
  rescue => e
23
62
  error = Wukong::Error.new(e)
24
- EM.stop
63
+ # EM.stop
25
64
 
26
- # We'd to *raise* `error` here and have it be handled by
27
- # Wukong::Runner.run but we are fighting with EventMachine.
28
- # It seems no matter what we do, EventMachine will swallow any
29
- # Exception raised here (including SystemExit) and exit the
30
- # Ruby process with a return code of 0.
65
+ # We'd like to *raise* `error` here and have it be handled by
66
+ # Wukong::Runner.run but we are fighting with EventMachine.run
67
+ # which executes in the middle.
68
+ #
69
+ # It seems no matter what we do, EventMachine.run will swallow
70
+ # any Exception raised here (including SystemExit) and exit
71
+ # the Ruby process with a return code of 0.
31
72
  #
32
73
  # Instead we just log the message that *would* have gotten
33
74
  # logged by Wukong::Runner.run and leave it to EventMachine to
@@ -35,6 +76,25 @@ module Wukong
35
76
  log.error(error.message)
36
77
  end
37
78
 
79
+ #
80
+ # == Handling Output ==
81
+ #
82
+
83
+ # Writes a record to $stdout using puts, one record per call.
84
+ #
85
+ # @param [#to_s] record the record to print
86
+ def process(record)
87
+ $stdout.puts record
88
+ end
89
+
90
+ #
91
+ # == Shutdown ==
92
+ #
93
+
94
+ # Called by EventMachine framework after EOF from $stdin.
95
+ #
96
+ # Calls #finalize_and_stop_dataflow method and stops the
97
+ # EventMachine reactor.
38
98
  def unbind
39
99
  finalize_and_stop_dataflow
40
100
  EM.stop
@@ -13,7 +13,6 @@ module Wukong
13
13
  # local machine. You can glue processors together
14
14
  class Processor < Hanuman::Stage
15
15
  include Logging
16
- include Vayacondios::Notifications
17
16
 
18
17
  field :action, Whatever, :doc => false
19
18
 
@@ -23,32 +22,12 @@ module Wukong
23
22
  @description = desc if desc
24
23
  @description
25
24
  end
26
-
27
- def consumes(*args)
28
- options = args.extract_options!
29
- @consumes = options[:as]
30
- validate_and_set_serialization(:from, args.first)
31
- end
32
-
33
- def produces(*args)
34
- options = args.extract_options!
35
- @produces = options[:as]
36
- validate_and_set_serialization(:to, args.first)
37
- end
38
25
 
39
- def valid_serializer? label
40
- label
41
- end
42
-
43
- def validate_and_set_serialization(direction, label)
44
- instance_variable_set("@serialization_#{direction}", label) if %w[ tsv json xml ].include?(label.to_s)
45
- end
46
-
47
26
  def configure(settings)
48
27
  settings.description = description if description
49
28
  fields.each_pair do |name, field|
50
29
  next if field.doc == false || field.doc.to_s == 'false'
51
- next if [:log, :notifier].include?(name)
30
+ next if [:log].include?(name)
52
31
  field_props = {}.tap do |props|
53
32
  props[:description] = field.doc unless field.doc == "#{name} field"
54
33
  field_type = (field.type.respond_to?(:product) ? field.type.product : field.type)
@@ -69,14 +48,6 @@ module Wukong
69
48
 
70
49
  end
71
50
 
72
- def expected_record_type(type)
73
- self.class.instance_variable_get("@#{type}")
74
- end
75
-
76
- def expected_serialization(direction)
77
- self.class.instance_variable_get("@serialization_#{direction.to_s}")
78
- end
79
-
80
51
  # When instantiated with a block, the block will replace this
81
52
  # method.
82
53
  #
@@ -1,6 +1,7 @@
1
1
  require_relative("runner/code_loader")
2
2
  require_relative("runner/deploy_pack_loader")
3
3
  require_relative("runner/boot_sequence")
4
+ require_relative("runner/command_runner")
4
5
 
5
6
  module Wukong
6
7
 
@@ -18,6 +19,7 @@ module Wukong
18
19
  include CodeLoader
19
20
  include DeployPackLoader
20
21
  include BootSequence
22
+ include CommandRunner
21
23
 
22
24
  # The settings object that will be configured and booted from.
23
25
  # All plugins will configure this object.
@@ -0,0 +1,44 @@
1
module Wukong
  class Runner

    # Provides methods for executing commandlines.
    #
    # The including class must provide +settings+ (responding to
    # <tt>[]</tt>) and +log+ (responding to +info+).
    module CommandRunner

      private

      # Execute a command composed of the given parts.
      #
      # Blank parts are dropped and the remaining parts are joined
      # with a backslash-newline, so the command reads as a single
      # continued shell line.
      #
      # Will print the command instead if the <tt>--dry_run</tt>
      # option was given.
      #
      # Will *not* raise an error if the command fails.
      #
      # @param [Array<String>] argv
      def execute_command(*argv)
        command = argv.flatten.reject(&:blank?).join(" \\\n ")
        if settings[:dry_run]
          log.info("Dry run:")
          puts command
        else
          output = `#{command}`
          puts output unless output.empty?
        end
      end

      # Execute a command composed of the given parts.
      #
      # Will print the command instead if the <tt>--dry_run</tt>
      # option was given.
      #
      # *Will* raise an error if the command fails.  A dry run never
      # raises because no command was actually executed.
      #
      # @param [Array<String>] argv
      # @raise [Error] if the executed command exits unsuccessfully
      def execute_command!(*argv)
        execute_command(argv)
        # Nothing ran during a dry run, so $? is either nil or left
        # over from an unrelated earlier command; checking it would
        # report a failure that never happened.
        return if settings[:dry_run]
        raise Error.new("Command failed!") unless $?.success?
      end

    end

  end
end
@@ -0,0 +1,33 @@
1
+ module Wukong
2
+
3
+ # Provides a runner for periodically triggering a dataflow or
4
+ # processor.
5
+ module Source
6
+ include Plugin
7
+
8
+ # Configures the given +settings+ object with all settings
9
+ # specific to Wukong::Source for the given +program+ (only 'wu-source' defines any settings).
10
+ #
11
+ # @param [Configliere::Param] settings the settings to configure
12
+ # @param [String] program the name of the currently executing program
13
+ def self.configure settings, program
14
+ case program
15
+ when 'wu-source'
16
+ settings.define :per_sec, description: "Number of events produced per second", type: Float
17
+ settings.define :period, description: "Number of seconds between events (overrides --per_sec)", type: Float
18
+ settings.define :batch_size, description: "Trigger a finalize across the dataflow each time this many records are processed", type: Integer
19
+ end
20
+ end
21
+
22
+ # Boots Wukong::Source using the given +settings+ at the given
23
+ # +root+.  The body is empty, so booting currently does nothing.
24
+ #
25
+ # @param [Configliere::Param] settings the settings to use to boot
26
+ # @param [String] root the root directory to boot in
27
+ def self.boot(settings, root)
28
+ end
29
+
30
+ end
31
+ end
32
+
33
+ require_relative('source/source_runner')
@@ -0,0 +1,74 @@
1
+ module Wukong
2
+ module Source
3
+
4
+ # A driver which works just like the `Wukong::Local::StdioDriver`
5
+ # except it ignores input from `STDIN` and instead generates its
6
+ # own input records according to some periodic schedule. Each
7
+ # consecutive record produced will be an incrementing positive
8
+ # integer (as a string), starting with '1'.
9
+ class SourceDriver < Wukong::Local::StdioDriver
10
+
11
+ include Logging
12
+
13
+ # The index of the record.
14
+ attr_accessor :index
15
+
16
+ # The number of records after which a `Processor#finalize` will
17
+ # be called.
18
+ attr_accessor :batch_size
19
+
20
+ # Runs the inherited post_init, then sets the initial value of `index` to 1 and sets the batch size
21
+ # (only if `settings[:batch_size]` is present and positive once converted with `to_i`).
22
+ def post_init
23
+ super()
24
+ self.index = 1
25
+ self.batch_size = settings[:batch_size].to_i if settings[:batch_size] && settings[:batch_size].to_i > 0
26
+ end
27
+
28
# Starts periodically feeding the processor or dataflow given by
# `label` using the given `settings`.
#
# The number of seconds between events is `settings[:period]` when
# given, otherwise the reciprocal of `settings[:per_sec]`, otherwise
# 1.0.  A `per_sec` that cannot produce a positive, finite interval
# (zero, negative, NaN, non-numeric) also falls back to 1.0: Float
# division by zero returns Infinity instead of raising, so a bare
# `rescue` never catches that case.
#
# The first event is created immediately; subsequent events are
# created from an EventMachine::PeriodicTimer.
#
# @param [String, Symbol] label
# @param [Configliere::Param, Hash] settings
# @return [EventMachine::PeriodicTimer] the timer creating the events
def self.start(label, settings={})
  driver = new(:foobar, label, settings) # the first argument does not appear to matter here
  # NOTE(review): EM::Connection.new may already invoke post_init itself --
  # confirm that this explicit second call is intended.
  driver.post_init

  period = settings[:period] || begin
    interval = 1.0 / settings[:per_sec]
    interval.finite? && interval > 0 ? interval : 1.0
  rescue TypeError, ZeroDivisionError, NoMethodError
    # Missing or non-numeric rate: use the default of one event per second.
    1.0
  end

  driver.create_event
  EventMachine::PeriodicTimer.new(period) { driver.create_event }
end
45
+
46
# Creates a new event using the following steps:
#
# 1. Feeds a record with the existing `index` to the dataflow.
# 2. Finalizes the dataflow if the number of records fed so far is
#    a multiple of the `batch_size`.
# 3. Increments the `index`.
#
# The batch check has to happen *before* the increment: when
# counting from 1, the pre-increment `index` equals the number of
# records already fed, whereas the post-increment value is one
# ahead and would close every first batch one record early.
#
# @see DriverMethods
def create_event
  receive_line(index.to_s)
  finalize_dataflow if batch_size && (index % batch_size).zero?
  self.index += 1
end
59
+
60
+ # Outputs a `record` from the dataflow or processor to `STDOUT`.
61
+ #
62
+ # `STDOUT` is flushed after every record so that output appears
63
+ # right away, preventing the feeling of "no output" when the looping period is
64
+ # long.
65
+ #
66
+ # @param [Object] record the record yielded by the processor or the terminal node(s) of the dataflow
67
+ def process record
68
+ $stdout.puts record
69
+ $stdout.flush
70
+ end
71
+
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,38 @@
1
+ require_relative('source_driver')
2
+ module Wukong
3
+ module Source
4
+
5
+ # Implements the `wu-source` command as a LocalRunner whose driver is the SourceDriver.
6
+ class SourceRunner < Wukong::Local::LocalRunner
7
+
8
+ usage "PROCESSOR|DATAFLOW"
9
+
10
+ description <<-EOF.gsub(/^ {8}/,'')
11
+ wu-source is a tool for using Wukong processors as sources of
12
+ data in streams.
13
+
14
+ Run any Wukong processor as a source for data:
15
+
16
+ $ wu-source fake_log_data
17
+ 205.4.75.208 - 3918471017 [27/Nov/2012:05:06:57 -0600] "GET /products/eget HTTP/1.0" 200 25600
18
+ 63.181.105.15 - 3650805763 [27/Nov/2012:05:06:57 -0600] "GET /products/lacinia-nulla-vitae HTTP/1.0" 200 3790
19
+ 227.190.78.101 - 39543891 [27/Nov/2012:05:06:58 -0600] "GET /products/odio-nulla-nulla-ipsum HTTP/1.0" 200 31718
20
+ ...
21
+
22
+ The fake_log_data processor will receive an event once every
23
+ second. Each event will consist of a single string giving a
24
+ consecutive integer starting with '1' as the first event.
25
+ EOF
26
+
27
+ include Logging
28
+
29
+ # The driver class used by `wu-source`: always SourceDriver.
30
+ #
31
+ # @return [Class] the SourceDriver class itself, not an instance
32
+ def driver
33
+ SourceDriver
34
+ end
35
+
36
+ end
37
+ end
38
+ end