wukong 3.0.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +1 -1
  3. data/README.md +253 -45
  4. data/bin/wu +34 -0
  5. data/bin/wu-source +5 -0
  6. data/examples/Gemfile +0 -1
  7. data/examples/deploy_pack/Gemfile +0 -1
  8. data/examples/improver/tweet_summary.rb +73 -0
  9. data/examples/ruby_project/Gemfile +0 -1
  10. data/examples/splitter.rb +94 -0
  11. data/examples/twitter.rb +5 -0
  12. data/lib/hanuman.rb +1 -1
  13. data/lib/hanuman/graph.rb +39 -22
  14. data/lib/hanuman/stage.rb +46 -13
  15. data/lib/hanuman/tree.rb +67 -0
  16. data/lib/wukong.rb +6 -1
  17. data/lib/wukong/dataflow.rb +19 -48
  18. data/lib/wukong/driver.rb +176 -65
  19. data/lib/wukong/{local → driver}/event_machine_driver.rb +1 -13
  20. data/lib/wukong/driver/wiring.rb +68 -0
  21. data/lib/wukong/local.rb +6 -4
  22. data/lib/wukong/local/runner.rb +14 -16
  23. data/lib/wukong/local/stdio_driver.rb +72 -12
  24. data/lib/wukong/processor.rb +1 -30
  25. data/lib/wukong/runner.rb +2 -0
  26. data/lib/wukong/runner/command_runner.rb +44 -0
  27. data/lib/wukong/source.rb +33 -0
  28. data/lib/wukong/source/source_driver.rb +74 -0
  29. data/lib/wukong/source/source_runner.rb +38 -0
  30. data/lib/wukong/spec_helpers/shared_examples.rb +0 -1
  31. data/lib/wukong/spec_helpers/unit_tests.rb +6 -5
  32. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +4 -14
  33. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +7 -8
  34. data/lib/wukong/version.rb +1 -1
  35. data/lib/wukong/widget/echo.rb +55 -0
  36. data/lib/wukong/widget/{processors.rb → extract.rb} +0 -106
  37. data/lib/wukong/widget/filters.rb +15 -0
  38. data/lib/wukong/widget/logger.rb +56 -0
  39. data/lib/wukong/widget/operators.rb +82 -0
  40. data/lib/wukong/widget/reducers.rb +2 -0
  41. data/lib/wukong/widget/reducers/improver.rb +71 -0
  42. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  43. data/lib/wukong/widget/serializers.rb +21 -6
  44. data/lib/wukong/widgets.rb +6 -3
  45. data/spec/hanuman/graph_spec.rb +73 -10
  46. data/spec/hanuman/stage_spec.rb +15 -0
  47. data/spec/hanuman/tree_spec.rb +119 -0
  48. data/spec/spec_helper.rb +13 -1
  49. data/spec/support/example_test_helpers.rb +0 -1
  50. data/spec/support/model_test_helpers.rb +1 -1
  51. data/spec/support/shared_context_for_graphs.rb +57 -0
  52. data/spec/support/shared_examples_for_builders.rb +8 -15
  53. data/spec/wukong/driver_spec.rb +152 -0
  54. data/spec/wukong/local/runner_spec.rb +1 -12
  55. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  56. data/spec/wukong/processor_spec.rb +0 -1
  57. data/spec/wukong/runner_spec.rb +2 -2
  58. data/spec/wukong/source_spec.rb +6 -0
  59. data/spec/wukong/widget/extract_spec.rb +101 -0
  60. data/spec/wukong/widget/logger_spec.rb +23 -0
  61. data/spec/wukong/widget/operators_spec.rb +25 -0
  62. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  63. data/spec/wukong/wu-source_spec.rb +32 -0
  64. data/spec/wukong/wu_spec.rb +14 -0
  65. data/wukong.gemspec +1 -2
  66. metadata +45 -28
  67. data/lib/wukong/local/tcp_driver.rb +0 -47
  68. data/spec/wu/geo/geolocated_spec.rb +0 -247
  69. data/spec/wukong/widget/processors_spec.rb +0 -125
@@ -16,13 +16,15 @@ module Wukong
16
16
  def self.configure settings, program
17
17
  case program
18
18
  when 'wu-local'
19
- settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of the given path.", flag: 'r'
20
- settings.define :tcp_port, description: "Consume TCP requests on the given port instead of lines over STDIN", type: Integer, flag: 't'
19
+ settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of first argument", flag: 'r'
21
20
 
22
21
  settings.define :from, description: "Parse input from given data format (json, tsv, &c.) before processing"
23
22
  settings.define :to, description: "Convert input to given data format (json, tsv, &c.) before emitting"
24
-
25
- settings.define :consumes, description: "Parse input as instances of given model class before processing", type: Class
23
+ settings.define :as, description: "Call Class.receive on each input (will run after --from)", type: Class
24
+ when 'wu-source'
25
+ settings.define :per_sec, description: "Number of events produced per second", type: Float
26
+ settings.define :period, description: "Number of seconds between events (overrides --per_sec)", type: Float
27
+ settings.define :batch_size, description: "Trigger a finalize across the dataflow each time this many records are processed", type: Integer
26
28
  end
27
29
  end
28
30
 
@@ -1,5 +1,4 @@
1
1
  require_relative 'stdio_driver'
2
- require_relative 'tcp_driver'
3
2
 
4
3
  module Wukong
5
4
  module Local
@@ -44,10 +43,10 @@ module Wukong
44
43
  clever
45
44
  EOF
46
45
 
47
- # Returns the name of the processor we're going to run.
46
+ # Returns the name of the dataflow we're going to run.
48
47
  #
49
48
  # @return [String]
50
- def processor
49
+ def dataflow
51
50
  arg = args.first
52
51
  basename = File.basename(arg.to_s, '.rb')
53
52
 
@@ -57,14 +56,15 @@ module Wukong
57
56
  else arg
58
57
  end
59
58
  end
59
+ alias_method :processor, :dataflow
60
60
 
61
61
  # Validates the chosen processor.
62
62
  #
63
63
  # @raise [Wukong::Error] if it finds a problem
64
64
  # @return [true]
65
65
  def validate
66
- raise Error.new("Must provide a processor or dataflow to run, via either the --run option or as the first argument") if processor.nil? || processor.empty?
67
- raise Error.new("No such processor or dataflow <#{processor}>") unless registered?(processor)
66
+ raise Error.new("Must provide a processor or dataflow to run, via either the --run option or as the first argument") if dataflow.nil? || dataflow.empty?
67
+ raise Error.new("No such processor or dataflow <#{dataflow}>") unless registered?(dataflow)
68
68
  true
69
69
  end
70
70
 
@@ -72,25 +72,23 @@ module Wukong
72
72
  # # itself.
73
73
  def setup
74
74
  super()
75
- dataflow_class_for(processor).configure(settings) if processor?(processor)
75
+ dataflow_class_for(dataflow).configure(settings) if registered?(dataflow)
76
76
  end
77
77
 
78
- # Runs either the StdioDriver or the TCPDriver, depending on
79
- # what settings were passed.
78
+ # Starts up the driver with the right dataflow and settings.
79
+ #
80
+ # Starts the EventMachine reactor before starting the driver.
80
81
  def run
81
- EM.run do
82
- driver.start(processor, settings)
82
+ EM.run do
83
+ driver.start(dataflow, settings)
83
84
  end
84
85
  end
85
86
 
86
- # The driver this Runner will use.
87
- #
88
- # Defaults to the Wukong::Local::StdioDriver, but will use the
89
- # TcpDriver if it has a :port setting defined.
87
+ # The driver class this runner starts in #run.
90
88
  #
91
- # @return [Wukong::Local::TCPDriver, Wukong::Local::StdioDriver]
89
+ # @return [Class, #start]
92
90
  def driver
93
- (settings[:tcp_port] ? TCPDriver : StdioDriver)
91
+ StdioDriver
94
92
  end
95
93
 
96
94
  end
@@ -1,33 +1,74 @@
1
- require_relative('event_machine_driver')
2
1
  module Wukong
3
2
  module Local
4
3
 
5
4
  # A class for driving processors over the STDIN/STDOUT protocol.
5
+ #
6
+ # Relies on EventMachine's [LineAndTextProtocol](http://eventmachine.rubyforge.org/EventMachine/Protocols/LineText2.html).
6
7
  class StdioDriver < EM::P::LineAndTextProtocol
7
- include EventMachineDriver
8
- include Processor::StdoutProcessor
9
- include Logging
10
8
 
9
+ include DriverMethods
10
+ include Logging
11
+
12
+ #
13
+ # == Startup ==
14
+ #
15
+
16
+ # Start a new StdioDriver.
17
+ #
18
+ # @param [Symbol] label the name of the processor or dataflow to drive
19
+ # @param [Configliere::Param] settings the settings to use
11
20
  def self.start(label, settings = {})
12
21
  EM.attach($stdin, self, label, settings)
13
22
  end
14
23
 
24
# Builds a driver for the processor or dataflow named by +label+.
#
# Both arguments are handed up to the superclass initializer and
# then passed to #construct_dataflow.
#
# @param [Symbol] label the name of the processor or dataflow to drive
# @param [Configliere::Param] settings the settings to use
def initialize(label, settings) # :nodoc:
  super(label, settings)
  construct_dataflow(label, settings)
end
29
+
30
# Ensures that $stdout is synced, i.e. that every write is passed
# straight through instead of waiting in Ruby's internal buffer.
#
# Reading `$stdout.sync` only *queries* the flag; the assignment
# below is what actually switches sync mode on.
#
# @return [true]
def setup
  $stdout.sync = true
end
34
+
35
# Adds signal traps for SIGINT and SIGTERM to ensure we capture
# C-c and friends: each trap logs which signal arrived and then
# stops the EventMachine reactor.
def self.add_signal_traps
  halt = lambda do |signal|
    log.info "Received #{signal}. Stopping."
    EM.stop
  end

  Signal.trap('INT')  { halt.call('SIGINT') }
  Signal.trap('TERM') { halt.call('SIGTERM') }
end
41
+
42
+ # Called by EventMachine framework after successfully attaching
43
+ # to $stdin.
44
+ #
45
+ # Adds signal handlers and calls the #setup_dataflow method.
15
46
  def post_init
16
47
  self.class.add_signal_traps
17
48
  setup_dataflow
18
49
  end
19
-
50
+
51
+ #
52
+ # == Reading Input ==
53
+ #
54
+
55
+ # Called by EventMachine framework after successfully reading a
56
+ # line from $stdin.
57
+ #
58
+ # @param [String] line
20
59
  def receive_line line
21
- driver.send_through_dataflow(line)
60
+ send_through_dataflow(line)
22
61
  rescue => e
23
62
  error = Wukong::Error.new(e)
24
- EM.stop
63
+ # EM.stop
25
64
 
26
- # We'd to *raise* `error` here and have it be handled by
27
- # Wukong::Runner.run but we are fighting with EventMachine.
28
- # It seems no matter what we do, EventMachine will swallow any
29
- # Exception raised here (including SystemExit) and exit the
30
- # Ruby process with a return code of 0.
65
+ # We'd like to *raise* `error` here and have it be handled by
66
+ # Wukong::Runner.run but we are fighting with EventMachine.run
67
+ # which executes in the middle.
68
+ #
69
+ # It seems no matter what we do, EventMachine.run will swallow
70
+ # any Exception raised here (including SystemExit) and exit
71
+ # the Ruby process with a return code of 0.
31
72
  #
32
73
  # Instead we just log the message that *would* have gotten
33
74
  # logged by Wukong::Runner.run and leave it to EventMachine to
@@ -35,6 +76,25 @@ module Wukong
35
76
  log.error(error.message)
36
77
  end
37
78
 
79
+ #
80
+ # == Handling Output ==
81
+ #
82
+
83
# Emits +record+ as a line on $stdout.
#
# @param [#to_s] record the record to print
def process(record)
  $stdout.puts(record)
end
89
+
90
+ #
91
+ # == Shutdown ==
92
+ #
93
+
94
+ # Called by EventMachine framework after EOF from $stdin.
95
+ #
96
+ # Calls #finalize_and_stop_dataflow method and stops the
97
+ # EventMachine reactor.
38
98
  def unbind
39
99
  finalize_and_stop_dataflow
40
100
  EM.stop
@@ -13,7 +13,6 @@ module Wukong
13
13
  # local machine. You can glue processors together
14
14
  class Processor < Hanuman::Stage
15
15
  include Logging
16
- include Vayacondios::Notifications
17
16
 
18
17
  field :action, Whatever, :doc => false
19
18
 
@@ -23,32 +22,12 @@ module Wukong
23
22
  @description = desc if desc
24
23
  @description
25
24
  end
26
-
27
- def consumes(*args)
28
- options = args.extract_options!
29
- @consumes = options[:as]
30
- validate_and_set_serialization(:from, args.first)
31
- end
32
-
33
- def produces(*args)
34
- options = args.extract_options!
35
- @produces = options[:as]
36
- validate_and_set_serialization(:to, args.first)
37
- end
38
25
 
39
- def valid_serializer? label
40
- label
41
- end
42
-
43
- def validate_and_set_serialization(direction, label)
44
- instance_variable_set("@serialization_#{direction}", label) if %w[ tsv json xml ].include?(label.to_s)
45
- end
46
-
47
26
  def configure(settings)
48
27
  settings.description = description if description
49
28
  fields.each_pair do |name, field|
50
29
  next if field.doc == false || field.doc.to_s == 'false'
51
- next if [:log, :notifier].include?(name)
30
+ next if [:log].include?(name)
52
31
  field_props = {}.tap do |props|
53
32
  props[:description] = field.doc unless field.doc == "#{name} field"
54
33
  field_type = (field.type.respond_to?(:product) ? field.type.product : field.type)
@@ -69,14 +48,6 @@ module Wukong
69
48
 
70
49
  end
71
50
 
72
- def expected_record_type(type)
73
- self.class.instance_variable_get("@#{type}")
74
- end
75
-
76
- def expected_serialization(direction)
77
- self.class.instance_variable_get("@serialization_#{direction.to_s}")
78
- end
79
-
80
51
  # When instantiated with a block, the block will replace this
81
52
  # method.
82
53
  #
@@ -1,6 +1,7 @@
1
1
  require_relative("runner/code_loader")
2
2
  require_relative("runner/deploy_pack_loader")
3
3
  require_relative("runner/boot_sequence")
4
+ require_relative("runner/command_runner")
4
5
 
5
6
  module Wukong
6
7
 
@@ -18,6 +19,7 @@ module Wukong
18
19
  include CodeLoader
19
20
  include DeployPackLoader
20
21
  include BootSequence
22
+ include CommandRunner
21
23
 
22
24
  # The settings object that will be configured and booted from.
23
25
  # All plugins will configure this object.
@@ -0,0 +1,44 @@
1
module Wukong
  class Runner

    # Provides methods for executing commandlines.
    #
    # The including class is expected to respond to +settings+ (read
    # for the <tt>:dry_run</tt> key) and +log+.
    module CommandRunner

      private

      # Execute a command composed of the given parts.
      #
      # The parts are flattened, blank parts are dropped, and the
      # remainder is joined with a backslash-newline so the command
      # prints one part per line.
      #
      # Will print the command instead if the <tt>--dry_run</tt>
      # option was given.
      #
      # Will *not* raise an error if the command fails.
      #
      # @param [Array<String>] argv
      def execute_command(*argv)
        command = argv.flatten.reject(&:blank?).join(" \\\n ")
        if settings[:dry_run]
          log.info("Dry run:")
          puts command
        else
          output = `#{command}`
          puts output unless output.empty?
        end
      end

      # Execute a command composed of the given parts.
      #
      # Will print the command instead if the <tt>--dry_run</tt>
      # option was given.
      #
      # *Will* raise an error if the command fails.
      #
      # @param [Array<String>] argv
      # @raise [Error] if the command ran and exited unsuccessfully
      def execute_command!(*argv)
        execute_command(argv)
        # On a dry run nothing was executed, so `$?` is either nil or
        # left over from some earlier, unrelated command; checking it
        # would crash or report a failure that never happened.
        return if settings[:dry_run]
        raise Error.new("Command failed!") unless $?.success?
      end

    end

  end
end
@@ -0,0 +1,33 @@
1
module Wukong

  # Plugin providing a runner that periodically triggers a dataflow
  # or processor.
  module Source
    include Plugin

    # Defines on +settings+ every setting specific to Wukong::Source,
    # but only when the executing +program+ is `wu-source`; for any
    # other program +settings+ is left untouched.
    #
    # @param [Configliere::Param] settings the settings to configure
    # @param [String] program the name of the currently executing program
    def self.configure(settings, program)
      return unless 'wu-source' == program

      settings.define(:per_sec, description: "Number of events produced per second", type: Float)
      settings.define(:period, description: "Number of seconds between events (overrides --per_sec)", type: Float)
      settings.define(:batch_size, description: "Trigger a finalize across the dataflow each time this many records are processed", type: Integer)
    end

    # Boots Wukong::Source using the given +settings+ at the given
    # +root+.  There is nothing to do at boot time, so the body is
    # intentionally empty.
    #
    # @param [Configliere::Param] settings the settings to use to boot
    # @param [String] root the root directory to boot in
    def self.boot(settings, root)
    end

  end
end
32
+
33
+ require_relative('source/source_runner')
@@ -0,0 +1,74 @@
1
module Wukong
  module Source

    # A driver which works just like the `Wukong::Local::StdioDriver`
    # except it ignores input from `STDIN` and instead generates its
    # own input records according to some periodic schedule.  Each
    # consecutive record produced will be an incrementing positive
    # integer (as a string), starting with '1'.
    class SourceDriver < Wukong::Local::StdioDriver

      include Logging

      # Seconds between events when neither a period nor a usable
      # per-second rate has been configured.
      DEFAULT_PERIOD = 1.0

      # The index of the record.
      attr_accessor :index

      # The number of records after which the dataflow is finalized
      # (see #create_event).  Left unset when no positive batch size
      # was configured.
      attr_accessor :batch_size

      # Sets the initial value of `index` to 1 and sets the batch size
      # (only if it's positive).
      def post_init
        super()
        self.index = 1
        requested = settings[:batch_size]
        self.batch_size = requested.to_i if requested && requested.to_i > 0
      end

      # Starts periodically feeding the processor or dataflow given by
      # `label` using the given `settings`.
      #
      # One event is created immediately; further events follow from
      # an EventMachine periodic timer.
      #
      # @param [String, Symbol] label
      # @param [Configliere::Param, Hash] settings
      # @return [EventMachine::PeriodicTimer] the timer producing events
      def self.start(label, settings={})
        # The first argument is not used by this class.
        # NOTE(review): presumably it is what EventMachine::Connection.new
        # takes as the connection signature -- confirm.
        driver = new(:foobar, label, settings)
        # NOTE(review): EventMachine::Connection.new appears to call
        # #post_init itself; if so this explicit call runs it a second
        # time -- confirm against the EventMachine version in use.
        driver.post_init

        period = period_for(settings)
        driver.create_event
        EventMachine::PeriodicTimer.new(period) { driver.create_event }
      end

      # Works out the number of seconds between events.
      #
      # `:period` wins whenever it is set.  Otherwise a positive
      # `:per_sec` rate is inverted.  Anything else (missing, zero,
      # negative or non-numeric rate) falls back to DEFAULT_PERIOD:
      # Float division by zero yields Infinity rather than raising, so
      # it has to be checked for explicitly.
      #
      # @param [Configliere::Param, Hash] settings
      # @return [Numeric] seconds between events
      def self.period_for(settings)
        return settings[:period] if settings[:period]

        rate = settings[:per_sec]
        rate = rate.respond_to?(:to_f) ? rate.to_f : 0.0
        rate > 0 ? (1.0 / rate) : DEFAULT_PERIOD
      end
      private_class_method :period_for

      # Creates a new event using the following steps:
      #
      # 1. Feeds a record with the existing `index` to the dataflow.
      # 2. Finalizes the dataflow if the number of records fed so far
      #    is a multiple of the `batch_size`.
      # 3. Increments the `index`.
      #
      # @see DriverMethods
      def create_event
        receive_line(index.to_s)
        # Records are numbered from 1, so at this point `index` equals
        # the count of records fed so far.  Testing it before the
        # increment makes every batch -- including the first -- contain
        # exactly `batch_size` records.
        finalize_dataflow if batch_size && (index % batch_size).zero?
        self.index += 1
      end

      # Outputs a `record` from the dataflow or processor to `STDOUT`.
      #
      # `STDOUT` will automatically be flushed to force output to
      # prevent the feeling of "no output" when the looping period is
      # long.
      #
      # @param [Object] record the record yielded by the processor or the terminal node(s) of the dataflow
      def process record
        $stdout.puts record
        $stdout.flush
      end

    end
  end
end
@@ -0,0 +1,38 @@
1
+ require_relative('source_driver')
2
+ module Wukong
3
+ module Source
4
+
5
+ # Implements the `wu-source` command as a LocalRunner subclass that supplies its own #driver.
6
+ class SourceRunner < Wukong::Local::LocalRunner
7
+
8
+ usage "PROCESSOR|DATAFLOW"
9
+
10
+ description <<-EOF.gsub(/^ {8}/,'')
11
+ wu-source is a tool for using Wukong processors as sources of
12
+ data in streams.
13
+
14
+ Run any Wukong processor as a source for data:
15
+
16
+ $ wu-source fake_log_data
17
+ 205.4.75.208 - 3918471017 [27/Nov/2012:05:06:57 -0600] "GET /products/eget HTTP/1.0" 200 25600
18
+ 63.181.105.15 - 3650805763 [27/Nov/2012:05:06:57 -0600] "GET /products/lacinia-nulla-vitae HTTP/1.0" 200 3790
19
+ 227.190.78.101 - 39543891 [27/Nov/2012:05:06:58 -0600] "GET /products/odio-nulla-nulla-ipsum HTTP/1.0" 200 31718
20
+ ...
21
+
22
+ The fake_log_data processor will receive an event once every
23
+ second. Each event will consist of a single string giving a
24
+ consecutive integer starting with '1' as the first event.
25
+ EOF
26
+
27
+ include Logging
28
+
29
+ # Returns SourceDriver, the driver class used by `wu-source`.
30
+ #
31
+ # @return [Class]
32
+ def driver
33
+ SourceDriver
34
+ end
35
+
36
+ end
37
+ end
38
+ end
+ end