wukong 3.0.0.pre3 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/Gemfile +1 -0
  2. data/README.md +689 -50
  3. data/bin/wu-local +1 -74
  4. data/diagrams/wu_local.dot +39 -0
  5. data/diagrams/wu_local.dot.png +0 -0
  6. data/examples/loadable.rb +2 -0
  7. data/examples/string_reverser.rb +7 -0
  8. data/lib/hanuman/stage.rb +2 -2
  9. data/lib/wukong.rb +21 -10
  10. data/lib/wukong/dataflow.rb +2 -5
  11. data/lib/wukong/doc_helpers.rb +14 -0
  12. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  13. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  14. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  15. data/lib/wukong/driver.rb +11 -1
  16. data/lib/wukong/local.rb +40 -0
  17. data/lib/wukong/local/event_machine_driver.rb +27 -0
  18. data/lib/wukong/local/runner.rb +98 -0
  19. data/lib/wukong/local/stdio_driver.rb +44 -0
  20. data/lib/wukong/local/tcp_driver.rb +47 -0
  21. data/lib/wukong/logger.rb +16 -7
  22. data/lib/wukong/plugin.rb +48 -0
  23. data/lib/wukong/processor.rb +57 -15
  24. data/lib/wukong/rake_helper.rb +6 -0
  25. data/lib/wukong/runner.rb +151 -128
  26. data/lib/wukong/runner/boot_sequence.rb +123 -0
  27. data/lib/wukong/runner/code_loader.rb +52 -0
  28. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  29. data/lib/wukong/runner/help_message.rb +42 -0
  30. data/lib/wukong/spec_helpers.rb +4 -12
  31. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  32. data/lib/wukong/spec_helpers/{integration_driver_matchers.rb → integration_tests/integration_test_matchers.rb} +28 -62
  33. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  34. data/lib/wukong/spec_helpers/shared_examples.rb +19 -10
  35. data/lib/wukong/spec_helpers/unit_tests.rb +134 -0
  36. data/lib/wukong/spec_helpers/{processor_methods.rb → unit_tests/unit_test_driver.rb} +42 -8
  37. data/lib/wukong/spec_helpers/{spec_driver_matchers.rb → unit_tests/unit_test_matchers.rb} +6 -32
  38. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +54 -0
  39. data/lib/wukong/version.rb +1 -1
  40. data/lib/wukong/widget/filters.rb +134 -8
  41. data/lib/wukong/widget/processors.rb +64 -5
  42. data/lib/wukong/widget/reducers/bin.rb +68 -18
  43. data/lib/wukong/widget/reducers/count.rb +12 -0
  44. data/lib/wukong/widget/reducers/group.rb +48 -5
  45. data/lib/wukong/widget/reducers/group_concat.rb +30 -2
  46. data/lib/wukong/widget/reducers/moments.rb +4 -4
  47. data/lib/wukong/widget/reducers/sort.rb +53 -3
  48. data/lib/wukong/widget/serializers.rb +37 -12
  49. data/lib/wukong/widget/utils.rb +1 -1
  50. data/spec/spec_helper.rb +20 -2
  51. data/spec/wukong/driver_spec.rb +2 -0
  52. data/spec/wukong/local/runner_spec.rb +40 -0
  53. data/spec/wukong/local_spec.rb +6 -0
  54. data/spec/wukong/logger_spec.rb +49 -0
  55. data/spec/wukong/processor_spec.rb +22 -0
  56. data/spec/wukong/runner_spec.rb +128 -8
  57. data/spec/wukong/widget/filters_spec.rb +28 -10
  58. data/spec/wukong/widget/processors_spec.rb +5 -5
  59. data/spec/wukong/widget/reducers/bin_spec.rb +14 -14
  60. data/spec/wukong/widget/reducers/count_spec.rb +1 -1
  61. data/spec/wukong/widget/reducers/group_spec.rb +7 -6
  62. data/spec/wukong/widget/reducers/moments_spec.rb +2 -2
  63. data/spec/wukong/widget/reducers/sort_spec.rb +1 -1
  64. data/spec/wukong/widget/serializers_spec.rb +84 -88
  65. data/spec/wukong/wu-local_spec.rb +109 -0
  66. metadata +43 -20
  67. data/bin/wu-server +0 -70
  68. data/lib/wukong/boot.rb +0 -96
  69. data/lib/wukong/configuration.rb +0 -8
  70. data/lib/wukong/emitter.rb +0 -22
  71. data/lib/wukong/server.rb +0 -119
  72. data/lib/wukong/spec_helpers/integration_driver.rb +0 -157
  73. data/lib/wukong/spec_helpers/processor_helpers.rb +0 -89
  74. data/lib/wukong/spec_helpers/spec_driver.rb +0 -28
  75. data/spec/wukong/local_runner_spec.rb +0 -31
  76. data/spec/wukong/wu_local_spec.rb +0 -125
@@ -1,119 +0,0 @@
1
- module Wukong
2
- module EventMachineServer
3
- include DriverMethods
4
-
5
- def self.included klass
6
- klass.class_eval do
7
- attr_accessor :dataflow, :settings
8
-
9
- def self.add_signal_traps
10
- Signal.trap('INT') { log.info 'Received SIGINT. Stopping.' ; EM.stop }
11
- Signal.trap('TERM') { log.info 'Received SIGTERM. Stopping.' ; EM.stop }
12
- end
13
- end
14
- end
15
-
16
- def initialize(label, settings)
17
- super
18
- @settings = settings
19
- @dataflow = construct_dataflow(label, settings)
20
- end
21
-
22
- end
23
-
24
- class StdioServer < EM::P::LineAndTextProtocol
25
- include EventMachineServer
26
- include Processor::StdoutProcessor
27
- include Logging
28
-
29
- def self.start(label, settings = {})
30
- EM.attach($stdin, self, label, settings)
31
- end
32
-
33
- def post_init
34
- self.class.add_signal_traps
35
- setup_dataflow
36
- end
37
-
38
- def receive_line line
39
- driver.send_through_dataflow(line)
40
- end
41
-
42
- def unbind
43
- finalize_and_stop_dataflow
44
- EM.stop
45
- end
46
-
47
- end
48
-
49
- class TCPServer < EM::P::LineAndTextProtocol
50
- include EventMachineServer
51
- include Processor::BufferedProcessor
52
- include Logging
53
-
54
- def self.start(label, settings = {})
55
- host = settings[:host] || Socket.gethostname
56
- port = settings[:port] || 9000
57
- EM.start_server(host, port, self, label, settings)
58
- log.info "Server started on #{host} on port #{port}"
59
- add_signal_traps
60
- end
61
-
62
- def post_init
63
- port, ip = Socket.unpack_sockaddr_in(get_peername)
64
- log.info "Connected to #{ip} on #{port}"
65
- setup_dataflow
66
- end
67
-
68
- def receive_line line
69
- @buffer = []
70
- operation = proc { driver.send_through_dataflow(line) }
71
- callback = proc { flush_buffer @buffer }
72
- EM.defer(operation, callback)
73
- end
74
-
75
- def flush_buffer records
76
- send_data(records.join("\n") + "\n")
77
- records.clear
78
- end
79
-
80
- def unbind
81
- EM.stop
82
- end
83
-
84
- end
85
- end
86
-
87
- class StupidServer
88
-
89
- attr_accessor :dataflow, :settings
90
-
91
- def initialize(label, settings)
92
- @settings = settings
93
- builder = (Wukong.registry.retrieve(label.to_sym) or raise Wukong::Error.new("No such processor or dataflow: #{label}"))
94
- dataflow = builder.build(settings)
95
- @dataflow = dataflow.respond_to?(:stages) ? dataflow.directed_sort.map{ |name| dataflow.stages[name] } : [ dataflow ]
96
- @dataflow << self
97
- end
98
-
99
- def dumb_driver
100
- @dumb_driver ||= Wukong::Driver.new(dataflow)
101
- end
102
-
103
- def stop() ; end
104
- def setup() ; end
105
- def process(record) $stdout.puts record ; end
106
-
107
- def run!
108
- dataflow.each(&:setup)
109
-
110
- while line = $stdin.readline.chomp rescue nil do
111
- dumb_driver.send_through_dataflow(line)
112
- end
113
- dataflow.each do |stage|
114
- stage.finalize(&dumb_driver.advance(stage)) if stage.respond_to?(:finalize)
115
- stage.stop
116
- end
117
-
118
- end
119
- end
@@ -1,157 +0,0 @@
1
- require 'open3'
2
-
3
- module Wukong
4
- module SpecHelpers
5
-
6
- # Provides a `command` method for writing integration tests for
7
- # commands.
8
- module IntegrationRunner
9
-
10
- # Spawn a command and capture its STDOUT, STDERR, and exit code.
11
- #
12
- # The `args` will be joined together into a command line.
13
- #
14
- # It is expected that you will use the matchers defined in
15
- # IntegrationMatchers in your integration tests:
16
- #
17
- # @example Check output of 'ls' includes a string 'foo.txt'
18
- # it "lists files" do
19
- # command('ls').should have_output('foo.txt')
20
- # end
21
- #
22
- # @example More complicated
23
- # context "long format" do
24
- # it "lists files with timestamps" do
25
- # command('ls', '-l').should have_output('foo.txt', /\w+ \d+ \d+:\d+/)
26
- # end
27
- # end
28
- #
29
- # @param [Array<String>] args
30
- #
31
- # @overload command(*args, options={})
32
- # If the last element of `args` is a Hash it will be used for
33
- # options.
34
- #
35
- # The :env option specifies the command line environment to
36
- # use for the command. By default this will be the value of
37
- # the Ruby process's own `ENV` variable. If running in a
38
- # context in which the `integration_env` method is defined,
39
- # its return value will be merged on top of `ENV`. An
40
- # explicitly provided :env option will again be merged on top.
41
- #
42
- # The :cwd option specifies the working directory to start in.
43
- # It defaults to the value of <tt>Dir.pwd</tt>
44
- #
45
- # @param [Array<String>] args
46
- # @param [Hash] options
47
- # @option options [Hash] env the shell environment to spawn the command with
48
- # @option options [Hash] cwd the directory to execute the command in
49
- def command *args
50
- a = args.flatten.compact
51
- options = (a.last.is_a?(Hash) ? a.pop : {})
52
-
53
- env = ENV.to_hash.dup
54
- env.merge!(integration_env) if respond_to?(:integration_env)
55
- env.merge!(options[:env] || {})
56
-
57
- cwd = options[:cwd]
58
- cwd ||= (respond_to?(:integration_cwd) ? integration_cwd : Dir.pwd)
59
-
60
- IntegrationDriver.new(a, cwd: cwd, env: env)
61
- end
62
- end
63
-
64
- # A driver for running commands in a subprocess.
65
- class IntegrationDriver
66
-
67
- # The command to execute
68
- attr_accessor :cmd
69
-
70
- # The directory in which to execute the command.
71
- attr_accessor :cwd
72
-
73
- # The ID of the spawned subprocess (while it was running).
74
- attr_accessor :pid
75
-
76
- # The STDOUT of the spawned process.
77
- attr_accessor :stdout
78
-
79
- # The STDERR of the spawned process.
80
- attr_accessor :stderr
81
-
82
- # The exit code of the spawned process.
83
- attr_accessor :exit_code
84
-
85
- # Run the command and capture its outputs and exit code.
86
- #
87
- # @return [true, false]
88
- def run!
89
- return false if ran?
90
- FileUtils.cd(cwd) do
91
- Open3.popen3(env, cmd) do |i, o, e, wait_thr|
92
- self.pid = wait_thr.pid
93
-
94
- @inputs.each { |input| i.puts(input) }
95
- i.close
96
-
97
- self.stdout = o.read
98
- self.stderr = e.read
99
- self.exit_code = wait_thr.value.to_i
100
- end
101
- end
102
- @ran = true
103
- end
104
-
105
- # Initialize a new IntegrationDriver to run a given command.
106
- def initialize args, options
107
- @args = args
108
- @env = options[:env]
109
- @cwd = options[:cwd]
110
- @inputs = []
111
- end
112
-
113
- def cmd
114
- @args.compact.map(&:to_s).join(' ')
115
- end
116
-
117
- def on *events
118
- @inputs.concat(events)
119
- self
120
- end
121
- alias_method :<, :on
122
-
123
- def in dir
124
- @cwd = dir
125
- self
126
- end
127
-
128
- def using env
129
- @env = env
130
- self
131
- end
132
-
133
- def env
134
- ENV.to_hash.merge(@env || {})
135
- end
136
-
137
- def ran?
138
- @ran
139
- end
140
-
141
- def cmd_summary
142
- [
143
- cmd,
144
- "with env #{env_summary}",
145
- "in dir #{cwd}"
146
- ].join("\n")
147
- end
148
-
149
- def env_summary
150
- { "PATH" => env["PATH"], "RUBYLIB" => env["RUBYLIB"] }.inspect
151
- end
152
-
153
- end
154
- end
155
- end
156
-
157
-
@@ -1,89 +0,0 @@
1
- module Wukong
2
- module SpecHelpers
3
- module ProcessorHelpers
4
-
5
- # Creates a new processor in a variety of convenient ways.
6
- #
7
- # Most simply, called without args, will return a new instance of
8
- # a the klass named in the containing `describe` or `context`:
9
- #
10
- # context MyApp::Tokenizer do
11
- # it "uses whitespace as the default separator between tokens" do
12
- # processor.separator.should == /\s+/
13
- # end
14
- # end
15
- #
16
- # if your processor has been registered (you created it with the
17
- # <tt>Wukong.processor</tt> helper method or otherwise
18
- # registered it yourself) then you can use its name:
19
- #
20
- # context :tokenizer do
21
- # it "uses whitespace as the default separator between tokens" do
22
- # processor.separator.should == /\s+/
23
- # end
24
- # end
25
- #
26
- # The `processor` method can also be used inside RSpec's
27
- # `subject` and `let` methods:
28
- #
29
- # context "with no arguments" do
30
- # subject { processor }
31
- # it "uses whitespace as the default separator between tokens" do
32
- # separator.should == /\s+/
33
- # end
34
- # end
35
- # end
36
- #
37
- # and you can easily pass arguments, just like you would on the
38
- # command line or in a dataflow definition:
39
- #
40
- # context "with arguments" do
41
- # subject { processor(separator: ' ') }
42
- # it "uses whitespace as the default separator between tokens" do
43
- # separator.should == ' '
44
- # end
45
- # end
46
- # end
47
- #
48
- # You can even name the processor directly if you want to:
49
- #
50
- # context "tokenizers" do
51
- # let(:default_tokenizer) { processor(:tokenizer) }
52
- # let(:complex_tokenizer) { processor(:complex_tokenizer, stemming: true) }
53
- # let(:french_tokenizer) { processor(:complex_tokenizer, stemming: true) }
54
- # ...
55
- # end
56
- def processor *args, &block
57
- options = args.extract_options!
58
- name = args.first || self.class.description
59
- create_processor(name, options, &block)
60
- end
61
- alias_method :flow, :processor
62
-
63
- # Is the given +klass+ a Wukong::Processor?
64
- #
65
- # @param [Class] klass
66
- # @return [true, false]
67
- def processor? klass
68
- klass.build.is_a?(Processor)
69
- end
70
-
71
- # :nodoc:
72
- def create_processor name_or_klass, options={}, &block
73
- if name_or_klass.is_a?(Class)
74
- klass = name_or_klass
75
- else
76
- klass = Wukong.registry.retrieve(name_or_klass.to_s.to_sym)
77
- raise Error.new("Could not find a Wukong::Processor class named '#{name_or_klass}'") if klass.nil?
78
- end
79
- raise Error.new("#{klass} is not a subclass of Wukong::Processor") unless processor?(klass)
80
- settings = Configliere::Param.new
81
- Wukong.boot!(settings)
82
- proc = klass.build(settings.merge(options))
83
- proc.setup
84
- proc.instance_eval(&block) if block_given?
85
- proc
86
- end
87
- end
88
- end
89
- end
@@ -1,28 +0,0 @@
1
- module Wukong
2
- module SpecHelpers
3
- class SpecDriver < Array
4
-
5
- attr_reader :processor
6
-
7
- def initialize processor
8
- super()
9
- @processor = processor
10
- end
11
-
12
- def run
13
- return false unless processor
14
- processor.given_records.each do |input|
15
- processor.process(input) do |output|
16
- self << output
17
- end
18
- end
19
- processor.finalize do |output|
20
- self << output
21
- end
22
- processor.stop
23
- self
24
- end
25
-
26
- end
27
- end
28
- end
@@ -1,31 +0,0 @@
1
- require 'spec_helper'
2
- # require 'wukong'
3
- # require 'wukong/local_runner'
4
-
5
- # describe Wukong::LocalRunner, :examples_spec => true, :helpers => true do
6
-
7
- # context 'examples' do
8
-
9
- # subject{
10
- # test_sink = test_sink()
11
- # Wukong.dataflow(:integers) do
12
- # input :default, Wukong::Source::Integers.new(:size => 100)
13
- # output :default, test_sink
14
-
15
- # input(:default) >
16
- # map(&:to_s) >
17
- # re(/..+/) >
18
- # map(&:reverse) >
19
- # limit(20) >
20
- # output(:default)
21
- # end
22
- # Wukong::LocalRunner.receive(:flow => Wukong.dataflow(:integers))
23
- # }
24
-
25
- # it 'runs' do
26
- # subject.run(:default)
27
- # subject.flow.output(:default).records.should == %w[01 11 21 31 41 51 61 71 81 91 02 12 22 32 42 52 62 72 82 92]
28
- # end
29
-
30
- # end
31
- # end
@@ -1,125 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe 'wu-local' do
4
-
5
- let(:input) { %w[1 2 3] }
6
-
7
- context "without any arguments" do
8
- subject { command('wu-local') }
9
- it {should exit_with(:non_zero) }
10
- it "displays help on STDERR" do
11
- should have_stderr("usage: wu-local")
12
- end
13
- end
14
-
15
- context "running outside any Ruby project" do
16
- subject { command('wu-local count').in(examples_dir('empty')) < input }
17
- it { should exit_with(0) }
18
- it "runs the processor" do
19
- should have_stdout("3")
20
- end
21
- context "when passed a BUNDLE_GEMFILE" do
22
- context "that doesn't belong to a deploy pack" do
23
- subject { command('wu-local count').in(examples_dir('empty')).using(integration_env.merge("BUNDLE_GEMFILE" => examples_dir('ruby_project', 'Gemfile').to_s)) < input }
24
- it { should exit_with(0) }
25
- it "runs the processor" do
26
- should have_stdout("3")
27
- end
28
- end
29
- context "that belongs to a deploy pack" do
30
- subject { command('wu-local count').in(examples_dir('empty')).using(integration_env.merge("BUNDLE_GEMFILE" => examples_dir('deploy_pack', 'Gemfile').to_s)) < input }
31
- it { should exit_with(0) }
32
- it "runs the processor" do
33
- should have_stdout("3")
34
- end
35
- context "loading the deploy pack" do
36
- subject { command('wu-local string_reverser').in(examples_dir('empty')).using(integration_env.merge("BUNDLE_GEMFILE" => examples_dir('deploy_pack', 'Gemfile').to_s)) < 'hi' }
37
- it { should exit_with(0) }
38
- it "runs the processor" do
39
- should have_stdout("ih")
40
- end
41
- end
42
- end
43
- end
44
- end
45
-
46
- context "running within a Ruby project" do
47
- context "at its root" do
48
- subject { command('wu-local count').in(examples_dir('ruby_project')) < input }
49
- it { should exit_with(0) }
50
- it "runs the processor" do
51
- should have_stdout("3")
52
- end
53
- end
54
- context "deep within it" do
55
- subject { command('wu-local count').in(examples_dir('ruby_project')) < input }
56
- it { should exit_with(0) }
57
- it "runs the processor" do
58
- should have_stdout("3")
59
- end
60
- end
61
- end
62
-
63
- context "running within a deploy pack" do
64
- context "at its root" do
65
- subject { command('wu-local count').in(examples_dir('deploy_pack')) < input }
66
- it { should exit_with(0) }
67
- it "runs the processor" do
68
- should have_stdout("3")
69
- end
70
- context "loading the deploy pack" do
71
- subject { command('wu-local string_reverser').in(examples_dir('deploy_pack')) < 'hi' }
72
- it { should exit_with(0) }
73
- it "runs the processor" do
74
- should have_stdout("ih")
75
- end
76
- end
77
- end
78
- context "deep within it" do
79
- subject { command('wu-local count').in(examples_dir('deploy_pack')) < input }
80
- it { should exit_with(0) }
81
- it "runs the processor" do
82
- should have_stdout("3")
83
- end
84
- context "loading the deploy pack" do
85
- subject { command('wu-local string_reverser').in(examples_dir('deploy_pack')) < 'hi' }
86
- it { should exit_with(0) }
87
- it "runs the processor" do
88
- should have_stdout("ih")
89
- end
90
- end
91
- end
92
- end
93
-
94
- # context "running within a deploy pack" do
95
- # context "at its root" do
96
- # let(:subject) { command('wu-local', :cwd => examples_dir('deploy_pack')) }
97
- # end
98
- # context "deep within it" do
99
- # let(:subject) { command('wu-local', :cwd => examples_dir('deploy_pack', 'a','b','c')) }
100
- # end
101
- # end
102
-
103
- # context "in local mode" do
104
- # context "on a map-only job" do
105
- # let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
106
- # it { should exit_with(0) }
107
- # it { should have_stdout('Shall', 'I', 'compare', 'thee', 'to', 'a', "summer's", 'day') }
108
- # end
109
-
110
- # context "on a map-reduce job" do
111
- # let(:subject) { command('wu-hadoop', example_script('word_count.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
112
- # it { should exit_with(0) }
113
- # it { should have_stdout(/complexion\s+1/, /Death\s+1/, /temperate\s+1/) }
114
- # end
115
- # end
116
-
117
- # context "in Hadoop mode" do
118
- # context "on a map-only job" do
119
- # let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=hadoop", "--input=/data/in", "--output=/data/out", "--dry_run") }
120
- # it { should exit_with(0) }
121
- # it { should have_stdout(%r{jar.*hadoop.*streaming.*\.jar}, %r{-mapper.+tokenizer\.rb}, %r{-input.*/data/in}, %r{-output.*/data/out}) }
122
- # end
123
- # end
124
-
125
- end