dynflow 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6ee35eec200e14b25add8941b4b4637e994012053a271898c4abc4a70234942e
4
- data.tar.gz: 720fc9161e5aadff8f165c12f6bb278cfa65435ef7a1d96d93df8614d21a1de8
3
+ metadata.gz: '0678aaa84d4b56ba08bbd68a6e034076f22b9f44bf85110f34a9150403490243'
4
+ data.tar.gz: c91ee787f316b332f8e802557ed31aa6490db2b62d82b595e3397549f5596aef
5
5
  SHA512:
6
- metadata.gz: 1753d21be5307643a16704a27b0e343fe55a7e74330469d1eece503d58cfa9fd04be1da57918e6e09e73f7e2822f2be2bd517e9eea58b89432bee7226d51ab93
7
- data.tar.gz: ac12ce027be3e289227db4a2b5311328342de364c0f5d51f5badc27e7ba484a6f137488ce681d319e050b1a04ef55ac4821d0d87a06d6d70019f3b2c6e889fca
6
+ metadata.gz: '003039069ca213958db31c0dcb6d158c8ab675182881ac4f4edfbafbbec826ca36fe51ce606bbd2621226851ac72bde934995d4aae36a6da43410ee573917e27'
7
+ data.tar.gz: 5321596fe890b8398f3e207c39991a72febf76f6238bc4c5c91df1634e66ded7149574624846fead73d52270a089a429928bbd8b3d4f6f33318ad6c880180f95
@@ -20,3 +20,7 @@ Style/MultilineOperationIndentation:
20
20
  # Cop supports --auto-correct.
21
21
  Style/EmptyLines:
22
22
  Enabled: true
23
+
24
+ Metrics/ModuleLength:
25
+ Exclude:
26
+ - test/**/*
data/Gemfile CHANGED
@@ -41,3 +41,7 @@ group :rails do
41
41
  gem 'rails', '>= 4.2.9'
42
42
  gem 'logging'
43
43
  end
44
+
45
+ group :telemetry do
46
+ gem 'statsd-instrument'
47
+ end
@@ -17,6 +17,7 @@ class ExampleHelper
17
17
  config.persistence_adapter = persistence_adapter
18
18
  config.logger_adapter = logger_adapter
19
19
  config.auto_rescue = false
20
+ config.telemetry_adapter = telemetry_adapter
20
21
  yield config if block_given?
21
22
  Dynflow::World.new(config).tap do |world|
22
23
  puts "World #{world.id} started..."
@@ -27,6 +28,14 @@ class ExampleHelper
27
28
  ENV['DB_CONN_STRING'] || 'sqlite:/'
28
29
  end
29
30
 
31
+ def telemetry_adapter
32
+ if (host = ENV['TELEMETRY_STATSD_HOST'])
33
+ Dynflow::TelemetryAdapters::StatsD.new host
34
+ else
35
+ Dynflow::TelemetryAdapters::Dummy.new
36
+ end
37
+ end
38
+
30
39
  def persistence_adapter
31
40
  Dynflow::PersistenceAdapters::Sequel.new persistence_conn_string
32
41
  end
@@ -0,0 +1,65 @@
1
+ ---
2
+ mappings:
3
+ - name: dynflow_active_execution_plans
4
+ match: dynflow_active_execution_plans.*.*.*
5
+ labels:
6
+ action: "$1"
7
+ world: "$2"
8
+ state: "$3"
9
+ help: The number of active execution plans
10
+ - name: dynflow_active_workers
11
+ match: dynflow_active_workers.*.*
12
+ labels:
13
+ queue: "$1"
14
+ world: "$2"
15
+ help: The number of currently busy workers
16
+ - name: dynflow_queue_size
17
+ match: dynflow_queue_size.*.*
18
+ labels:
19
+ queue: "$1"
20
+ world: "$2"
21
+ help: The number of events in the queue
22
+ - name: dynflow_connector_envelopes
23
+ match: dynflow_connector_envelopes.*.*
24
+ labels:
25
+ world: "$1"
26
+ direction: "$2"
27
+ help: The number of envelopes handled by a connector
28
+ - name: dynflow_finished_execution_plans
29
+ match: dynflow_finished_execution_plans.*.*.*
30
+ labels:
31
+ action: "$1"
32
+ world: "$2"
33
+ result: "$3"
34
+ help: The number of execution plans
35
+ - name: dynflow_step_execution_time
36
+ match: dynflow_step_execution_time.*.*
37
+ labels:
38
+ action: "$1"
39
+ phase: "$2"
40
+ help: The time spent executing a step
41
+ buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200]
42
+ timer_type: histogram
43
+ - name: dynflow_step_real_time
44
+ match: dynflow_step_real_time.*.*
45
+ labels:
46
+ action: "$1"
47
+ phase: "$2"
48
+ help: The time between the start and end of the step
49
+ buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200]
50
+ timer_type: histogram
51
+ - name: dynflow_worker_events
52
+ match: dynflow_worker_events.*.*.*
53
+ labels:
54
+ queue: "$1"
55
+ world: "$2"
56
+ worker: "$3"
57
+ help: The number of processed events
58
+ - name: dynflow_persistence
59
+ match: dynflow_persistence.*.*
60
+ labels:
61
+ world: "$1"
62
+ method: "$2"
63
+ help: The time spent communicating with the database
64
+ buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200]
65
+ timer_type: histogram
@@ -50,6 +50,7 @@ module Dynflow
50
50
  require 'dynflow/delayed_executors'
51
51
  require 'dynflow/semaphores'
52
52
  require 'dynflow/throttle_limiter'
53
+ require 'dynflow/telemetry'
53
54
  require 'dynflow/config'
54
55
 
55
56
  if defined? ::ActiveJob
@@ -511,6 +511,7 @@ module Dynflow
511
511
  end
512
512
  end
513
513
 
514
+ # TODO: This is getting out of hand, refactoring needed
514
515
  def execute_run(event)
515
516
  phase! Run
516
517
  @world.logger.debug format('%13s %s:%2d got event %s',
@@ -544,7 +545,6 @@ module Dynflow
544
545
 
545
546
  check_serializable :output
546
547
  end
547
-
548
548
  else
549
549
  raise "wrong state #{state} when event:#{event}"
550
550
  end
@@ -45,6 +45,10 @@ module Dynflow
45
45
  def label
46
46
  input[:job_class]
47
47
  end
48
+
49
+ def rescue_strategy
50
+ Action::Rescue::Skip
51
+ end
48
52
  end
49
53
  end
50
54
  end
@@ -182,6 +182,10 @@ module Dynflow
182
182
  './backup'
183
183
  end
184
184
 
185
+ config_attr :telemetry_adapter, ::Dynflow::TelemetryAdapters::Abstract do |world|
186
+ ::Dynflow::TelemetryAdapters::Dummy.new
187
+ end
188
+
185
189
  def validate(config_for_world)
186
190
  if defined? ::ActiveRecord::Base
187
191
  begin
@@ -8,7 +8,11 @@ module Dynflow
8
8
  raise NotImplementedError
9
9
  end
10
10
 
11
- def stop_listening(world)
11
+ def stop_receiving_new_work(_, timeout = nil)
12
+ raise NotImplementedError
13
+ end
14
+
15
+ def stop_listening(world, timeout = nil)
12
16
  raise NotImplementedError
13
17
  end
14
18
 
@@ -24,6 +28,7 @@ module Dynflow
24
28
  # between worlds: we need to know the one to send the message to
25
29
  def receive(world, envelope)
26
30
  Type! envelope, Dispatcher::Envelope
31
+ Telemetry.with_instance { |t| t.increment_counter(:dynflow_connector_envelopes, 1, :world => world.id, :direction => 'incoming') }
27
32
  match(envelope.message,
28
33
  (on Dispatcher::Ping do
29
34
  response_envelope = envelope.build_response_envelope(Dispatcher::Pong, world)
@@ -159,15 +159,16 @@ module Dynflow
159
159
  @core.ask([:start_listening, world])
160
160
  end
161
161
 
162
- def stop_receiving_new_work(_)
163
- @core.ask(:stop_receiving_new_work).wait
162
+ def stop_receiving_new_work(_, timeout = nil)
163
+ @core.ask(:stop_receiving_new_work).wait(timeout)
164
164
  end
165
165
 
166
- def stop_listening(_)
167
- @core.ask(:stop_listening).then { @core.ask(:terminate!) }.wait
166
+ def stop_listening(_, timeout = nil)
167
+ @core.ask(:stop_listening).then { @core.ask(:terminate!) }.wait(timeout)
168
168
  end
169
169
 
170
170
  def send(envelope)
171
+ Telemetry.with_instance { |t| t.increment_counter(:dynflow_connector_envelopes, 1, :world => envelope.sender_id, :direction => 'outgoing') }
171
172
  @core.ask([:handle_envelope, envelope])
172
173
  end
173
174
  end
@@ -55,15 +55,16 @@ module Dynflow
55
55
  @core.ask([:start_listening, world])
56
56
  end
57
57
 
58
- def stop_receiving_new_work(world)
59
- @core.ask([:stop_receiving_new_work, world]).wait
58
+ def stop_receiving_new_work(world, timeout = nil)
59
+ @core.ask([:stop_receiving_new_work, world]).wait(timeout)
60
60
  end
61
61
 
62
- def stop_listening(world)
63
- @core.ask([:stop_listening, world]).wait
62
+ def stop_listening(world, timeout = nil)
63
+ @core.ask([:stop_listening, world]).wait(timeout)
64
64
  end
65
65
 
66
66
  def send(envelope)
67
+ Telemetry.with_instance { |t| t.increment_counter(:dynflow_connector_envelopes, 1, :world => envelope.sender_id) }
67
68
  @core.ask([:handle_envelope, envelope])
68
69
  end
69
70
  end
@@ -9,8 +9,12 @@ module Dynflow
9
9
 
10
10
  def create_record(record)
11
11
  @sequel_adapter.insert_coordinator_record(record.to_hash)
12
- rescue ::Sequel::UniqueConstraintViolation
13
- raise Coordinator::DuplicateRecordError.new(record)
12
+ rescue Errors::PersistenceError => e
13
+ if e.cause.is_a? ::Sequel::UniqueConstraintViolation
14
+ raise Coordinator::DuplicateRecordError.new(record)
15
+ else
16
+ raise e
17
+ end
14
18
  end
15
19
 
16
20
  def update_record(record)
@@ -0,0 +1,42 @@
1
+ module Dynflow
2
+ module Debug
3
+ module Telemetry
4
+ module Persistence
5
+ methods = [
6
+ :load_action,
7
+ :load_actions,
8
+ :load_action_for_presentation,
9
+ :load_action,
10
+ :load_actions,
11
+ :load_action_for_presentation,
12
+ :load_actions_attributes,
13
+ :save_action,
14
+ :find_execution_plans,
15
+ :find_execution_plan_counts,
16
+ :delete_execution_plans,
17
+ :load_execution_plan,
18
+ :save_execution_plan,
19
+ :find_old_execution_plans,
20
+ :find_past_delayed_plans,
21
+ :delete_delayed_plans,
22
+ :save_delayed_plan,
23
+ :set_delayed_plan_frozen,
24
+ :load_delayed_plan,
25
+ :load_step,
26
+ :load_steps,
27
+ :save_step,
28
+ :push_envelope,
29
+ :pull_envelopes
30
+ ]
31
+
32
+ methods.each do |name|
33
+ define_method(name) do |*args|
34
+ Dynflow::Telemetry.measure(:dynflow_persistence, :method => name, :world => @world.id) { super *args }
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
41
+
42
+ ::Dynflow::Persistence.send(:prepend, ::Dynflow::Debug::Persistence)
@@ -84,6 +84,10 @@ module Dynflow
84
84
  @rescued_steps = {}
85
85
  end
86
86
 
87
+ def current_execution_plan_ids
88
+ @execution_plan_managers.keys
89
+ end
90
+
87
91
  def start_execution(execution_plan_id, finished)
88
92
  manager = track_execution_plan(execution_plan_id, finished)
89
93
  return [] unless manager
@@ -108,6 +112,17 @@ module Dynflow
108
112
  unless_done(manager, manager.what_is_next(work))
109
113
  end
110
114
 
115
+ # called when there was an unhandled exception during the execution
116
+ # of the work (such as persistence issue) - in this case we just clean up the
117
+ # runtime from the execution plan and let it go (common cause for this is the execution
118
+ # plan being removed from database by external user)
119
+ def work_failed(work)
120
+ if (manager = @execution_plan_managers[work.execution_plan_id])
121
+ manager.terminate
122
+ finish_manager(manager)
123
+ end
124
+ end
125
+
111
126
  def terminate
112
127
  unless @execution_plan_managers.empty?
113
128
  logger.error "... cleaning #{@execution_plan_managers.size} execution plans ..."
@@ -34,6 +34,7 @@ module Dynflow
34
34
  class DataConsistencyError < Dynflow::Error
35
35
  end
36
36
 
37
+ # any persistence errors
37
38
  class PersistenceError < Dynflow::Error
38
39
  def self.delegate(original_exception)
39
40
  self.new("caused by #{original_exception.class}: #{original_exception.message}").tap do |e|
@@ -41,5 +42,9 @@ module Dynflow
41
42
  end
42
43
  end
43
44
  end
45
+
46
+ # persistence errors that can't be recovered from, such as continuous connection issues
47
+ class FatalPersistenceError < PersistenceError
48
+ end
44
49
  end
45
50
  end
@@ -120,7 +120,12 @@ module Dynflow
120
120
  @ended_at = Time.now
121
121
  @real_time = @ended_at - @started_at unless @started_at.nil?
122
122
  @execution_time = compute_execution_time
123
- hooks_to_run << (failure? ? :failure : :success)
123
+ key = failure? ? :failure : :success
124
+ Dynflow::Telemetry.with_instance do |t|
125
+ t.increment_counter(:dynflow_finished_execution_plans, 1,
126
+ telemetry_common_options.merge(:result => key.to_s))
127
+ end
128
+ hooks_to_run << key
124
129
  unlock_all_singleton_locks!
125
130
  when :paused
126
131
  unlock_all_singleton_locks!
@@ -130,6 +135,8 @@ module Dynflow
130
135
  logger.debug format('%13s %s %9s >> %9s',
131
136
  'ExecutionPlan', id, original, state)
132
137
  self.save
138
+ toggle_telemetry_state original == :pending ? nil : original.to_s,
139
+ self.state == :stopped ? nil : self.state.to_s
133
140
  hooks_to_run.each { |kind| run_hooks kind }
134
141
  end
135
142
 
@@ -548,6 +555,21 @@ module Dynflow
548
555
  end
549
556
  end
550
557
 
558
+ def toggle_telemetry_state(original, new)
559
+ return if original == new
560
+ @label = root_plan_step.action_class if @label.nil?
561
+ Dynflow::Telemetry.with_instance do |t|
562
+ t.set_gauge(:dynflow_active_execution_plans, '-1',
563
+ telemetry_common_options.merge(:state => original)) unless original.nil?
564
+ t.set_gauge(:dynflow_active_execution_plans, '+1',
565
+ telemetry_common_options.merge(:state => new)) unless new.nil?
566
+ end
567
+ end
568
+
569
+ def telemetry_common_options
570
+ { :world => @world.id, :action => @label }
571
+ end
572
+
551
573
  private_class_method :steps_from_hash
552
574
  end
553
575
  # rubocop:enable Metrics/ClassLength
@@ -162,9 +162,11 @@ module Dynflow
162
162
  block.call
163
163
  ensure
164
164
  calculate_progress(action)
165
- @ended_at = Time.now
166
- @execution_time += @ended_at - start
165
+ @ended_at = Time.now
166
+ current_execution_time = @ended_at - start
167
+ @execution_time += current_execution_time
167
168
  @real_time = @ended_at - @started_at
169
+ update_step_telemetry(current_execution_time)
168
170
  end
169
171
 
170
172
  def calculate_progress(action)
@@ -174,6 +176,19 @@ module Dynflow
174
176
  @progress_done = 0
175
177
  end
176
178
  end
179
+
180
+ def update_step_telemetry(current_execution_time)
181
+ Dynflow::Telemetry.with_instance do |t|
182
+ if [:success, :skipped].include?(state)
183
+ t.observe_histogram(:dynflow_step_real_time,
184
+ real_time * 1000,
185
+ :action => action_class.to_s, :phase => phase.to_s_humanized)
186
+ end
187
+ t.observe_histogram(:dynflow_step_execution_time,
188
+ current_execution_time * 1000,
189
+ :action => action_class.to_s, :phase => phase.to_s_humanized)
190
+ end
191
+ end
177
192
  end
178
193
  end
179
194
  end
@@ -21,8 +21,8 @@ module Dynflow
21
21
  default_pool_size = @queues_options[:default][:pool_size]
22
22
  @queues_options.each do |(queue_name, queue_options)|
23
23
  queue_pool_size = queue_options.fetch(:pool_size, default_pool_size)
24
- @pools[queue_name] = Pool.spawn("pool #{queue_name}", reference,
25
- queue_name, queue_pool_size,
24
+ @pools[queue_name] = Pool.spawn("pool #{queue_name}", @world,
25
+ reference, queue_name, queue_pool_size,
26
26
  @world.transaction_adapter)
27
27
  end
28
28
  end
@@ -49,10 +49,14 @@ module Dynflow
49
49
  feed_pool(@director.work_finished(work))
50
50
  end
51
51
 
52
- def handle_persistence_error(error)
53
- logger.fatal "PersistenceError in executor: terminating"
54
- logger.fatal error
55
- @world.terminate
52
+ def handle_persistence_error(error, work = nil)
53
+ logger.error "PersistenceError in executor"
54
+ logger.error error
55
+ @director.work_failed(work) if work
56
+ if error.is_a? Errors::FatalPersistenceError
57
+ logger.fatal "Terminating"
58
+ @world.terminate
59
+ end
56
60
  end
57
61
 
58
62
  def start_termination(*args)
@@ -66,7 +70,7 @@ module Dynflow
66
70
  # we expect this message from all worker pools
67
71
  return unless @pools.empty?
68
72
  @director.terminate
69
- logger.error '... core terminated.'
73
+ logger.info '... Dynflow core terminated.'
70
74
  super()
71
75
  end
72
76
 
@@ -19,6 +19,10 @@ module Dynflow
19
19
  @jobs[execution_plan_id].shift.tap { delete execution_plan_id if @jobs[execution_plan_id].empty? }
20
20
  end
21
21
 
22
+ def queue_size
23
+ execution_status.values.reduce(0, :+)
24
+ end
25
+
22
26
  def empty?
23
27
  @jobs.empty?
24
28
  end
@@ -46,27 +50,35 @@ module Dynflow
46
50
  end
47
51
  end
48
52
 
49
- def initialize(core, name, pool_size, transaction_adapter)
53
+ def initialize(world, core, name, pool_size, transaction_adapter)
54
+ @world = world
50
55
  @name = name
51
56
  @executor_core = core
52
57
  @pool_size = pool_size
53
- @free_workers = Array.new(pool_size) { |i| Worker.spawn("worker-#{i}", reference, transaction_adapter) }
54
58
  @jobs = JobStorage.new
59
+ @free_workers = Array.new(pool_size) do |i|
60
+ name = "worker-#{i}"
61
+ Worker.spawn(name, reference, transaction_adapter, telemetry_options.merge(:worker => name))
62
+ end
55
63
  end
56
64
 
57
65
  def schedule_work(work)
58
66
  @jobs.add work
59
67
  distribute_jobs
68
+ update_telemetry
60
69
  end
61
70
 
62
71
  def worker_done(worker, work)
63
72
  @executor_core.tell([:work_finished, work])
64
73
  @free_workers << worker
74
+ Dynflow::Telemetry.with_instance { |t| t.set_gauge(:dynflow_active_workers, -1, telemetry_options) }
65
75
  distribute_jobs
66
76
  end
67
77
 
68
- def handle_persistence_error(error)
69
- @executor_core.tell([:handle_persistence_error, error])
78
+ def handle_persistence_error(worker, error, work = nil)
79
+ @executor_core.tell([:handle_persistence_error, error, work])
80
+ @free_workers << worker
81
+ distribute_jobs
70
82
  end
71
83
 
72
84
  def start_termination(*args)
@@ -92,7 +104,19 @@ module Dynflow
92
104
 
93
105
  def distribute_jobs
94
106
  try_to_terminate
95
- @free_workers.pop << @jobs.pop until @free_workers.empty? || @jobs.empty?
107
+ until @free_workers.empty? || @jobs.empty?
108
+ Dynflow::Telemetry.with_instance { |t| t.set_gauge(:dynflow_active_workers, '+1', telemetry_options) }
109
+ @free_workers.pop << @jobs.pop
110
+ update_telemetry
111
+ end
112
+ end
113
+
114
+ def telemetry_options
115
+ { :queue => @name.to_s, :world => @world.id }
116
+ end
117
+
118
+ def update_telemetry
119
+ Dynflow::Telemetry.with_instance { |t| t.set_gauge(:dynflow_queue_size, @jobs.queue_size, telemetry_options) }
96
120
  end
97
121
  end
98
122
  end
@@ -2,19 +2,23 @@ module Dynflow
2
2
  module Executors
3
3
  class Parallel < Abstract
4
4
  class Worker < Actor
5
- def initialize(pool, transaction_adapter)
5
+ def initialize(pool, transaction_adapter, telemetry_options = {})
6
6
  @pool = Type! pool, Concurrent::Actor::Reference
7
7
  @transaction_adapter = Type! transaction_adapter, TransactionAdapters::Abstract
8
+ @telemetry_options = telemetry_options
8
9
  end
9
10
 
10
11
  def on_message(work_item)
12
+ already_responded = false
11
13
  Executors.run_user_code do
12
14
  work_item.execute
13
15
  end
14
16
  rescue Errors::PersistenceError => e
15
- @pool.tell([:handle_persistence_error, e])
17
+ @pool.tell([:handle_persistence_error, reference, e, work_item])
18
+ already_responded = true
16
19
  ensure
17
- @pool.tell([:worker_done, reference, work_item])
20
+ Dynflow::Telemetry.with_instance { |t| t.increment_counter(:dynflow_worker_events, 1, @telemetry_options) }
21
+ @pool.tell([:worker_done, reference, work_item]) unless already_responded
18
22
  end
19
23
  end
20
24
  end
@@ -435,19 +435,19 @@ module Dynflow
435
435
  attempts = 0
436
436
  begin
437
437
  yield
438
- rescue ::Sequel::UniqueConstraintViolation => e
439
- raise e
440
- rescue Exception => e
438
+ rescue ::Sequel::DatabaseConnectionError, ::Sequel::DatabaseDisconnectError => e
441
439
  attempts += 1
442
440
  log(:error, e)
443
441
  if attempts > MAX_RETRIES
444
442
  log(:error, "The number of MAX_RETRIES exceeded")
445
- raise Errors::PersistenceError.delegate(e)
443
+ raise Errors::FatalPersistenceError.delegate(e)
446
444
  else
447
445
  log(:error, "Persistence retry no. #{attempts}")
448
446
  sleep RETRY_DELAY
449
447
  retry
450
448
  end
449
+ rescue Exception => e
450
+ raise Errors::PersistenceError.delegate(e)
451
451
  end
452
452
  end
453
453
 
@@ -36,10 +36,10 @@ module Dynflow
36
36
  end
37
37
  init_world.tap do |world|
38
38
  @world = world
39
-
39
+ config.run_on_init_hooks(false, world)
40
40
  unless config.remote?
41
41
  config.increase_db_pool_size(world)
42
- config.run_on_init_hooks(world)
42
+ config.run_on_init_hooks(true, world)
43
43
  # leave this just for long-running executors
44
44
  unless config.rake_task_with_executor?
45
45
  invalidated_worlds = world.perform_validity_checks
@@ -39,7 +39,8 @@ module Dynflow
39
39
  self.lazy_initialization = !::Rails.env.production?
40
40
  self.rake_tasks_with_executor = %w(db:migrate db:seed)
41
41
 
42
- @on_init = []
42
+ @on_init = []
43
+ @on_executor_init = []
43
44
  end
44
45
 
45
46
  # Action related info such as exceptions raised inside the actions' methods
@@ -54,12 +55,14 @@ module Dynflow
54
55
  ::Rails.logger
55
56
  end
56
57
 
57
- def on_init(&block)
58
- @on_init << block
58
+ def on_init(executor = true, &block)
59
+ destination = executor ? @on_executor_init : @on_init
60
+ destination << block
59
61
  end
60
62
 
61
- def run_on_init_hooks(world)
62
- @on_init.each { |init| init.call(world) }
63
+ def run_on_init_hooks(executor, world)
64
+ source = executor ? @on_executor_init : @on_init
65
+ source.each { |init| init.call(world) }
63
66
  end
64
67
 
65
68
  def initialize_world(world_class = ::Dynflow::World)
@@ -114,7 +117,7 @@ module Dynflow
114
117
  @world_config ||= ::Dynflow::Config.new.tap do |config|
115
118
  config.auto_rescue = true
116
119
  config.logger_adapter = ::Dynflow::LoggerAdapters::Delegator.new(action_logger, dynflow_logger)
117
- config.pool_size = 5
120
+ config.pool_size = self.pool_size
118
121
  config.persistence_adapter = ->(world, _) { initialize_persistence(world) }
119
122
  config.transaction_adapter = transaction_adapter
120
123
  config.executor = ->(world, _) { initialize_executor(world) }
@@ -0,0 +1,65 @@
1
+ require 'dynflow/telemetry_adapters/abstract'
2
+ require 'dynflow/telemetry_adapters/dummy'
3
+ require 'dynflow/telemetry_adapters/statsd'
4
+
5
+ module Dynflow
6
+ class Telemetry
7
+ class << self
8
+ attr_reader :instance
9
+
10
+ # Configures the adapter to use for telemetry
11
+ #
12
+ # @param [TelemetryAdapters::Abstract] adapter the adapter to use
13
+ def set_adapter(adapter)
14
+ @instance = adapter
15
+ end
16
+
17
+ # Passes the block into the current telemetry adapter's
18
+ # {TelemetryAdapters::Abstract#with_instance} method
19
+ def with_instance(&block)
20
+ @instance.with_instance &block
21
+ end
22
+
23
+ def measure(name, tags = {}, &block)
24
+ @instance.measure name, tags, &block
25
+ end
26
+
27
+ # Registers the metrics to be collected
28
+ # @return [void]
29
+ def register_metrics!
30
+ return if @registered
31
+ @registered = true
32
+ with_instance do |t|
33
+ # Worker related
34
+ t.add_gauge :dynflow_active_workers, 'The number of currently busy workers',
35
+ [:queue, :world]
36
+ t.add_counter :dynflow_worker_events, 'The number of processed events',
37
+ [:queue, :world, :worker]
38
+
39
+ # Execution plan related
40
+ t.add_gauge :dynflow_active_execution_plans, 'The number of active execution plans',
41
+ [:action, :world, :state]
42
+ t.add_gauge :dynflow_queue_size, 'Number of items in queue',
43
+ [:queue, :world]
44
+ t.add_counter :dynflow_finished_execution_plans, 'The number of execution plans',
45
+ [:action, :world, :result]
46
+
47
+ # Step related
48
+ # TODO: Configure buckets in a sane manner
49
+ t.add_histogram :dynflow_step_real_time, 'The time between the start end end of the step',
50
+ [:action, :phase]
51
+ t.add_histogram :dynflow_step_execution_time, 'The time spent executing a step',
52
+ [:action, :phase]
53
+
54
+ # Connector related
55
+ t.add_counter :dynflow_connector_envelopes, 'The number of envelopes handled by a connector',
56
+ [:world, :direction]
57
+
58
+ # Persistence related
59
+ t.add_histogram :dynflow_persistence, 'The time spent communicating with the database',
60
+ [:world, :method]
61
+ end
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,80 @@
1
+ module Dynflow
2
+ module TelemetryAdapters
3
+ class Abstract
4
+ # Default buckets to use when defining a histogram
5
+ DEFAULT_BUCKETS = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200].freeze
6
+
7
+ # Configures a counter to be collected
8
+ #
9
+ # @param [String] name Name of the counter
10
+ # @param [String] description Human-readable description of the counter
11
+ # @param [Array<String>] instance_labels Labels which will be assigned to the collected data
12
+ # @return [void]
13
+ def add_counter(name, description, instance_labels = [])
14
+ end
15
+
16
+ # Configures a gauge to be collected
17
+ #
18
+ # @param [String] name Name of the gauge
19
+ # @param [String] description Human-readable description of the gauge
20
+ # @param [Array<String>] instance_labels Labels which will be assigned to the collected data
21
+ # @return [void]
22
+ def add_gauge(name, description, instance_labels = [])
23
+ end
24
+
25
+ # Configures a histogram to be collected
26
+ #
27
+ # @param [String] name Name of the histogram
28
+ # @param [String] description Human-readable description of the histogram
29
+ # @param [Array<String>] instance_labels Labels which will be assigned to the collected data
30
+ # @param [Array<Integer>] buckets Buckets to fit the value into
31
+ # @return [void]
32
+ def add_histogram(name, description, instance_labels = [], buckets = DEFAULT_BUCKETS)
33
+ end
34
+
35
+ # Increments a counter
36
+ #
37
+ # @param [String,Symbol] name Name of the counter to increment
38
+ # @param [Integer] value Step to increment by
39
+ # @param [Hash{Symbol=>String}] tags Tags to apply to this record
40
+ # @return [void]
41
+ def increment_counter(name, value = 1, tags = {})
42
+ end
43
+
44
+ # Modifies a gauge
45
+ #
46
+ # @param [String,Symbol] name Name of the gauge to increment
47
+ # @param [String,Integer] value Step to change by
48
+ # @param [Hash{Symbol=>String}] tags Tags to apply to this record
49
+ # @return [void]
50
+ def set_gauge(name, value, tags = {})
51
+ end
52
+
53
+ # Records a histogram entry
54
+ #
55
+ # @param [String,Symbol] name Name of the histogram
56
+ # @param [String,Integer] value Value to record
57
+ # @param [Hash{Symbol=>String}] tags Tags to apply to this record
58
+ # @return [void]
59
+ def observe_histogram(name, value, tags = {})
60
+ end
61
+
62
+ # Passes self into the block and evaluates it
63
+ #
64
+ # @yieldparam [Abstract] adapter the current telemetry adapter
65
+ # @return [void]
66
+ def with_instance
67
+ yield self if block_given?
68
+ end
69
+
70
+ def measure(name, tags = {})
71
+ before = Process.clock_gettime(Process::CLOCK_MONOTONIC)
72
+ yield
73
+ ensure
74
+ after = Process.clock_gettime(Process::CLOCK_MONOTONIC)
75
+ duration = (after - before) * 1000 # In milliseconds
76
+ observe_histogram(name, duration, tags)
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,18 @@
1
+ module Dynflow
2
+ module TelemetryAdapters
3
+ # Telemetry adapter which does not evaluate blocks passed to {#with_instance}.
4
+ class Dummy < Abstract
5
+ # Does nothing with the block passed to it
6
+ #
7
+ # @return void
8
+ def with_instance
9
+ # Do nothing
10
+ end
11
+
12
+ def measure(_name, _tags = {})
13
+ # Just call the block
14
+ yield
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,48 @@
1
+ module Dynflow
2
+ module TelemetryAdapters
3
+ class StatsD < Abstract
4
+ def initialize(host = '127.0.0.1:8125')
5
+ require 'statsd-instrument'
6
+
7
+ @instances = {}
8
+ @host = host
9
+ ::StatsD.backend = ::StatsD::Instrument::Backends::UDPBackend.new(host, :statsd)
10
+ end
11
+
12
+ def add_counter(name, description, instance_labels)
13
+ raise "Metric already registered: #{name}" if @instances[name]
14
+ @instances[name] = instance_labels
15
+ end
16
+
17
+ def add_gauge(name, description, instance_labels)
18
+ raise "Metric already registered: #{name}" if @instances[name]
19
+ @instances[name] = instance_labels
20
+ end
21
+
22
+ def add_histogram(name, description, instance_labels, buckets = DEFAULT_BUCKETS)
23
+ raise "Metric already registered: #{name}" if @instances[name]
24
+ @instances[name] = instance_labels
25
+ end
26
+
27
+ def increment_counter(name, value, tags)
28
+ ::StatsD.increment(name_tag_mapping(name, tags), value)
29
+ end
30
+
31
+ def set_gauge(name, value, tags)
32
+ ::StatsD.gauge(name_tag_mapping(name, tags), value)
33
+ end
34
+
35
+ def observe_histogram(name, value, tags)
36
+ ::StatsD.measure(name_tag_mapping(name, tags), value)
37
+ end
38
+
39
+ private
40
+
41
+ def name_tag_mapping(name, tags)
42
+ instances = @instances[name]
43
+ return name if instances.nil? || instances.empty?
44
+ (name.to_s + '.' + instances.map {|x| tags[x]}.compact.join('.')).tr('-:/ ', '____')
45
+ end
46
+ end
47
+ end
48
+ end
@@ -1,3 +1,3 @@
1
1
  module Dynflow
2
- VERSION = '1.1.0'.freeze
2
+ VERSION = '1.1.1'.freeze
3
3
  end
@@ -14,9 +14,14 @@ module Dynflow
14
14
  :termination_timeout, :terminated, :dead_letter_handler, :execution_plan_cleaner
15
15
 
16
16
  def initialize(config)
17
+ @config = Config::ForWorld.new(config, self)
18
+
19
+ # Set the telemetry instance as soon as possible
20
+ Dynflow::Telemetry.set_adapter @config.telemetry_adapter
21
+ Dynflow::Telemetry.register_metrics!
22
+
17
23
  @id = SecureRandom.uuid
18
24
  @clock = spawn_and_wait(Clock, 'clock')
19
- @config = Config::ForWorld.new(config, self)
20
25
  @logger_adapter = @config.logger_adapter
21
26
  @config.validate
22
27
  @transaction_adapter = @config.transaction_adapter
@@ -238,8 +243,49 @@ module Dynflow
238
243
  end
239
244
 
240
245
  def terminate(future = Concurrent.future)
246
+ start_termination.tangle(future)
247
+ future
248
+ end
249
+
250
+ def terminating?
251
+ defined?(@terminating)
252
+ end
253
+
254
+ # 24119 - ensure delayed executor is preserved after invalidation
255
+ # executes plans that are planned/paused and haven't reported any error yet (usually when no executor
256
+ # was available by the time of planning or terminating)
257
+ def auto_execute
258
+ coordinator.acquire(Coordinator::AutoExecuteLock.new(self)) do
259
+ planned_execution_plans =
260
+ self.persistence.find_execution_plans filters: { 'state' => %w(planned paused), 'result' => (ExecutionPlan.results - [:error]).map(&:to_s) }
261
+ planned_execution_plans.map do |ep|
262
+ if coordinator.find_locks(Dynflow::Coordinator::ExecutionLock.unique_filter(ep.id)).empty?
263
+ execute(ep.id)
264
+ end
265
+ end.compact
266
+ end
267
+ rescue Coordinator::LockError => e
268
+ logger.info "auto-executor lock already aquired: #{e.message}"
269
+ []
270
+ end
271
+
272
+ def try_spawn(what, lock_class = nil)
273
+ object = nil
274
+ return nil if !executor || (object = @config.public_send(what)).nil?
275
+
276
+ coordinator.acquire(lock_class.new(self)) if lock_class
277
+ object.spawn.wait
278
+ object
279
+ rescue Coordinator::LockError => e
280
+ nil
281
+ end
282
+
283
+ private
284
+
285
+ def start_termination
241
286
  @termination_barrier.synchronize do
242
- @terminating ||= Concurrent.future do
287
+ return @terminating if @terminating
288
+ termination_future ||= Concurrent.future do
243
289
  begin
244
290
  run_before_termination_hooks
245
291
 
@@ -252,7 +298,7 @@ module Dynflow
252
298
  throttle_limiter.terminate.wait(termination_timeout)
253
299
 
254
300
  if executor
255
- connector.stop_receiving_new_work(self)
301
+ connector.stop_receiving_new_work(self, termination_timeout)
256
302
 
257
303
  logger.info "start terminating executor..."
258
304
  executor.terminate.wait(termination_timeout)
@@ -269,7 +315,7 @@ module Dynflow
269
315
  client_dispatcher_terminated.wait(termination_timeout)
270
316
 
271
317
  logger.info "stop listening for new events..."
272
- connector.stop_listening(self)
318
+ connector.stop_listening(self, termination_timeout)
273
319
 
274
320
  if @clock
275
321
  logger.info "start terminating clock..."
@@ -282,49 +328,15 @@ module Dynflow
282
328
  rescue => e
283
329
  logger.fatal(e)
284
330
  end
331
+ end
332
+ @terminating = Concurrent.future do
333
+ termination_future.wait(termination_timeout)
285
334
  end.on_completion do
286
335
  Thread.new { Kernel.exit } if @exit_on_terminate.true?
287
336
  end
288
337
  end
289
-
290
- @terminating.tangle(future)
291
- future
292
- end
293
-
294
- def terminating?
295
- defined?(@terminating)
296
- end
297
-
298
- # 24119 - ensure delayed executor is preserved after invalidation
299
- # executes plans that are planned/paused and haven't reported any error yet (usually when no executor
300
- # was available by the time of planning or terminating)
301
- def auto_execute
302
- coordinator.acquire(Coordinator::AutoExecuteLock.new(self)) do
303
- planned_execution_plans =
304
- self.persistence.find_execution_plans filters: { 'state' => %w(planned paused), 'result' => (ExecutionPlan.results - [:error]).map(&:to_s) }
305
- planned_execution_plans.map do |ep|
306
- if coordinator.find_locks(Dynflow::Coordinator::ExecutionLock.unique_filter(ep.id)).empty?
307
- execute(ep.id)
308
- end
309
- end.compact
310
- end
311
- rescue Coordinator::LockError => e
312
- logger.info "auto-executor lock already aquired: #{e.message}"
313
- []
314
- end
315
-
316
- def try_spawn(what, lock_class = nil)
317
- object = nil
318
- return nil if !executor || (object = @config.public_send(what)).nil?
319
-
320
- coordinator.acquire(lock_class.new(self)) if lock_class
321
- object.spawn.wait
322
- object
323
- rescue Coordinator::LockError => e
324
- nil
325
338
  end
326
339
 
327
- private
328
340
  def calculate_subscription_index
329
341
  @subscription_index =
330
342
  action_classes.each_with_object(Hash.new { |h, k| h[k] = [] }) do |klass, index|
@@ -337,11 +349,14 @@ module Dynflow
337
349
 
338
350
  def run_before_termination_hooks
339
351
  until @before_termination_hooks.empty?
340
- begin
341
- @before_termination_hooks.pop.call
342
- rescue => e
343
- logger.error e
352
+ hook_run = Concurrent.future do
353
+ begin
354
+ @before_termination_hooks.pop.call
355
+ rescue => e
356
+ logger.error e
357
+ end
344
358
  end
359
+ logger.error "timeout running before_termination_hook" unless hook_run.wait(termination_timeout)
345
360
  end
346
361
  end
347
362
 
@@ -94,6 +94,7 @@ module Dynflow
94
94
  end
95
95
 
96
96
  describe "when being executed" do
97
+ include TestHelpers
97
98
 
98
99
  let :execution_plan do
99
100
  world.plan(Support::CodeWorkflowExample::IncomingIssue, { 'text' => 'get a break' })
@@ -128,6 +129,16 @@ module Dynflow
128
129
  assert_raises(Dynflow::Error) { world.execute(execution_plan.id).value! }
129
130
  end
130
131
  end
132
+
133
+ it "handles when the execution plan is deleted" do
134
+ TestPause.when_paused do
135
+ world.persistence.delete_execution_plans(uuid: [execution_plan.id])
136
+ end
137
+ director = get_director(world)
138
+ wait_for('execution plan removed from executor') do
139
+ !director.current_execution_plan_ids.include?(execution_plan.id)
140
+ end
141
+ end
131
142
  end
132
143
  end
133
144
 
@@ -162,15 +162,21 @@ module TestHelpers
162
162
  end
163
163
  end
164
164
 
165
+ # get director for deeper investigation of the current execution state
166
+ def get_director(world)
167
+ core_context = world.executor.instance_variable_get('@core').instance_variable_get('@core').context
168
+ core_context.instance_variable_get('@director')
169
+ end
170
+
165
171
  # waits for the passed block to return non-nil value and reiterates it while getting false
166
172
  # (till some reasonable timeout). Useful for forcing the tests for some event to occur
167
- def wait_for
173
+ def wait_for(waiting_message = 'something to happen')
168
174
  30.times do
169
175
  ret = yield
170
176
  return ret if ret
171
177
  sleep 0.3
172
178
  end
173
- raise 'waiting for something to happen was not successful'
179
+ raise "waiting for #{waiting_message} was not successful"
174
180
  end
175
181
 
176
182
  def executor_id_for_plan(execution_plan_id)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dynflow
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ivan Necas
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-07-09 00:00:00.000000000 Z
12
+ date: 2018-10-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: multi_json
@@ -379,6 +379,7 @@ files:
379
379
  - examples/singletons.rb
380
380
  - examples/sub_plan_concurrency_control.rb
381
381
  - examples/sub_plans.rb
382
+ - extras/statsd_mapping.conf
382
383
  - lib/dynflow.rb
383
384
  - lib/dynflow/action.rb
384
385
  - lib/dynflow/action/cancellable.rb
@@ -408,6 +409,7 @@ files:
408
409
  - lib/dynflow/coordinator_adapters/abstract.rb
409
410
  - lib/dynflow/coordinator_adapters/sequel.rb
410
411
  - lib/dynflow/dead_letter_silencer.rb
412
+ - lib/dynflow/debug/telemetry/persistence.rb
411
413
  - lib/dynflow/delayed_executors.rb
412
414
  - lib/dynflow/delayed_executors/abstract.rb
413
415
  - lib/dynflow/delayed_executors/abstract_core.rb
@@ -500,6 +502,9 @@ files:
500
502
  - lib/dynflow/serializers/noop.rb
501
503
  - lib/dynflow/stateful.rb
502
504
  - lib/dynflow/telemetry.rb
505
+ - lib/dynflow/telemetry_adapters/abstract.rb
506
+ - lib/dynflow/telemetry_adapters/dummy.rb
507
+ - lib/dynflow/telemetry_adapters/statsd.rb
503
508
  - lib/dynflow/testing.rb
504
509
  - lib/dynflow/testing/assertions.rb
505
510
  - lib/dynflow/testing/dummy_execution_plan.rb