dynflow 1.1.0 → 1.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 6ee35eec200e14b25add8941b4b4637e994012053a271898c4abc4a70234942e
- data.tar.gz: 720fc9161e5aadff8f165c12f6bb278cfa65435ef7a1d96d93df8614d21a1de8
+ metadata.gz: '0678aaa84d4b56ba08bbd68a6e034076f22b9f44bf85110f34a9150403490243'
+ data.tar.gz: c91ee787f316b332f8e802557ed31aa6490db2b62d82b595e3397549f5596aef
  SHA512:
- metadata.gz: 1753d21be5307643a16704a27b0e343fe55a7e74330469d1eece503d58cfa9fd04be1da57918e6e09e73f7e2822f2be2bd517e9eea58b89432bee7226d51ab93
- data.tar.gz: ac12ce027be3e289227db4a2b5311328342de364c0f5d51f5badc27e7ba484a6f137488ce681d319e050b1a04ef55ac4821d0d87a06d6d70019f3b2c6e889fca
+ metadata.gz: '003039069ca213958db31c0dcb6d158c8ab675182881ac4f4edfbafbbec826ca36fe51ce606bbd2621226851ac72bde934995d4aae36a6da43410ee573917e27'
+ data.tar.gz: 5321596fe890b8398f3e207c39991a72febf76f6238bc4c5c91df1634e66ded7149574624846fead73d52270a089a429928bbd8b3d4f6f33318ad6c880180f95
@@ -20,3 +20,7 @@ Style/MultilineOperationIndentation:
  # Cop supports --auto-correct.
  Style/EmptyLines:
  Enabled: true
+
+ Metrics/ModuleLength:
+ Exclude:
+ - test/**/*
data/Gemfile CHANGED
@@ -41,3 +41,7 @@ group :rails do
  gem 'rails', '>= 4.2.9'
  gem 'logging'
  end
+
+ group :telemetry do
+ gem 'statsd-instrument'
+ end
@@ -17,6 +17,7 @@ class ExampleHelper
  config.persistence_adapter = persistence_adapter
  config.logger_adapter = logger_adapter
  config.auto_rescue = false
+ config.telemetry_adapter = telemetry_adapter
  yield config if block_given?
  Dynflow::World.new(config).tap do |world|
  puts "World #{world.id} started..."
@@ -27,6 +28,14 @@ class ExampleHelper
  ENV['DB_CONN_STRING'] || 'sqlite:/'
  end

+ def telemetry_adapter
+ if (host = ENV['TELEMETRY_STATSD_HOST'])
+ Dynflow::TelemetryAdapters::StatsD.new host
+ else
+ Dynflow::TelemetryAdapters::Dummy.new
+ end
+ end
+
  def persistence_adapter
  Dynflow::PersistenceAdapters::Sequel.new persistence_conn_string
  end
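A usage note (not part of the diff): with the helper above, the bundled examples would presumably report telemetry only when TELEMETRY_STATSD_HOST is set; the address and the ExampleHelper entry point below are assumptions, only the env var name and adapter classes come from the diff.

# Hypothetical sketch of enabling StatsD telemetry in the examples.
ENV['TELEMETRY_STATSD_HOST'] = '127.0.0.1:8125'  # assumed address
world = ExampleHelper.create_world               # assumed helper entry point
Dynflow::Telemetry.instance                      # => a Dynflow::TelemetryAdapters::StatsD instance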
@@ -0,0 +1,65 @@
+ ---
+ mappings:
+ - name: dynflow_active_execution_plans
+ match: dynflow_active_execution_plans.*.*.*
+ labels:
+ action: "$1"
+ world: "$2"
+ state: "$3"
+ help: The number of active execution plans
+ - name: dynflow_active_workers
+ match: dynflow_active_workers.*.*
+ labels:
+ queue: "$1"
+ world: "$2"
+ help: The number of currently busy workers
+ - name: dynflow_queue_size
+ match: dynflow_queue_size.*.*
+ labels:
+ queue: "$1"
+ world: "$2"
+ help: The number of events in the queue
+ - name: dynflow_connector_envelopes
+ match: dynflow_connector_envelopes.*.*
+ labels:
+ world: "$1"
+ direction: "$2"
+ help: The number of envelopes handled by a connector
+ - name: dynflow_finished_execution_plans
+ match: dynflow_finished_execution_plans.*.*.*
+ labels:
+ action: "$1"
+ world: "$2"
+ result: "$3"
+ help: The number of execution plans
+ - name: dynflow_step_execution_time
+ match: dynflow_step_execution_time.*.*
+ labels:
+ action: "$1"
+ phase: "$2"
+ help: The time spent executing a step
+ buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200]
+ timer_type: histogram
+ - name: dynflow_step_real_time
+ match: dynflow_step_real_time.*.*
+ labels:
+ action: "$1"
+ phase: "$2"
+ help: The time between the start end end of the step
+ buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200]
+ timer_type: histogram
+ - name: dynflow_worker_events
+ match: dynflow_worker_events.*.*.*
+ labels:
+ queue: "$1"
+ world: "$2"
+ worker: "$3"
+ help: The number of processed events
+ - name: dynflow_persistence
+ match: dynflow_persistence.*.*
+ labels:
+ world: "$1"
+ method: "$2"
+ help: The time spent communicating with the database
+ buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200]
+ timer_type: histogram
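A note on intent (an assumption, not stated in the diff): this mapping file looks like it is meant for a StatsD-to-Prometheus style exporter, turning the dotted metric names the StatsD adapter emits back into labeled metrics. For example, a gauge update made through the new telemetry API:

# Sketch only; the tag values are illustrative.
Dynflow::Telemetry.with_instance do |t|
  t.set_gauge(:dynflow_queue_size, 10, :queue => 'default', :world => 'world-uuid')
end
# With the StatsD adapter this goes out as "dynflow_queue_size.default.world_uuid"
# (dashes are translated to underscores); the mapping above would relabel it as
# dynflow_queue_size{queue="default", world="world_uuid"}.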
@@ -50,6 +50,7 @@ module Dynflow
  require 'dynflow/delayed_executors'
  require 'dynflow/semaphores'
  require 'dynflow/throttle_limiter'
+ require 'dynflow/telemetry'
  require 'dynflow/config'

  if defined? ::ActiveJob
@@ -511,6 +511,7 @@ module Dynflow
  end
  end

+ # TODO: This is getting out of hand, refactoring needed
  def execute_run(event)
  phase! Run
  @world.logger.debug format('%13s %s:%2d got event %s',
@@ -544,7 +545,6 @@ module Dynflow

  check_serializable :output
  end
-
  else
  raise "wrong state #{state} when event:#{event}"
  end
@@ -45,6 +45,10 @@ module Dynflow
  def label
  input[:job_class]
  end
+
+ def rescue_strategy
+ Action::Rescue::Skip
+ end
  end
  end
  end
@@ -182,6 +182,10 @@ module Dynflow
  './backup'
  end

+ config_attr :telemetry_adapter, ::Dynflow::TelemetryAdapters::Abstract do |world|
+ ::Dynflow::TelemetryAdapters::Dummy.new
+ end
+
  def validate(config_for_world)
  if defined? ::ActiveRecord::Base
  begin
@@ -8,7 +8,11 @@ module Dynflow
  raise NotImplementedError
  end

- def stop_listening(world)
+ def stop_receiving_new_work(_, timeout = nil)
+ raise NotImplementedError
+ end
+
+ def stop_listening(world, timeout = nil)
  raise NotImplementedError
  end

@@ -24,6 +28,7 @@ module Dynflow
  # between words: we need to know the one to send the message to
  def receive(world, envelope)
  Type! envelope, Dispatcher::Envelope
+ Telemetry.with_instance { |t| t.increment_counter(:dynflow_connector_envelopes, 1, :world => world.id, :direction => 'incoming') }
  match(envelope.message,
  (on Dispatcher::Ping do
  response_envelope = envelope.build_response_envelope(Dispatcher::Pong, world)
@@ -159,15 +159,16 @@ module Dynflow
  @core.ask([:start_listening, world])
  end

- def stop_receiving_new_work(_)
- @core.ask(:stop_receiving_new_work).wait
+ def stop_receiving_new_work(_, timeout = nil)
+ @core.ask(:stop_receiving_new_work).wait(timeout)
  end

- def stop_listening(_)
- @core.ask(:stop_listening).then { @core.ask(:terminate!) }.wait
+ def stop_listening(_, timeout = nil)
+ @core.ask(:stop_listening).then { @core.ask(:terminate!) }.wait(timeout)
  end

  def send(envelope)
+ Telemetry.with_instance { |t| t.increment_counter(:dynflow_connector_envelopes, 1, :world => envelope.sender_id, :direction => 'outgoing') }
  @core.ask([:handle_envelope, envelope])
  end
  end
@@ -55,15 +55,16 @@ module Dynflow
  @core.ask([:start_listening, world])
  end

- def stop_receiving_new_work(world)
- @core.ask([:stop_receiving_new_work, world]).wait
+ def stop_receiving_new_work(world, timeout = nil)
+ @core.ask([:stop_receiving_new_work, world]).wait(timeout)
  end

- def stop_listening(world)
- @core.ask([:stop_listening, world]).wait
+ def stop_listening(world, timeout = nil)
+ @core.ask([:stop_listening, world]).wait(timeout)
  end

  def send(envelope)
+ Telemetry.with_instance { |t| t.increment_counter(:dynflow_connector_envelopes, 1, :world => envelope.sender_id) }
  @core.ask([:handle_envelope, envelope])
  end
  end
@@ -9,8 +9,12 @@ module Dynflow

  def create_record(record)
  @sequel_adapter.insert_coordinator_record(record.to_hash)
- rescue ::Sequel::UniqueConstraintViolation
- raise Coordinator::DuplicateRecordError.new(record)
+ rescue Errors::PersistenceError => e
+ if e.cause.is_a? ::Sequel::UniqueConstraintViolation
+ raise Coordinator::DuplicateRecordError.new(record)
+ else
+ raise e
+ end
  end

  def update_record(record)
@@ -0,0 +1,42 @@
+ module Dynflow
+ module Debug
+ module Telemetry
+ module Persistence
+ methods = [
+ :load_action,
+ :load_actions,
+ :load_action_for_presentation,
+ :load_action,
+ :load_actions,
+ :load_action_for_presentation,
+ :load_actions_attributes,
+ :save_action,
+ :find_execution_plans,
+ :find_execution_plan_counts,
+ :delete_execution_plans,
+ :load_execution_plan,
+ :save_execution_plan,
+ :find_old_execution_plans,
+ :find_past_delayed_plans,
+ :delete_delayed_plans,
+ :save_delayed_plan,
+ :set_delayed_plan_frozen,
+ :load_delayed_plan,
+ :load_step,
+ :load_steps,
+ :save_step,
+ :push_envelope,
+ :pull_envelopes
+ ]
+
+ methods.each do |name|
+ define_method(name) do |*args|
+ Dynflow::Telemetry.measure(:dynflow_persistence, :method => name, :world => @world.id) { super *args }
+ end
+ end
+ end
+ end
+ end
+ end
+
+ ::Dynflow::Persistence.send(:prepend, ::Dynflow::Debug::Persistence)
@@ -84,6 +84,10 @@ module Dynflow
  @rescued_steps = {}
  end

+ def current_execution_plan_ids
+ @execution_plan_managers.keys
+ end
+
  def start_execution(execution_plan_id, finished)
  manager = track_execution_plan(execution_plan_id, finished)
  return [] unless manager
@@ -108,6 +112,17 @@ module Dynflow
  unless_done(manager, manager.what_is_next(work))
  end

+ # called when there was an unhandled exception during the execution
+ # of the work (such as persistence issue) - in this case we just clean up the
+ # runtime from the execution plan and let it go (common cause for this is the execution
+ # plan being removed from database by external user)
+ def work_failed(work)
+ if (manager = @execution_plan_managers[work.execution_plan_id])
+ manager.terminate
+ finish_manager(manager)
+ end
+ end
+
  def terminate
  unless @execution_plan_managers.empty?
  logger.error "... cleaning #{@execution_plan_managers.size} execution plans ..."
@@ -34,6 +34,7 @@ module Dynflow
  class DataConsistencyError < Dynflow::Error
  end

+ # any persistence errors
  class PersistenceError < Dynflow::Error
  def self.delegate(original_exception)
  self.new("caused by #{original_exception.class}: #{original_exception.message}").tap do |e|
@@ -41,5 +42,9 @@ module Dynflow
  end
  end
  end
+
+ # persistence errors that can't be recovered from, such as continuous connection issues
+ class FatalPersistenceError < PersistenceError
+ end
  end
  end
@@ -120,7 +120,12 @@ module Dynflow
  @ended_at = Time.now
  @real_time = @ended_at - @started_at unless @started_at.nil?
  @execution_time = compute_execution_time
- hooks_to_run << (failure? ? :failure : :success)
+ key = failure? ? :failure : :success
+ Dynflow::Telemetry.with_instance do |t|
+ t.increment_counter(:dynflow_finished_execution_plans, 1,
+ telemetry_common_options.merge(:result => key.to_s))
+ end
+ hooks_to_run << key
  unlock_all_singleton_locks!
  when :paused
  unlock_all_singleton_locks!
@@ -130,6 +135,8 @@ module Dynflow
  logger.debug format('%13s %s %9s >> %9s',
  'ExecutionPlan', id, original, state)
  self.save
+ toggle_telemetry_state original == :pending ? nil : original.to_s,
+ self.state == :stopped ? nil : self.state.to_s
  hooks_to_run.each { |kind| run_hooks kind }
  end

@@ -548,6 +555,21 @@ module Dynflow
  end
  end

+ def toggle_telemetry_state(original, new)
+ return if original == new
+ @label = root_plan_step.action_class if @label.nil?
+ Dynflow::Telemetry.with_instance do |t|
+ t.set_gauge(:dynflow_active_execution_plans, '-1',
+ telemetry_common_options.merge(:state => original)) unless original.nil?
+ t.set_gauge(:dynflow_active_execution_plans, '+1',
+ telemetry_common_options.merge(:state => new)) unless new.nil?
+ end
+ end
+
+ def telemetry_common_options
+ { :world => @world.id, :action => @label }
+ end
+
  private_class_method :steps_from_hash
  end
  # rubocop:enable Metrics/ClassLength
@@ -162,9 +162,11 @@ module Dynflow
  block.call
  ensure
  calculate_progress(action)
- @ended_at = Time.now
- @execution_time += @ended_at - start
+ @ended_at = Time.now
+ current_execution_time = @ended_at - start
+ @execution_time += current_execution_time
  @real_time = @ended_at - @started_at
+ update_step_telemetry(current_execution_time)
  end

  def calculate_progress(action)
@@ -174,6 +176,19 @@ module Dynflow
  @progress_done = 0
  end
  end
+
+ def update_step_telemetry(current_execution_time)
+ Dynflow::Telemetry.with_instance do |t|
+ if [:success, :skipped].include?(state)
+ t.observe_histogram(:dynflow_step_real_time,
+ real_time * 1000,
+ :action => action_class.to_s, :phase => phase.to_s_humanized)
+ end
+ t.observe_histogram(:dynflow_step_execution_time,
+ current_execution_time * 1000,
+ :action => action_class.to_s, :phase => phase.to_s_humanized)
+ end
+ end
  end
  end
  end
@@ -21,8 +21,8 @@ module Dynflow
  default_pool_size = @queues_options[:default][:pool_size]
  @queues_options.each do |(queue_name, queue_options)|
  queue_pool_size = queue_options.fetch(:pool_size, default_pool_size)
- @pools[queue_name] = Pool.spawn("pool #{queue_name}", reference,
- queue_name, queue_pool_size,
+ @pools[queue_name] = Pool.spawn("pool #{queue_name}", @world,
+ reference, queue_name, queue_pool_size,
  @world.transaction_adapter)
  end
  end
@@ -49,10 +49,14 @@ module Dynflow
  feed_pool(@director.work_finished(work))
  end

- def handle_persistence_error(error)
- logger.fatal "PersistenceError in executor: terminating"
- logger.fatal error
- @world.terminate
+ def handle_persistence_error(error, work = nil)
+ logger.error "PersistenceError in executor"
+ logger.error error
+ @director.work_failed(work) if work
+ if error.is_a? Errors::FatalPersistenceError
+ logger.fatal "Terminating"
+ @world.terminate
+ end
  end

  def start_termination(*args)
@@ -66,7 +70,7 @@ module Dynflow
  # we expect this message from all worker pools
  return unless @pools.empty?
  @director.terminate
- logger.error '... core terminated.'
+ logger.info '... Dynflow core terminated.'
  super()
  end

@@ -19,6 +19,10 @@ module Dynflow
  @jobs[execution_plan_id].shift.tap { delete execution_plan_id if @jobs[execution_plan_id].empty? }
  end

+ def queue_size
+ execution_status.values.reduce(0, :+)
+ end
+
  def empty?
  @jobs.empty?
  end
@@ -46,27 +50,35 @@ module Dynflow
  end
  end

- def initialize(core, name, pool_size, transaction_adapter)
+ def initialize(world, core, name, pool_size, transaction_adapter)
+ @world = world
  @name = name
  @executor_core = core
  @pool_size = pool_size
- @free_workers = Array.new(pool_size) { |i| Worker.spawn("worker-#{i}", reference, transaction_adapter) }
  @jobs = JobStorage.new
+ @free_workers = Array.new(pool_size) do |i|
+ name = "worker-#{i}"
+ Worker.spawn(name, reference, transaction_adapter, telemetry_options.merge(:worker => name))
+ end
  end

  def schedule_work(work)
  @jobs.add work
  distribute_jobs
+ update_telemetry
  end

  def worker_done(worker, work)
  @executor_core.tell([:work_finished, work])
  @free_workers << worker
+ Dynflow::Telemetry.with_instance { |t| t.set_gauge(:dynflow_active_workers, -1, telemetry_options) }
  distribute_jobs
  end

- def handle_persistence_error(error)
- @executor_core.tell([:handle_persistence_error, error])
+ def handle_persistence_error(worker, error, work = nil)
+ @executor_core.tell([:handle_persistence_error, error, work])
+ @free_workers << worker
+ distribute_jobs
  end

  def start_termination(*args)
@@ -92,7 +104,19 @@ module Dynflow

  def distribute_jobs
  try_to_terminate
- @free_workers.pop << @jobs.pop until @free_workers.empty? || @jobs.empty?
+ until @free_workers.empty? || @jobs.empty?
+ Dynflow::Telemetry.with_instance { |t| t.set_gauge(:dynflow_active_workers, '+1', telemetry_options) }
+ @free_workers.pop << @jobs.pop
+ update_telemetry
+ end
+ end
+
+ def telemetry_options
+ { :queue => @name.to_s, :world => @world.id }
+ end
+
+ def update_telemetry
+ Dynflow::Telemetry.with_instance { |t| t.set_gauge(:dynflow_queue_size, @jobs.queue_size, telemetry_options) }
  end
  end
  end
@@ -2,19 +2,23 @@ module Dynflow
  module Executors
  class Parallel < Abstract
  class Worker < Actor
- def initialize(pool, transaction_adapter)
+ def initialize(pool, transaction_adapter, telemetry_options = {})
  @pool = Type! pool, Concurrent::Actor::Reference
  @transaction_adapter = Type! transaction_adapter, TransactionAdapters::Abstract
+ @telemetry_options = telemetry_options
  end

  def on_message(work_item)
+ already_responded = false
  Executors.run_user_code do
  work_item.execute
  end
  rescue Errors::PersistenceError => e
- @pool.tell([:handle_persistence_error, e])
+ @pool.tell([:handle_persistence_error, reference, e, work_item])
+ already_responded = true
  ensure
- @pool.tell([:worker_done, reference, work_item])
+ Dynflow::Telemetry.with_instance { |t| t.increment_counter(:dynflow_worker_events, 1, @telemetry_options) }
+ @pool.tell([:worker_done, reference, work_item]) unless already_responded
  end
  end
  end
@@ -435,19 +435,19 @@ module Dynflow
  attempts = 0
  begin
  yield
- rescue ::Sequel::UniqueConstraintViolation => e
- raise e
- rescue Exception => e
+ rescue ::Sequel::DatabaseConnectionError, ::Sequel::DatabaseDisconnectError => e
  attempts += 1
  log(:error, e)
  if attempts > MAX_RETRIES
  log(:error, "The number of MAX_RETRIES exceeded")
- raise Errors::PersistenceError.delegate(e)
+ raise Errors::FatalPersistenceError.delegate(e)
  else
  log(:error, "Persistence retry no. #{attempts}")
  sleep RETRY_DELAY
  retry
  end
+ rescue Exception => e
+ raise Errors::PersistenceError.delegate(e)
  end
  end

@@ -36,10 +36,10 @@ module Dynflow
  end
  init_world.tap do |world|
  @world = world
-
+ config.run_on_init_hooks(false, world)
  unless config.remote?
  config.increase_db_pool_size(world)
- config.run_on_init_hooks(world)
+ config.run_on_init_hooks(true, world)
  # leave this just for long-running executors
  unless config.rake_task_with_executor?
  invalidated_worlds = world.perform_validity_checks
@@ -39,7 +39,8 @@ module Dynflow
  self.lazy_initialization = !::Rails.env.production?
  self.rake_tasks_with_executor = %w(db:migrate db:seed)

- @on_init = []
+ @on_init = []
+ @on_executor_init = []
  end

  # Action related info such as exceptions raised inside the actions' methods
@@ -54,12 +55,14 @@ module Dynflow
  ::Rails.logger
  end

- def on_init(&block)
- @on_init << block
+ def on_init(executor = true, &block)
+ destination = executor ? @on_executor_init : @on_init
+ destination << block
  end

- def run_on_init_hooks(world)
- @on_init.each { |init| init.call(world) }
+ def run_on_init_hooks(executor, world)
+ source = executor ? @on_executor_init : @on_init
+ source.each { |init| init.call(world) }
  end

  def initialize_world(world_class = ::Dynflow::World)
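To make the new flag concrete (a sketch, not part of the diff): hooks registered with the default `executor = true` now run only where a local executor is initialized, while `on_init(false)` hooks run for every world; the configuration instance `cfg` below is hypothetical, only the on_init signature and the run_on_init_hooks call sites come from the diff.

# cfg stands for an instance of this configuration class.
cfg.on_init(false) { |world| puts "every world: #{world.id}" }
cfg.on_init        { |world| puts "executor worlds only: #{world.id}" }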
@@ -114,7 +117,7 @@ module Dynflow
  @world_config ||= ::Dynflow::Config.new.tap do |config|
  config.auto_rescue = true
  config.logger_adapter = ::Dynflow::LoggerAdapters::Delegator.new(action_logger, dynflow_logger)
- config.pool_size = 5
+ config.pool_size = self.pool_size
  config.persistence_adapter = ->(world, _) { initialize_persistence(world) }
  config.transaction_adapter = transaction_adapter
  config.executor = ->(world, _) { initialize_executor(world) }
@@ -0,0 +1,65 @@
+ require 'dynflow/telemetry_adapters/abstract'
+ require 'dynflow/telemetry_adapters/dummy'
+ require 'dynflow/telemetry_adapters/statsd'
+
+ module Dynflow
+ class Telemetry
+ class << self
+ attr_reader :instance
+
+ # Configures the adapter to use for telemetry
+ #
+ # @param [TelemetryAdapters::Abstract] adapter the adapter to use
+ def set_adapter(adapter)
+ @instance = adapter
+ end
+
+ # Passes the block into the current telemetry adapter's
+ # {TelemetryAdapters::Abstract#with_instance} method
+ def with_instance(&block)
+ @instance.with_instance &block
+ end
+
+ def measure(name, tags = {}, &block)
+ @instance.measure name, tags, &block
+ end
+
+ # Registers the metrics to be collected
+ # @return [void]
+ def register_metrics!
+ return if @registered
+ @registered = true
+ with_instance do |t|
+ # Worker related
+ t.add_gauge :dynflow_active_workers, 'The number of currently busy workers',
+ [:queue, :world]
+ t.add_counter :dynflow_worker_events, 'The number of processed events',
+ [:queue, :world, :worker]
+
+ # Execution plan related
+ t.add_gauge :dynflow_active_execution_plans, 'The number of active execution plans',
+ [:action, :world, :state]
+ t.add_gauge :dynflow_queue_size, 'Number of items in queue',
+ [:queue, :world]
+ t.add_counter :dynflow_finished_execution_plans, 'The number of execution plans',
+ [:action, :world, :result]
+
+ # Step related
+ # TODO: Configure buckets in a sane manner
+ t.add_histogram :dynflow_step_real_time, 'The time between the start end end of the step',
+ [:action, :phase]
+ t.add_histogram :dynflow_step_execution_time, 'The time spent executing a step',
+ [:action, :phase]
+
+ # Connector related
+ t.add_counter :dynflow_connector_envelopes, 'The number of envelopes handled by a connector',
+ [:world, :direction]
+
+ # Persistence related
+ t.add_histogram :dynflow_persistence, 'The time spent communicating with the database',
+ [:world, :method]
+ end
+ end
+ end
+ end
+ end
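A rough usage sketch (not part of the diff): World#initialize further down does the equivalent of the first two calls as soon as its config is built, so applications normally only interact with with_instance and measure. The world id and tag values below are illustrative.

Dynflow::Telemetry.set_adapter(Dynflow::TelemetryAdapters::Dummy.new)
Dynflow::Telemetry.register_metrics!

# Record a counter; the Dummy adapter never yields the block, so this is a no-op
# until a real adapter (e.g. StatsD) is configured.
Dynflow::Telemetry.with_instance do |t|
  t.increment_counter(:dynflow_worker_events, 1, :queue => 'default', :world => 'some-world-id', :worker => 'worker-0')
end

# Time a block and report it to the :dynflow_persistence histogram.
Dynflow::Telemetry.measure(:dynflow_persistence, :method => :save_step, :world => 'some-world-id') do
  # work to be timed; with the Dummy adapter this is a plain yield
end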
@@ -0,0 +1,80 @@
+ module Dynflow
+ module TelemetryAdapters
+ class Abstract
+ # Default buckets to use when defining a histogram
+ DEFAULT_BUCKETS = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 120, 300, 600, 1200].freeze
+
+ # Configures a counter to be collected
+ #
+ # @param [String] name Name of the counter
+ # @param [String] description Human-readable description of the counter
+ # @param [Array<String>] instance_labels Labels which will be assigned to the collected data
+ # @return [void]
+ def add_counter(name, description, instance_labels = [])
+ end
+
+ # Configures a gauge to be collected
+ #
+ # @param [String] name Name of the gauge
+ # @param [String] description Human-readable description of the gauge
+ # @param [Array<String>] instance_labels Labels which will be assigned to the collected data
+ # @return [void]
+ def add_gauge(name, description, instance_labels = [])
+ end
+
+ # Configures a histogram to be collected
+ #
+ # @param [String] name Name of the histogram
+ # @param [String] description Human-readable description of the histogram
+ # @param [Array<String>] instance_labels Labels which will be assigned to the collected data
+ # @param [Array<Integer>] buckest Buckets to fit the value into
+ # @return [void]
+ def add_histogram(name, description, instance_labels = [], buckets = DEFAULT_BUCKETS)
+ end
+
+ # Increments a counter
+ #
+ # @param [String,Symbol] name Name of the counter to increment
+ # @param [Integer] value Step to increment by
+ # @param [Hash{Symbol=>String}] tags Tags to apply to this record
+ # @return [void]
+ def increment_counter(name, value = 1, tags = {})
+ end
+
+ # Modifies a gauge
+ #
+ # @param [String,Symbol] name Name of the gauge to increment
+ # @param [String,Integer] value Step to change by
+ # @param [Hash{Symbol=>String}] tags Tags to apply to this record
+ # @return [void]
+ def set_gauge(name, value, tags = {})
+ end
+
+ # Records a histogram entry
+ #
+ # @param [String,Symbol] name Name of the histogram
+ # @param [String,Integer] value Value to record
+ # @param [Hash{Symbol=>String}] tags Tags to apply to this record
+ # @return [void]
+ def observe_histogram(name, value, tags = {})
+ end
+
+ # Passes self into the block and evaulates it
+ #
+ # @yieldparam [Abstract] adapter the current telemetry adapter
+ # @return [void]
+ def with_instance
+ yield self if block_given?
+ end
+
+ def measure(name, tags = {})
+ before = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+ yield
+ ensure
+ after = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+ duration = (after - before) * 1000 # In miliseconds
+ observe_histogram(name, duration, tags)
+ end
+ end
+ end
+ end
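The base class is deliberately a set of no-op registration and reporting methods plus the timing helper, so a custom backend only has to override the reporting methods; a minimal hand-rolled adapter might look like this (an illustration, not part of the gem, only the method signatures are taken from the Abstract class above).

# Hypothetical adapter that just logs metrics to stdout.
class LoggingTelemetryAdapter < Dynflow::TelemetryAdapters::Abstract
  def increment_counter(name, value = 1, tags = {})
    puts "counter   #{name} +#{value} #{tags.inspect}"
  end

  def set_gauge(name, value, tags = {})
    puts "gauge     #{name} #{value} #{tags.inspect}"
  end

  def observe_histogram(name, value, tags = {})
    puts "histogram #{name} #{value} #{tags.inspect}"
  end
end

Because it inherits with_instance and measure from Abstract, such an adapter gets block evaluation and timing for free.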
@@ -0,0 +1,18 @@
+ module Dynflow
+ module TelemetryAdapters
+ # Telemetry adapter which does not evaluate blocks passed to {#with_instance}.
+ class Dummy < Abstract
+ # Does nothing with the block passed to it
+ #
+ # @return void
+ def with_instance
+ # Do nothing
+ end
+
+ def measure(_name, _tags = {})
+ # Just call the block
+ yield
+ end
+ end
+ end
+ end
@@ -0,0 +1,48 @@
+ module Dynflow
+ module TelemetryAdapters
+ class StatsD < Abstract
+ def initialize(host = '127.0.0.1:8125')
+ require 'statsd-instrument'
+
+ @instances = {}
+ @host = host
+ ::StatsD.backend = ::StatsD::Instrument::Backends::UDPBackend.new(host, :statsd)
+ end
+
+ def add_counter(name, description, instance_labels)
+ raise "Metric already registered: #{name}" if @instances[name]
+ @instances[name] = instance_labels
+ end
+
+ def add_gauge(name, description, instance_labels)
+ raise "Metric already registered: #{name}" if @instances[name]
+ @instances[name] = instance_labels
+ end
+
+ def add_histogram(name, description, instance_labels, buckets = DEFAULT_BUCKETS)
+ raise "Metric already registered: #{name}" if @instances[name]
+ @instances[name] = instance_labels
+ end
+
+ def increment_counter(name, value, tags)
+ ::StatsD.increment(name_tag_mapping(name, tags), value)
+ end
+
+ def set_gauge(name, value, tags)
+ ::StatsD.gauge(name_tag_mapping(name, tags), value)
+ end
+
+ def observe_histogram(name, value, tags)
+ ::StatsD.measure(name_tag_mapping(name, tags), value)
+ end
+
+ private
+
+ def name_tag_mapping(name, tags)
+ instances = @instances[name]
+ return name if instances.nil? || instances.empty?
+ (name.to_s + '.' + instances.map {|x| tags[x]}.compact.join('.')).tr('-:/ ', '____')
+ end
+ end
+ end
+ end
@@ -1,3 +1,3 @@
  module Dynflow
- VERSION = '1.1.0'.freeze
+ VERSION = '1.1.1'.freeze
  end
@@ -14,9 +14,14 @@ module Dynflow
  :termination_timeout, :terminated, :dead_letter_handler, :execution_plan_cleaner

  def initialize(config)
+ @config = Config::ForWorld.new(config, self)
+
+ # Set the telemetry instance as soon as possible
+ Dynflow::Telemetry.set_adapter @config.telemetry_adapter
+ Dynflow::Telemetry.register_metrics!
+
  @id = SecureRandom.uuid
  @clock = spawn_and_wait(Clock, 'clock')
- @config = Config::ForWorld.new(config, self)
  @logger_adapter = @config.logger_adapter
  @config.validate
  @transaction_adapter = @config.transaction_adapter
@@ -238,8 +243,49 @@ module Dynflow
  end

  def terminate(future = Concurrent.future)
+ start_termination.tangle(future)
+ future
+ end
+
+ def terminating?
+ defined?(@terminating)
+ end
+
+ # 24119 - ensure delayed executor is preserved after invalidation
+ # executes plans that are planned/paused and haven't reported any error yet (usually when no executor
+ # was available by the time of planning or terminating)
+ def auto_execute
+ coordinator.acquire(Coordinator::AutoExecuteLock.new(self)) do
+ planned_execution_plans =
+ self.persistence.find_execution_plans filters: { 'state' => %w(planned paused), 'result' => (ExecutionPlan.results - [:error]).map(&:to_s) }
+ planned_execution_plans.map do |ep|
+ if coordinator.find_locks(Dynflow::Coordinator::ExecutionLock.unique_filter(ep.id)).empty?
+ execute(ep.id)
+ end
+ end.compact
+ end
+ rescue Coordinator::LockError => e
+ logger.info "auto-executor lock already aquired: #{e.message}"
+ []
+ end
+
+ def try_spawn(what, lock_class = nil)
+ object = nil
+ return nil if !executor || (object = @config.public_send(what)).nil?
+
+ coordinator.acquire(lock_class.new(self)) if lock_class
+ object.spawn.wait
+ object
+ rescue Coordinator::LockError => e
+ nil
+ end
+
+ private
+
+ def start_termination
  @termination_barrier.synchronize do
- @terminating ||= Concurrent.future do
+ return @terminating if @terminating
+ termination_future ||= Concurrent.future do
  begin
  run_before_termination_hooks

@@ -252,7 +298,7 @@ module Dynflow
  throttle_limiter.terminate.wait(termination_timeout)

  if executor
- connector.stop_receiving_new_work(self)
+ connector.stop_receiving_new_work(self, termination_timeout)

  logger.info "start terminating executor..."
  executor.terminate.wait(termination_timeout)
@@ -269,7 +315,7 @@ module Dynflow
  client_dispatcher_terminated.wait(termination_timeout)

  logger.info "stop listening for new events..."
- connector.stop_listening(self)
+ connector.stop_listening(self, termination_timeout)

  if @clock
  logger.info "start terminating clock..."
@@ -282,49 +328,15 @@ module Dynflow
  rescue => e
  logger.fatal(e)
  end
+ end
+ @terminating = Concurrent.future do
+ termination_future.wait(termination_timeout)
  end.on_completion do
  Thread.new { Kernel.exit } if @exit_on_terminate.true?
  end
  end
-
- @terminating.tangle(future)
- future
- end
-
- def terminating?
- defined?(@terminating)
- end
-
- # 24119 - ensure delayed executor is preserved after invalidation
- # executes plans that are planned/paused and haven't reported any error yet (usually when no executor
- # was available by the time of planning or terminating)
- def auto_execute
- coordinator.acquire(Coordinator::AutoExecuteLock.new(self)) do
- planned_execution_plans =
- self.persistence.find_execution_plans filters: { 'state' => %w(planned paused), 'result' => (ExecutionPlan.results - [:error]).map(&:to_s) }
- planned_execution_plans.map do |ep|
- if coordinator.find_locks(Dynflow::Coordinator::ExecutionLock.unique_filter(ep.id)).empty?
- execute(ep.id)
- end
- end.compact
- end
- rescue Coordinator::LockError => e
- logger.info "auto-executor lock already aquired: #{e.message}"
- []
- end
-
- def try_spawn(what, lock_class = nil)
- object = nil
- return nil if !executor || (object = @config.public_send(what)).nil?
-
- coordinator.acquire(lock_class.new(self)) if lock_class
- object.spawn.wait
- object
- rescue Coordinator::LockError => e
- nil
  end

- private
  def calculate_subscription_index
  @subscription_index =
  action_classes.each_with_object(Hash.new { |h, k| h[k] = [] }) do |klass, index|
@@ -337,11 +349,14 @@ module Dynflow

  def run_before_termination_hooks
  until @before_termination_hooks.empty?
- begin
- @before_termination_hooks.pop.call
- rescue => e
- logger.error e
+ hook_run = Concurrent.future do
+ begin
+ @before_termination_hooks.pop.call
+ rescue => e
+ logger.error e
+ end
  end
+ logger.error "timeout running before_termination_hook" unless hook_run.wait(termination_timeout)
  end
  end

@@ -94,6 +94,7 @@ module Dynflow
  end

  describe "when being executed" do
+ include TestHelpers

  let :execution_plan do
  world.plan(Support::CodeWorkflowExample::IncomingIssue, { 'text' => 'get a break' })
@@ -128,6 +129,16 @@ module Dynflow
  assert_raises(Dynflow::Error) { world.execute(execution_plan.id).value! }
  end
  end
+
+ it "handles when the execution plan is deleted" do
+ TestPause.when_paused do
+ world.persistence.delete_execution_plans(uuid: [execution_plan.id])
+ end
+ director = get_director(world)
+ wait_for('execution plan removed from executor') do
+ !director.current_execution_plan_ids.include?(execution_plan.id)
+ end
+ end
  end
  end

@@ -162,15 +162,21 @@ module TestHelpers
  end
  end

+ # get director for deeper investigation of the current execution state
+ def get_director(world)
+ core_context = world.executor.instance_variable_get('@core').instance_variable_get('@core').context
+ core_context.instance_variable_get('@director')
+ end
+
  # waits for the passed block to return non-nil value and reiterates it while getting false
  # (till some reasonable timeout). Useful for forcing the tests for some event to occur
- def wait_for
+ def wait_for(waiting_message = 'something to happen')
  30.times do
  ret = yield
  return ret if ret
  sleep 0.3
  end
- raise 'waiting for something to happen was not successful'
+ raise "waiting for #{waiting_message} was not successful"
  end

  def executor_id_for_plan(execution_plan_id)
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: dynflow
  version: !ruby/object:Gem::Version
- version: 1.1.0
+ version: 1.1.1
  platform: ruby
  authors:
  - Ivan Necas
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-07-09 00:00:00.000000000 Z
+ date: 2018-10-05 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: multi_json
@@ -379,6 +379,7 @@ files:
  - examples/singletons.rb
  - examples/sub_plan_concurrency_control.rb
  - examples/sub_plans.rb
+ - extras/statsd_mapping.conf
  - lib/dynflow.rb
  - lib/dynflow/action.rb
  - lib/dynflow/action/cancellable.rb
@@ -408,6 +409,7 @@ files:
  - lib/dynflow/coordinator_adapters/abstract.rb
  - lib/dynflow/coordinator_adapters/sequel.rb
  - lib/dynflow/dead_letter_silencer.rb
+ - lib/dynflow/debug/telemetry/persistence.rb
  - lib/dynflow/delayed_executors.rb
  - lib/dynflow/delayed_executors/abstract.rb
  - lib/dynflow/delayed_executors/abstract_core.rb
@@ -500,6 +502,9 @@ files:
  - lib/dynflow/serializers/noop.rb
  - lib/dynflow/stateful.rb
  - lib/dynflow/telemetry.rb
+ - lib/dynflow/telemetry_adapters/abstract.rb
+ - lib/dynflow/telemetry_adapters/dummy.rb
+ - lib/dynflow/telemetry_adapters/statsd.rb
  - lib/dynflow/testing.rb
  - lib/dynflow/testing/assertions.rb
  - lib/dynflow/testing/dummy_execution_plan.rb