fractor 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +28 -91
- data/docs/ARCHITECTURE.md +317 -0
- data/docs/PERFORMANCE_TUNING.md +355 -0
- data/docs/TROUBLESHOOTING.md +463 -0
- data/lib/fractor/callback_registry.rb +106 -0
- data/lib/fractor/config_schema.rb +170 -0
- data/lib/fractor/main_loop_handler.rb +4 -8
- data/lib/fractor/main_loop_handler3.rb +10 -12
- data/lib/fractor/main_loop_handler4.rb +48 -20
- data/lib/fractor/result_cache.rb +58 -10
- data/lib/fractor/shutdown_handler.rb +12 -6
- data/lib/fractor/supervisor.rb +100 -13
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/workflow/execution/dependency_resolver.rb +149 -0
- data/lib/fractor/workflow/execution/fallback_job_handler.rb +68 -0
- data/lib/fractor/workflow/execution/job_executor.rb +242 -0
- data/lib/fractor/workflow/execution/result_builder.rb +76 -0
- data/lib/fractor/workflow/execution/workflow_execution_logger.rb +241 -0
- data/lib/fractor/workflow/workflow_executor.rb +97 -476
- data/lib/fractor/wrapped_ractor.rb +2 -4
- data/lib/fractor.rb +11 -0
- metadata +12 -2
data/lib/fractor/supervisor.rb
CHANGED
@@ -12,7 +12,7 @@ module Fractor
   # Supervises multiple WrappedRactors, distributes work, and aggregates results.
   class Supervisor
     attr_reader :work_queue, :workers, :results, :worker_pools, :debug,
-                :error_reporter, :logger, :performance_monitor
+                :error_reporter, :logger, :performance_monitor, :callback_registry

     # Initializes the Supervisor.
     # - worker_pools: An array of worker pool configurations, each containing:

@@ -81,13 +81,14 @@ module Fractor
       @ractors_map = {} # Map Ractor object to WrappedRactor instance
       @continuous_mode = continuous_mode
       @running = false
-      @work_callbacks = []
       @wakeup_ractor = nil # Control ractor for unblocking select
       @timer_thread = nil # Timer thread for periodic wakeup
       @error_reporter = ErrorReporter.new # Track errors and statistics
-      @error_callbacks = [] # Custom error callbacks
       @performance_monitor = nil # Performance monitor instance

+      # Initialize callback registry for managing work and error callbacks
+      @callback_registry = CallbackRegistry.new(debug: @debug)
+
       # Initialize performance monitor if enabled
       if enable_performance_monitoring
         require_relative "performance_monitor"

@@ -112,6 +113,7 @@ module Fractor
         @timer_thread,
         @performance_monitor,
         debug: @debug,
+        continuous_mode: @continuous_mode,
       )

       # Initialize signal handler for graceful shutdown

@@ -172,14 +174,14 @@ module Fractor
     # Register a callback to provide new work items
     # The callback should return nil or empty array when no new work is available
     def register_work_source(&callback)
-      @
+      @callback_registry.register_work_source(&callback)
     end

     # Register a callback to handle errors
     # The callback receives (error_result, worker_name, worker_class)
     # Example: supervisor.on_error { |err, worker, klass| puts "Error in #{klass}: #{err.error}" }
     def on_error(&callback)
-      @
+      @callback_registry.register_error_callback(&callback)
     end

     # Starts the worker Ractors for all worker pools.

@@ -188,11 +190,8 @@ module Fractor
       # Pass as parameter to avoid isolation error
       debug_mode = @debug

-      # Check if running on Ruby 4.0
-      ruby_4_0 = Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("4.0.0")
-
       # Create a wakeup Ractor for unblocking Ractor.select
-      if
+      if Fractor::RUBY_4_0_OR_HIGHER
         # In Ruby 4.0, wakeup uses ports too
         @wakeup_port = Ractor::Port.new
         @wakeup_ractor = Ractor.new(@wakeup_port, debug_mode) do |port, debug|

@@ -231,7 +230,7 @@ module Fractor

       pool[:workers] = (1..num_workers).map do |i|
         # In Ruby 4.0, create a response port for each worker
-        response_port = if
+        response_port = if Fractor::RUBY_4_0_OR_HIGHER
                           Ractor::Port.new
                         end

@@ -322,7 +321,9 @@ module Fractor
       end

       # Start timer thread for continuous mode to periodically check work sources
-
+      # CRITICAL: Always start timer thread in continuous mode to ensure main loop
+      # can periodically check for worker termination during shutdown
+      start_timer_thread if @continuous_mode

       begin
         # Run the main event loop through MainLoopHandler

@@ -361,6 +362,18 @@ module Fractor

       @running = false

+      # CRITICAL: Send immediate wakeup signal to unblock main loop from Ractor.select
+      # This is especially important for Ruby 3.4+ where Ractor.select may block indefinitely
+      # without periodic checks of @running. The timer thread might take time to exit,
+      # so we send the signal here immediately.
+      if @wakeup_ractor
+        begin
+          @wakeup_ractor.send(:shutdown)
+        rescue StandardError => e
+          puts "Error sending shutdown to wakeup ractor: #{e.message}" if @debug
+        end
+      end
+
       # Update shutdown handler with current references before shutdown
       @shutdown_handler.instance_variable_set(:@workers, @workers)
       @shutdown_handler.instance_variable_set(:@wakeup_ractor, @wakeup_ractor)

@@ -377,13 +390,29 @@ module Fractor

     # Start the timer thread for continuous mode.
     # This thread periodically wakes up the main loop to check for new work.
+    # CRITICAL: Always start the timer thread in continuous mode, even without callbacks,
+    # to ensure the main loop can periodically check for worker termination during shutdown.
     #
     # @return [void]
     def start_timer_thread
       @timer_thread = Thread.new do
-
+        # Keep running during shutdown to allow periodic checks for worker termination
+        # Only exit when @shutting_down is true AND workers are closed
+        loop do
          sleep(0.1) # Check work sources every 100ms
-
+
+          # Exit if we're no longer running AND (not in continuous mode OR workers are closed)
+          break if !@running && (!@continuous_mode || workers.all?(&:closed?))
+
+          # Send wakeup signals if running, or during shutdown in continuous mode until workers close
+          should_send = if @running
+                          @running
+                        else
+                          # During shutdown in continuous mode, keep sending until workers close
+                          @continuous_mode && !workers.all?(&:closed?)
+                        end
+
+          if @wakeup_ractor && should_send
            begin
              @wakeup_ractor.send(:wakeup)
            rescue StandardError => e

@@ -550,5 +579,63 @@ module Fractor

       @performance_monitor.snapshot
     end
+
+    # Class-level documentation for Supervisor configuration options.
+    # Provides a summary of valid configuration parameters for the initialize method.
+    #
+    # @example Print configuration help
+    #   puts Fractor::Supervisor.configuration_help
+    #
+    # @return [String] Configuration documentation
+    def self.configuration_help
+      <<~HELP
+        Fractor::Supervisor Configuration Options
+        ==========================================
+
+        The Supervisor accepts the following keyword arguments to initialize():
+
+        worker_pools (Array, required)
+          Array of worker pool configuration hashes.
+          Each hash must contain:
+          - worker_class: Class inheriting from Fractor::Worker (required)
+          - num_workers: Positive integer for number of workers (optional, defaults to CPU count)
+
+          Example:
+            worker_pools: [
+              { worker_class: MyWorker, num_workers: 4 },
+              { worker_class: AnotherWorker, num_workers: 2 }
+            ]
+
+        continuous_mode (Boolean, optional, default: false)
+          Whether to run in continuous mode (long-running) or batch mode.
+          - false: Batch mode - processes all work items and exits
+          - true: Continuous mode - runs until stopped, accepts work from callbacks
+
+        debug (Boolean, optional, default: false)
+          Enable verbose debug output for all state changes.
+          Can also be enabled via FRACTOR_DEBUG=1 environment variable.
+
+        logger (Logger, optional, default: Fractor.logger)
+          Optional logger instance for this Supervisor.
+          Provides isolation when multiple gems use Fractor in the same process.
+
+        tracer_enabled (Boolean, optional)
+          Override for ExecutionTracer. nil uses global setting.
+
+        tracer_stream (IO, optional)
+          Optional trace stream for this Supervisor. nil uses global setting.
+
+        enable_performance_monitoring (Boolean, optional, default: false)
+          Enable performance monitoring (latency, throughput, etc.).
+          When enabled, performance_metrics() returns current metrics.
+
+        Validation
+        ----------
+        All configuration is validated at initialization time with detailed error messages.
+        Invalid configurations will raise ArgumentError with helpful fix suggestions.
+
+        For more information, see the Supervisor class documentation.
+      HELP
+    end
   end
 end
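The supervisor changes above replace the old instance-level `@work_callbacks`/`@error_callbacks` arrays with a `CallbackRegistry`. Below is a minimal, hypothetical sketch of how the registration API reads from the caller's side; it assumes a `MyWorker` class that subclasses `Fractor::Worker` and is defined elsewhere (not part of this diff), and uses only the methods visible in this diff.

```ruby
require "fractor"

# New in 0.1.10: class-level configuration summary.
puts Fractor::Supervisor.configuration_help

supervisor = Fractor::Supervisor.new(
  worker_pools: [{ worker_class: MyWorker, num_workers: 2 }], # MyWorker assumed
  continuous_mode: true
)

# Both callbacks are now stored in supervisor.callback_registry rather than
# in arrays held directly by the Supervisor.
supervisor.register_work_source do
  # Return nil (or an empty array) when no new work is available.
  nil
end

supervisor.on_error do |error_result, worker_name, worker_class|
  warn "Error in #{worker_class} (#{worker_name}): #{error_result.error}"
end
```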
data/lib/fractor/workflow/execution/dependency_resolver.rb
ADDED
@@ -0,0 +1,149 @@
+# frozen_string_literal: true
+
+require "set"
+require "digest"
+
+module Fractor
+  class Workflow
+    # Computes the execution order for workflow jobs using topological sort.
+    # Jobs are grouped into levels where all jobs in a level can be executed
+    # in parallel (their dependencies are satisfied).
+    #
+    # Caches execution order based on job structure to avoid recomputing
+    # topological sort for static workflow definitions.
+    class DependencyResolver
+      # Class-level cache for execution orders.
+      # Keyed by workflow signature (hash of job structure).
+      @cache = {}
+      @mutex = Mutex.new
+
+      class << self
+        attr_reader :cache
+
+        # Clear the entire execution order cache.
+        # Useful for testing or when workflows are dynamically modified.
+        def clear_cache
+          @mutex.synchronize { @cache.clear }
+        end
+
+        # Clear cache entries for a specific workflow.
+        #
+        # @param workflow_signature [String] The workflow signature to clear
+        def clear_cache_for(workflow_signature)
+          @mutex.synchronize { @cache.delete(workflow_signature) }
+        end
+      end
+
+      # Initialize the resolver with a workflow's jobs.
+      #
+      # @param jobs [Hash] Hash of job_name => Job objects
+      # @param enable_cache [Boolean] Whether to use cached execution order (default: true)
+      def initialize(jobs, enable_cache: true)
+        @jobs = jobs
+        @enable_cache = enable_cache
+        @signature = compute_signature if enable_cache
+      end
+
+      # Compute the execution order using topological sort.
+      # Returns an array of arrays, where each inner array contains job names
+      # that can be executed in parallel (their dependencies are satisfied).
+      #
+      # Results are cached based on the workflow's job structure (job names
+      # and their dependencies). This provides significant performance benefits
+      # for workflows that are executed multiple times.
+      #
+      # @return [Array<Array<String>>] Execution order as grouped job names
+      def execution_order
+        # Try to get from cache first
+        if @enable_cache && @signature && cached_execution_order
+          return cached_execution_order
+        end
+
+        # Compute the execution order
+        order = compute_order
+
+        # Cache the result
+        cache_execution_order(order) if @enable_cache && @signature
+
+        order
+      end
+
+      # Invalidate the cache for this workflow's execution order.
+      # Call this if the workflow definition changes dynamically.
+      def invalidate_cache
+        return unless @enable_cache && @signature
+
+        self.class.clear_cache_for(@signature)
+        @cached = false
+      end
+
+      private
+
+      # Get the cached execution order for this workflow.
+      #
+      # @return [Array<Array<String>>, nil] Cached execution order or nil
+      def cached_execution_order
+        self.class.cache[@signature]
+      end
+
+      # Cache an execution order for this workflow.
+      #
+      # @param order [Array<Array<String>>] The execution order to cache
+      def cache_execution_order(order)
+        DependencyResolver.cache[@signature] = order
+      end
+
+      # Compute a unique signature for this workflow's job structure.
+      # The signature is based on job names and their dependencies.
+      #
+      # @return [String] A hash representing the workflow structure
+      def compute_signature
+        # Build a deterministic representation of the workflow structure
+        structure = {}
+        @jobs.each do |name, job|
+          structure[name] = {
+            dependencies: Array(job.dependencies).sort,
+          }
+        end
+
+        # Sort by job name for deterministic hashing
+        sorted_structure = structure.sort.to_h
+
+        # Generate SHA256 hash of the structure
+        Digest::SHA256.hexdigest(JSON.dump(sorted_structure))
+      end
+
+      # Compute the execution order using topological sort.
+      #
+      # @return [Array<Array<String>>] Execution order as grouped job names
+      def compute_order
+        order = []
+        remaining = @jobs.keys.to_set
+        processed = Set.new
+
+        until remaining.empty?
+          # Find jobs whose dependencies are all satisfied
+          ready = remaining.select do |job_name|
+            job = @jobs[job_name]
+            job.dependencies.all? { |dep| processed.include?(dep) }
+          end
+
+          if ready.empty?
+            # This should not happen if validation was done correctly
+            raise WorkflowExecutionError,
+                  "Cannot find next jobs to execute. Remaining: #{remaining.to_a.join(', ')}"
+          end
+
+          order << ready
+          ready.each do |job_name|
+            processed.add(job_name)
+            remaining.delete(job_name)
+          end
+        end
+
+        puts "Execution order: #{order.inspect}" if ENV["FRACTOR_DEBUG"]
+        order
+      end
+    end
+  end
+end
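As a rough usage sketch of the resolver above (assuming the gem's top-level require loads the workflow execution classes), Struct instances stand in for the workflow's Job objects, since the resolver only calls `#dependencies` on them; the job names below are illustrative, not from the gem.

```ruby
require "fractor"
require "json" # compute_signature serializes the job structure with JSON.dump

# Stand-in for Workflow::Job; only #dependencies is needed by the resolver.
FakeJob = Struct.new(:dependencies)

jobs = {
  "fetch"   => FakeJob.new([]),
  "parse"   => FakeJob.new(["fetch"]),
  "enrich"  => FakeJob.new(["fetch"]),
  "publish" => FakeJob.new(%w[parse enrich]),
}

resolver = Fractor::Workflow::DependencyResolver.new(jobs)
# Jobs grouped into levels that can run in parallel, e.g.:
# [["fetch"], ["parse", "enrich"], ["publish"]]
p resolver.execution_order
```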
data/lib/fractor/workflow/execution/fallback_job_handler.rb
ADDED
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+
+module Fractor
+  class Workflow
+    # Handles fallback job execution when a primary job fails.
+    # Manages the lifecycle of executing a fallback job and integrating
+    # its result back into the workflow context.
+    class FallbackJobHandler
+      # Initialize the fallback handler.
+      #
+      # @param workflow [Workflow] The workflow instance
+      # @param context [WorkflowContext] The execution context
+      # @param hooks [ExecutionHooks] Execution hooks for event notification
+      # @param logger [WorkflowLogger] The workflow logger
+      def initialize(workflow, context, hooks, logger)
+        @workflow = workflow
+        @context = context
+        @hooks = hooks
+        @logger = logger
+      end
+
+      # Execute a fallback job for a failed job.
+      #
+      # @param original_job [Job] The job that failed
+      # @param original_error [Exception] The error that occurred
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @param job_executor [JobExecutor] The job executor to use
+      # @param start_time [Time] The original job start time (for duration calculation)
+      # @return [Object] The output from the fallback job
+      def execute_fallback(original_job, original_error, job_trace,
+                           job_executor, start_time)
+        fallback_job_name = original_job.fallback_job
+        fallback_job = @workflow.class.jobs[fallback_job_name]
+
+        unless fallback_job
+          raise WorkflowExecutionError,
+                "Fallback job '#{fallback_job_name}' not found for job '#{original_job.name}'"
+        end
+
+        @logger.fallback_execution(original_job.name, fallback_job.name,
+                                   original_error)
+
+        begin
+          # Execute fallback job using job_executor
+          output = job_executor.execute_once(fallback_job, job_trace)
+
+          # Store output under original job name as well
+          @context.store_job_output(original_job.name, output)
+          original_job.state(:completed)
+
+          duration = Time.now - start_time
+
+          # Update trace
+          job_trace&.complete!(output: output)
+
+          @logger.job_complete(original_job.name, duration)
+          @hooks.trigger(:job_complete, original_job, output, duration)
+
+          output
+        rescue StandardError => e
+          @logger.fallback_failed(original_job.name, fallback_job.name, e)
+          raise WorkflowExecutionError,
+                "Job '#{original_job.name}' and fallback '#{fallback_job_name}' both failed"
+        end
+      end
+    end
+  end
+end
data/lib/fractor/workflow/execution/job_executor.rb
ADDED
@@ -0,0 +1,242 @@
+# frozen_string_literal: true
+
+require_relative "../../supervisor"
+require_relative "../../work"
+require_relative "../retry_orchestrator"
+
+module Fractor
+  class Workflow
+    # Executes a single workflow job, handling all aspects of job execution
+    # including input building, work creation, and supervisor orchestration.
+    class JobExecutor
+      attr_reader :context, :logger, :dead_letter_queue
+
+      # Initialize the job executor.
+      #
+      # @param context [WorkflowContext] The workflow execution context
+      # @param logger [WorkflowLogger] The workflow logger
+      # @param workflow [Workflow] The workflow instance
+      # @param completed_jobs [Set<String>] Set of completed job names
+      # @param failed_jobs [Set<String>] Set of failed job names
+      # @param dead_letter_queue [DeadLetterQueue, nil] Optional DLQ for failed jobs
+      # @param circuit_breakers [CircuitBreakerRegistry] Circuit breaker registry
+      def initialize(context, logger, workflow: nil, completed_jobs: nil, failed_jobs: nil,
+                     dead_letter_queue: nil, circuit_breakers: nil)
+        @context = context
+        @logger = logger
+        @workflow = workflow
+        @completed_jobs = completed_jobs || Set.new
+        @failed_jobs = failed_jobs || Set.new
+        @dead_letter_queue = dead_letter_queue
+        @circuit_breakers = circuit_breakers || CircuitBreakerRegistry.new
+      end
+
+      # Execute a job once (no retry logic).
+      #
+      # @param job [Job] The job to execute
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @return [Object] The job output
+      def execute_once(job, job_trace = nil)
+        # Build input for this job
+        job_input = @context.build_job_input(job)
+        job_trace&.set_input(job_input)
+
+        # Create work item - if job_input is already a Work object, use it directly
+        # to avoid double-wrapping (e.g., when using custom Work subclasses)
+        work = if job_input.is_a?(Work)
+                 job_input
+               else
+                 Work.new(job_input)
+               end
+
+        # Execute with circuit breaker if configured
+        if job.circuit_breaker_enabled?
+          execute_with_circuit_breaker(job, work, job_trace)
+        else
+          execute_with_supervisor(job, work)
+        end
+      end
+
+      # Execute a job with retry logic.
+      #
+      # @param job [Job] The job to execute
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @return [Object] The job output
+      def execute_with_retry(job, job_trace = nil)
+        retry_config = job.retry_config
+
+        # Create retry orchestrator with the job's retry configuration
+        orchestrator = RetryOrchestrator.new(retry_config,
+                                             debug: ENV["FRACTOR_DEBUG"] == "1")
+
+        # Execute with retry logic
+        orchestrator.execute_with_retry(job) do |j|
+          execute_once(j, job_trace)
+        end
+      rescue StandardError => e
+        # Get retry state for DLQ entry
+        retry_state = orchestrator.state
+        add_to_dead_letter_queue(job, e, retry_state)
+        raise e
+      end
+
+      # Execute a job using a supervisor.
+      #
+      # @param job [Job] The job to execute
+      # @param work [Work] The work item to process
+      # @return [Object] The job output
+      def execute_with_supervisor(job, work)
+        supervisor = Supervisor.new(
+          worker_pools: [
+            {
+              worker_class: job.worker_class,
+              num_workers: job.num_workers || 1,
+            },
+          ],
+        )
+
+        supervisor.add_work_item(work)
+        supervisor.run
+
+        # Check for errors first (before checking results)
+        unless supervisor.results.errors.empty?
+          error = supervisor.results.errors.first
+          raise WorkflowExecutionError,
+                "Job '#{job.name}' encountered error: #{error.error}"
+        end
+
+        # Get the result
+        results = supervisor.results.results
+        if results.empty?
+          raise WorkflowExecutionError, "Job '#{job.name}' produced no results"
+        end
+
+        results.first.result
+      end
+
+      # Execute a job with circuit breaker protection.
+      #
+      # @param job [Job] The job to execute
+      # @param work [Work] The work item to process
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @return [Object] The job output
+      def execute_with_circuit_breaker(job, work, _job_trace = nil)
+        breaker_key = job.circuit_breaker_key
+
+        # Get or create circuit breaker orchestrator for this job
+        orchestrator = @circuit_breakers.get_or_create_orchestrator(
+          breaker_key,
+          **job.circuit_breaker_config.slice(:threshold, :timeout,
+                                             :half_open_calls),
+          job_name: job.name,
+          debug: ENV["FRACTOR_DEBUG"] == "1",
+        )
+
+        # Log circuit state before execution
+        log_circuit_breaker_state(job, orchestrator)
+
+        begin
+          orchestrator.execute_with_breaker(job) do
+            execute_with_supervisor(job, work)
+          end
+        rescue Workflow::CircuitOpenError => e
+          log_circuit_breaker_open(job, orchestrator)
+          raise WorkflowExecutionError,
+                "Circuit breaker open for job '#{job.name}': #{e.message}"
+        end
+      end
+
+      private
+
+      # Add failed job to dead letter queue.
+      #
+      # @param job [Job] The job that failed
+      # @param error [Exception] The error that occurred
+      # @param retry_state [Object, nil] Optional retry state
+      def add_to_dead_letter_queue(job, error, retry_state = nil)
+        return unless @dead_letter_queue
+
+        # Build job input for DLQ entry
+        job_input = @context.build_job_input(job)
+        work = Work.new(job_input)
+
+        # Build metadata about the failure
+        metadata = build_failure_metadata(job, error, retry_state)
+
+        # Build context from workflow
+        context = {
+          workflow_input: @context.workflow_input,
+          completed_jobs: @completed_jobs.to_a,
+          failed_jobs: @failed_jobs.to_a,
+        }
+
+        @dead_letter_queue.add(work, error, context: context,
+                               metadata: metadata)
+
+        @logger.added_to_dead_letter_queue(job.name, error,
+                                           @dead_letter_queue.size)
+      end
+
+      # Build failure metadata for dead letter queue.
+      #
+      # @param job [Job] The job that failed
+      # @param error [Exception] The error that occurred
+      # @param retry_state [Object, nil] Optional retry state
+      # @return [Hash] Failure metadata
+      def build_failure_metadata(job, _error, retry_state)
+        metadata = {
+          job_name: job.name,
+          worker_class: job.worker_class.name,
+          correlation_id: @context.correlation_id,
+          workflow_name: @workflow.class.workflow_name,
+        }
+
+        # Add retry information if available
+        if retry_state
+          # Handle both RetryState object and Hash from orchestrator
+          if retry_state.is_a?(Hash)
+            # From RetryOrchestrator.state
+            metadata[:retry_attempts] = retry_state[:attempts] - 1
+            metadata[:max_attempts] = retry_state[:max_attempts]
+            metadata[:last_error] = retry_state[:last_error]
+            metadata[:total_retry_time] = retry_state[:total_time]
+            metadata[:all_errors] = retry_state[:all_errors]
+          else
+            # From RetryState object
+            metadata[:retry_attempts] = retry_state.attempt - 1
+            metadata[:total_retry_time] = retry_state.total_time
+            metadata[:all_errors] = retry_state.summary[:errors]
+          end
+        end
+
+        metadata
+      end
+
+      # Log circuit breaker state.
+      #
+      # @param job [Job] The job
+      # @param orchestrator [CircuitBreakerOrchestrator] The circuit breaker orchestrator
+      def log_circuit_breaker_state(job, orchestrator)
+        @logger.circuit_breaker_state(
+          job.name,
+          orchestrator.state,
+          failure_count: orchestrator.failure_count,
+          threshold: orchestrator.breaker.threshold,
+        )
+      end
+
+      # Log circuit breaker open.
+      #
+      # @param job [Job] The job
+      # @param orchestrator [CircuitBreakerOrchestrator] The circuit breaker orchestrator
+      def log_circuit_breaker_open(job, orchestrator)
+        @logger.circuit_breaker_open(
+          job.name,
+          orchestrator.failure_count,
+          orchestrator.breaker.threshold,
+          last_failure: orchestrator.breaker.last_failure_time,
+        )
+      end
+    end
+  end
+end