fractor 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +28 -91
- data/docs/ARCHITECTURE.md +317 -0
- data/docs/PERFORMANCE_TUNING.md +355 -0
- data/docs/TROUBLESHOOTING.md +463 -0
- data/lib/fractor/callback_registry.rb +106 -0
- data/lib/fractor/config_schema.rb +170 -0
- data/lib/fractor/main_loop_handler.rb +4 -8
- data/lib/fractor/main_loop_handler3.rb +10 -12
- data/lib/fractor/main_loop_handler4.rb +48 -20
- data/lib/fractor/result_cache.rb +58 -10
- data/lib/fractor/shutdown_handler.rb +12 -6
- data/lib/fractor/supervisor.rb +100 -13
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/workflow/execution/dependency_resolver.rb +149 -0
- data/lib/fractor/workflow/execution/fallback_job_handler.rb +68 -0
- data/lib/fractor/workflow/execution/job_executor.rb +242 -0
- data/lib/fractor/workflow/execution/result_builder.rb +76 -0
- data/lib/fractor/workflow/execution/workflow_execution_logger.rb +241 -0
- data/lib/fractor/workflow/workflow_executor.rb +97 -476
- data/lib/fractor/wrapped_ractor.rb +2 -4
- data/lib/fractor.rb +11 -0
- metadata +12 -2
data/lib/fractor/workflow/workflow_executor.rb

@@ -2,26 +2,36 @@
 
 require "set"
 require_relative "retry_config"
-require_relative "circuit_breaker"
 require_relative "circuit_breaker_registry"
-require_relative "circuit_breaker_orchestrator"
-require_relative "retry_orchestrator"
 require_relative "pre_execution_context"
 require_relative "execution_hooks"
 require_relative "workflow_result"
+require_relative "execution/dependency_resolver"
+require_relative "execution/workflow_execution_logger"
+require_relative "execution/job_executor"
+require_relative "execution/fallback_job_handler"
+require_relative "execution/result_builder"
 
 module Fractor
   class Workflow
     # Orchestrates workflow execution by managing job execution order and data flow.
+    # Refactored to use focused helper classes for each responsibility.
     class WorkflowExecutor
       attr_reader :workflow, :context, :completed_jobs, :failed_jobs,
-                  :trace, :hooks, :pre_execution_context
+                  :trace, :hooks, :pre_execution_context, :job_executor
 
+      # Initialize the workflow executor.
+      #
+      # @param workflow [Workflow] The workflow instance to execute
+      # @param input [Object] The input data for the workflow
+      # @param correlation_id [String, nil] Optional correlation ID for tracking
+      # @param logger [Logger, nil] Optional logger instance
+      # @param trace [Boolean] Whether to enable execution tracing
+      # @param dead_letter_queue [DeadLetterQueue, nil] Optional dead letter queue
       def initialize(workflow, input, correlation_id: nil, logger: nil,
-                     trace: false, dead_letter_queue: nil)
+                     trace: false, dead_letter_queue: nil)
         @workflow = workflow
         @correlation_id = correlation_id
-        @logger = logger
         @context = WorkflowContext.new(
           input,
           correlation_id: correlation_id,
@@ -31,9 +41,20 @@ trace: false, dead_letter_queue: nil)
         @failed_jobs = Set.new
         @hooks = ExecutionHooks.new
         @trace = trace ? create_trace : nil
-        @circuit_breakers =
+        @circuit_breakers = CircuitBreakerRegistry.new
         @dead_letter_queue = dead_letter_queue
         @pre_execution_context = PreExecutionContext.new(workflow, input)
+
+        # Initialize helper classes
+        @logger = WorkflowExecutionLogger.new(logger)
+        @job_executor = JobExecutor.new(@context, @logger,
+                                        workflow: workflow,
+                                        completed_jobs: @completed_jobs,
+                                        failed_jobs: @failed_jobs,
+                                        dead_letter_queue: @dead_letter_queue,
+                                        circuit_breakers: @circuit_breakers)
+        @fallback_handler = FallbackJobHandler.new(@workflow, @context, @hooks,
+                                                   @logger)
       end
 
       # Execute the workflow and return the result.
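One pattern visible throughout this diff: the executor's many private log_* helpers, each guarded by `return unless @context.logger`, are replaced by a single WorkflowExecutionLogger wrapper that is always safe to call. The sketch below is illustrative only; the method names and arguments are taken from the calls added in this diff, but the real implementation lives in data/lib/fractor/workflow/execution/workflow_execution_logger.rb and may differ in detail.

    require "logger"

    # Minimal sketch of the wrapper idea: hold an optional Logger so callers
    # never need nil checks. Not fractor's actual class.
    class WorkflowExecutionLogger
      def initialize(logger)
        @logger = logger # may be nil; every method below tolerates that
      end

      def workflow_start(name, correlation_id)
        @logger&.info("Workflow '#{name}' starting (correlation_id=#{correlation_id})")
      end

      def workflow_complete(name, duration, jobs_completed:, jobs_failed:)
        @logger&.info("Workflow '#{name}' finished in #{(duration * 1000).round(2)}ms " \
                      "(#{jobs_completed} completed, #{jobs_failed} failed)")
      end

      def job_start(job_name, worker_class_name)
        @logger&.info("Job '#{job_name}' starting (worker=#{worker_class_name})")
      end

      def job_complete(job_name, duration)
        @logger&.info("Job '#{job_name}' completed in #{(duration * 1000).round(2)}ms")
      end

      def job_error(job_name, error, has_fallback: false)
        severity = has_fallback ? :warn : :error # warn when a fallback will handle it
        @logger&.public_send(severity, "Job '#{job_name}' failed: #{error.class}: #{error.message}")
      end
    end

    log = WorkflowExecutionLogger.new(Logger.new($stdout))
    log.job_start("parse", "ParseWorker")
    log.job_error("parse", RuntimeError.new("boom"), has_fallback: true)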
@@ -43,14 +64,16 @@ trace: false, dead_letter_queue: nil)
         # Run pre-execution validation
         @pre_execution_context.validate!
 
-
-
+        @logger.workflow_start(@workflow.class.workflow_name,
+                               @context.correlation_id)
+        @hooks.trigger(:workflow_start, @workflow)
         @trace&.start_job(
           job_name: "workflow",
-          worker_class: workflow.class.name,
+          worker_class: @workflow.class.name,
         )
 
-
+        resolver = DependencyResolver.new(@workflow.class.jobs)
+        execution_order = resolver.execution_order
         start_time = Time.now
 
         execution_order.each do |job_group|
@@ -61,13 +84,19 @@ trace: false, dead_letter_queue: nil)
         end_time = Time.now
         @trace&.complete!
 
-
-
+        @logger.workflow_complete(@workflow.class.workflow_name,
+                                  end_time - start_time,
+                                  jobs_completed: @completed_jobs.size,
+                                  jobs_failed: @failed_jobs.size)
+
+        result_builder = ResultBuilder.new(@workflow, @context, @completed_jobs,
+                                           @failed_jobs, trace: @trace)
+        result = result_builder.build(start_time, end_time)
         @hooks.trigger(:workflow_complete, result)
         result
       end
 
-      # Register a hook for workflow/job lifecycle events
+      # Register a hook for workflow/job lifecycle events.
       #
       # @param event [Symbol] The event to hook into
       # @param block [Proc] The callback to execute
@@ -93,41 +122,12 @@ trace: false, dead_letter_queue: nil)
 
       private
 
-
-
-
-        jobs = workflow.class.jobs
-        order = []
-        remaining = jobs.keys.to_set
-        processed = Set.new
-
-        until remaining.empty?
-          # Find jobs whose dependencies are all satisfied
-          ready = remaining.select do |job_name|
-            job = jobs[job_name]
-            job.dependencies.all? { |dep| processed.include?(dep) }
-          end
-
-          if ready.empty?
-            # This should not happen if validation was done correctly
-            raise WorkflowExecutionError,
-                  "Cannot find next jobs to execute. Remaining: #{remaining.to_a.join(', ')}"
-          end
-
-          order << ready
-          ready.each do |job_name|
-            processed.add(job_name)
-            remaining.delete(job_name)
-          end
-        end
-
-        puts "Execution order: #{order.inspect}" if ENV["FRACTOR_DEBUG"]
-        order
-      end
-
+      # Execute a group of jobs (can be run in parallel).
+      #
+      # @param job_names [Array<String>] Names of jobs to execute
       def execute_job_group(job_names)
         puts "Executing job group: #{job_names.inspect}" if ENV["FRACTOR_DEBUG"]
-        jobs = job_names.map { |name| workflow.class.jobs[name] }
+        jobs = job_names.map { |name| @workflow.class.jobs[name] }
 
         # Filter jobs based on conditions
         executable_jobs = jobs.select { |job| job.should_execute?(@context) }
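The hunk above deletes the executor's inline computation of the execution order; the DependencyResolver required at the top of the file and called in the execute path earlier in this diff presumably encapsulates the same level-ordered grouping. A self-contained sketch of that algorithm, using a hypothetical dependency map in place of real job objects:

    require "set"

    # Hypothetical dependency map; in fractor the keys would be job names and
    # the values each job's dependencies.
    deps = {
      "fetch"   => [],
      "parse"   => ["fetch"],
      "enrich"  => ["fetch"],
      "publish" => ["parse", "enrich"],
    }

    order = []
    remaining = deps.keys.to_set
    processed = Set.new

    until remaining.empty?
      # Jobs whose dependencies are all satisfied can run as one group.
      ready = remaining.select { |name| deps[name].all? { |dep| processed.include?(dep) } }
      raise "Dependency cycle among: #{remaining.to_a.join(', ')}" if ready.empty?

      order << ready
      ready.each do |name|
        processed.add(name)
        remaining.delete(name)
      end
    end

    p order # => [["fetch"], ["parse", "enrich"], ["publish"]]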
@@ -146,12 +146,13 @@ trace: false, dead_letter_queue: nil)
         else
           # Multiple jobs - execute sequentially (not parallel to avoid Ractor issues)
           puts "Executing #{executable_jobs.size} jobs sequentially" if ENV["FRACTOR_DEBUG"]
-          executable_jobs.each
-            execute_job(job)
-          end
+          executable_jobs.each { |job| execute_job(job) }
         end
       end
 
+      # Execute a single job with all its lifecycle management.
+      #
+      # @param job [Job] The job to execute
       def execute_job(job)
         puts "Executing job: #{job.name}" if ENV["FRACTOR_DEBUG"]
         job.state(:running)
@@ -163,7 +164,7 @@ trace: false, dead_letter_queue: nil)
         )
 
         # Log and trigger hook
-
+        @logger.job_start(job.name, job.worker_class.name)
         @hooks.trigger(:job_start, job, @context)
 
         start_time = Time.now
@@ -171,9 +172,9 @@ trace: false, dead_letter_queue: nil)
         begin
           # Execute with retry logic if configured
           output = if job.retry_enabled?
-
+                     @job_executor.execute_with_retry(job, job_trace)
                    else
-
+                     @job_executor.execute_once(job, job_trace)
                    end
 
           # Calculate duration
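Retry handling now sits behind JobExecutor#execute_with_retry; the removed execute_job_with_retry in the final hunk below shows the flow being delegated: run attempts through a retry orchestrator and, when they are exhausted, record the failure in the dead letter queue before re-raising. A self-contained sketch of that flow with stand-in classes (fractor's RetryOrchestrator and dead letter queue interfaces may differ):

    # Stand-in orchestrator for illustration only.
    class TinyRetryOrchestrator
      attr_reader :state

      def initialize(max_attempts:)
        @max_attempts = max_attempts
        @state = { attempts: 0, last_error: nil }
      end

      def execute_with_retry(job)
        @state[:attempts] += 1
        yield job
      rescue StandardError => e
        @state[:last_error] = e.message
        retry if @state[:attempts] < @max_attempts
        raise
      end
    end

    dead_letters = []
    orchestrator = TinyRetryOrchestrator.new(max_attempts: 3)

    begin
      orchestrator.execute_with_retry("flaky-job") { |_job| raise "transient failure" }
    rescue StandardError => e
      # Mirrors add_to_dead_letter_queue(job, error, retry_state) in the removed code.
      dead_letters << { job: "flaky-job", error: e.message, retry_state: orchestrator.state }
    end

    p dead_letters.first
    # => {:job=>"flaky-job", :error=>"transient failure", :retry_state=>{:attempts=>3, :last_error=>"transient failure"}}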
@@ -188,7 +189,7 @@ trace: false, dead_letter_queue: nil)
           job_trace&.complete!(output: output)
 
           # Log and trigger hook
-
+          @logger.job_complete(job.name, duration)
           @hooks.trigger(:job_complete, job, output, duration)
 
           puts "Job '#{job.name}' completed successfully" if ENV["FRACTOR_DEBUG"]
@@ -204,465 +205,85 @@ trace: false, dead_letter_queue: nil)
           job.handle_error(e, @context)
 
           # Log and trigger hook
-
+          @logger.job_error(job.name, e, has_fallback: !!job.fallback_job)
           @hooks.trigger(:job_error, job, e, @context)
 
           puts "Job '#{job.name}' failed: #{e.message}" if ENV["FRACTOR_DEBUG"]
 
           # Try fallback job if configured
           if job.fallback_job
-
-
-
-                  "Job '#{job.name}' failed: #{e.message}\n#{e.backtrace.join("\n")}"
-          end
-        end
-      end
-
-      def execute_job_once(job, job_trace)
-        # Build input for this job
-        job_input = @context.build_job_input(job)
-        job_trace&.set_input(job_input)
-
-        # Create work item - if job_input is already a Work object, use it directly
-        # to avoid double-wrapping (e.g., when using custom Work subclasses)
-        work = if job_input.is_a?(Work)
-                 job_input
-               else
-                 Work.new(job_input)
-               end
-
-        # Execute with circuit breaker if configured
-        if job.circuit_breaker_enabled?
-          execute_with_circuit_breaker(job, work)
-        else
-          execute_job_with_supervisor(job, work)
-        end
-      end
-
-      def execute_job_with_retry(job, job_trace)
-        retry_config = job.retry_config
-
-        # Create retry orchestrator with the job's retry configuration
-        orchestrator = RetryOrchestrator.new(retry_config,
-                                             debug: ENV["FRACTOR_DEBUG"] == "1")
-
-        # Execute with retry logic
-        orchestrator.execute_with_retry(job) do |j|
-          execute_job_once(j, job_trace)
-        end
-      rescue StandardError => e
-        # Get retry state for DLQ entry
-        retry_state = orchestrator.state
-        add_to_dead_letter_queue(job, e, retry_state)
-        raise e
-      end
-
-      def execute_fallback_job(job, error, start_time, job_trace)
-        fallback_job_name = job.fallback_job
-        fallback_job = workflow.class.jobs[fallback_job_name]
-
-        unless fallback_job
-          raise WorkflowExecutionError,
-                "Fallback job '#{fallback_job_name}' not found for job '#{job.name}'"
-        end
-
-        log_fallback_execution(job, fallback_job, error)
-
-        begin
-          # Execute fallback job
-          execute_job(fallback_job)
-
-          # Use fallback job's output
-          output = @context.job_output(fallback_job_name)
-          duration = Time.now - start_time
-
-          # Store output under original job name as well
-          @context.store_job_output(job.name, output)
-          @completed_jobs.add(job.name)
-          job.state(:completed)
-
-          # Update trace
-          job_trace&.complete!(output: output)
-
-          log_job_complete(job, duration)
-          @hooks.trigger(:job_complete, job, output, duration)
-        rescue StandardError => e
-          log_fallback_failed(job, fallback_job, e)
-          raise WorkflowExecutionError,
-                "Job '#{job.name}' and fallback '#{fallback_job_name}' both failed"
-        end
-      end
-
-      def execute_jobs_parallel(jobs)
-        puts "Executing #{jobs.size} jobs in parallel: #{jobs.map(&:name).join(', ')}" if ENV["FRACTOR_DEBUG"]
-
-        # Create supervisors for each job
-        supervisors = jobs.map do |job|
-          job.state(:running)
-          job_input = @context.build_job_input(job)
-          work = Work.new(job_input)
-
-          supervisor = Supervisor.new(
-            worker_pools: [
-              {
-                worker_class: job.worker_class,
-                num_workers: job.num_workers || 1,
-              },
-            ],
-          )
-          supervisor.add_work_item(work)
-
-          { job: job, supervisor: supervisor }
-        end
-
-        # Run all supervisors in parallel using threads
-        threads = supervisors.map do |spec|
-          Thread.new do
-            spec[:supervisor].run
-            { job: spec[:job], success: true, supervisor: spec[:supervisor] }
-          rescue StandardError => e
-            { job: spec[:job], success: false, error: e }
-          end
-        end
-
-        # Wait for all to complete and process results
-        threads.each do |thread|
-          result = thread.value
-          job = result[:job]
-
-          if result[:success]
-            # Extract output from supervisor results
-            job_results = result[:supervisor].results.results
-            if job_results.empty?
-              raise WorkflowExecutionError,
-                    "Job '#{job.name}' produced no results"
-            end
-
-            output = job_results.first.result
-            @context.store_job_output(job.name, output)
+            @fallback_handler.execute_fallback(job, e, job_trace,
+                                               @job_executor, start_time)
+            # Fallback succeeded - add original job to completed
             @completed_jobs.add(job.name)
-            job.state(:completed)
-
-            puts "Job '#{job.name}' completed successfully" if ENV["FRACTOR_DEBUG"]
           else
-            @failed_jobs.add(job.name)
-            job.state(:failed)
-            error = result[:error]
-            puts "Job '#{job.name}' failed: #{error.message}" if ENV["FRACTOR_DEBUG"]
             raise WorkflowExecutionError,
-                  "Job '#{job.name}' failed: #{
+                  "Job '#{job.name}' failed: #{e.message}\n#{e.backtrace.join("\n")}"
           end
         end
       end
 
-
-
-
-          {
-            worker_class: job.worker_class,
-            num_workers: job.num_workers || 1,
-          },
-        ],
-      )
-
-        supervisor.add_work_item(work)
-        supervisor.run
-
-        # Check for errors first (before checking results)
-        unless supervisor.results.errors.empty?
-          error = supervisor.results.errors.first
-          raise WorkflowExecutionError,
-                "Job '#{job.name}' encountered error: #{error.error}"
-        end
-
-        # Get the result
-        results = supervisor.results.results
-        if results.empty?
-          raise WorkflowExecutionError, "Job '#{job.name}' produced no results"
-        end
-
-        results.first.result
-      end
-
+      # Check if the workflow should terminate early.
+      #
+      # @return [Boolean] true if workflow should terminate
       def workflow_terminated?
         # Check if any terminating job has completed
-        workflow.class.jobs.each do |name, job|
+        @workflow.class.jobs.each do |name, job|
           return true if job.terminates && @completed_jobs.include?(name)
         end
         false
       end
 
+      # Create an execution trace.
+      #
+      # @return [ExecutionTrace] The execution trace
      def create_trace
        require "securerandom"
        execution_id = "exec-#{SecureRandom.hex(8)}"
        ExecutionTrace.new(
-          workflow_name: workflow.class.workflow_name,
+          workflow_name: @workflow.class.workflow_name,
          execution_id: execution_id,
          correlation_id: @context.correlation_id,
        )
      end
 
-
-
-
-
-
-          workflow: workflow.class.workflow_name,
-          correlation_id: @context.correlation_id,
-        )
-      end
-
-      def log_workflow_complete(duration)
-        return unless @context.logger
-
-        @context.logger.info(
-          "Workflow complete",
-          workflow: workflow.class.workflow_name,
-          duration_ms: (duration * 1000).round(2),
-          jobs_completed: @completed_jobs.size,
-          jobs_failed: @failed_jobs.size,
-        )
-      end
-
-      def log_job_start(job)
-        return unless @context.logger
-
-        @context.logger.info(
-          "Job starting",
-          job: job.name,
-          worker: job.worker_class.name,
-        )
-      end
-
-      def log_job_complete(job, duration)
-        return unless @context.logger
-
-        @context.logger.info(
-          "Job complete",
-          job: job.name,
-          duration_ms: (duration * 1000).round(2),
-        )
-      end
-
-      def log_job_error(job, error, has_fallback: false)
-        return unless @context.logger
-
-        # Log at WARN level if fallback is available (error is handled),
-        # otherwise log at ERROR level (error causes workflow failure)
-        log_method = has_fallback ? @context.logger.method(:warn) : @context.logger.method(:error)
-
-        log_method.call(
-          "Job '#{job.name}' encountered error: #{error}",
-          job: job.name,
-          error: error.class.name,
-        )
-      end
-
-      def log_retry_attempt(job, retry_state, delay)
-        return unless @context.logger
-
-        @context.logger.warn(
-          "Job retry attempt",
-          job: job.name,
-          attempt: retry_state.attempt,
-          max_attempts: job.retry_config.max_attempts,
-          delay_seconds: delay,
-          last_error: retry_state.last_error&.message,
-        )
-      end
-
-      def log_retry_success(job, retry_state)
-        return unless @context.logger
-
-        @context.logger.info(
-          "Job retry succeeded",
-          job: job.name,
-          successful_attempt: retry_state.attempt,
-          total_attempts: retry_state.attempt,
-          total_time: retry_state.total_time,
-        )
-      end
-
-      def log_retry_exhausted(job, retry_state)
-        return unless @context.logger
-
-        @context.logger.error(
-          "Job retry attempts exhausted",
-          job: job.name,
-          total_attempts: retry_state.attempt - 1,
-          total_time: retry_state.total_time,
-          errors: retry_state.summary[:errors],
-        )
-      end
-
-      def log_fallback_execution(job, fallback_job, error)
-        return unless @context.logger
-
-        @context.logger.warn(
-          "Executing fallback job",
-          job: job.name,
-          fallback_job: fallback_job.name,
-          original_error: error.message,
-        )
-      end
-
-      def log_fallback_failed(job, fallback_job, error)
-        return unless @context.logger
-
-        @context.logger.error(
-          "Fallback job failed",
-          job: job.name,
-          fallback_job: fallback_job.name,
-          error: error.message,
-        )
-      end
-
-      def execute_with_circuit_breaker(job, work)
-        breaker_key = job.circuit_breaker_key
-
-        # Get or create circuit breaker orchestrator for this job
-        orchestrator = @circuit_breakers.get_or_create_orchestrator(
-          breaker_key,
-          **job.circuit_breaker_config.slice(:threshold, :timeout,
-                                             :half_open_calls),
-          job_name: job.name,
-          debug: ENV["FRACTOR_DEBUG"] == "1",
-        )
-
-        # Log circuit state before execution
-        log_circuit_breaker_state(job, orchestrator)
-
-        begin
-          orchestrator.execute_with_breaker(job) do
-            execute_job_with_supervisor(job, work)
-          end
-        rescue Workflow::CircuitOpenError => e
-          log_circuit_breaker_open(job, orchestrator)
-          raise WorkflowExecutionError,
-                "Circuit breaker open for job '#{job.name}': #{e.message}"
-        end
-      end
-
-      def log_circuit_breaker_state(job, breaker)
-        return unless @context.logger
-        return if breaker.closed?
-
-        @context.logger.warn(
-          "Circuit breaker state",
-          job: job.name,
-          state: breaker.state,
-          failure_count: breaker.failure_count,
-          threshold: breaker.threshold,
-        )
+      # Backward compatibility: Access dead letter queue.
+      #
+      # @return [DeadLetterQueue, nil] The DLQ or nil
+      def dead_letter_queue
+        @dead_letter_queue
       end
 
-
-
-
-
-
-
-
-
-          last_failure: breaker.last_failure_time,
-        )
+      # Backward compatibility: Execute a job once without retry.
+      # This is used by ExecutionStrategy classes.
+      #
+      # @param job [Job] The job to execute
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @return [Object] The job output
+      def execute_job_once(job, job_trace = nil)
+        @job_executor.execute_once(job, job_trace)
      end
 
+      # Backward compatibility: Add failed job to dead letter queue.
+      # This is used by ExecutionStrategy classes.
+      #
+      # @param job [Job] The job that failed
+      # @param error [Exception] The error that occurred
+      # @param retry_state [Object, nil] Optional retry state
      def add_to_dead_letter_queue(job, error, retry_state = nil)
-
-
-        # Build job input for DLQ entry
-        job_input = @context.build_job_input(job)
-        work = Work.new(job_input)
-
-        # Add metadata about the failure
-        metadata = {
-          job_name: job.name,
-          worker_class: job.worker_class.name,
-          correlation_id: @context.correlation_id,
-          workflow_name: @workflow.class.workflow_name,
-        }
-
-        # Add retry information if available
-        if retry_state
-          # Handle both RetryState object and Hash from orchestrator
-          if retry_state.is_a?(Hash)
-            # From RetryOrchestrator.state
-            metadata[:retry_attempts] = retry_state[:attempts] - 1
-            metadata[:max_attempts] = retry_state[:max_attempts]
-            metadata[:last_error] = retry_state[:last_error]
-            metadata[:total_retry_time] = retry_state[:total_time]
-            metadata[:all_errors] = retry_state[:all_errors]
-          else
-            # From RetryState object
-            metadata[:retry_attempts] = retry_state.attempt - 1
-            metadata[:total_retry_time] = retry_state.total_time
-            metadata[:all_errors] = retry_state.summary[:errors]
-          end
-        end
-
-        # Add context from workflow
-        context = {
-          workflow_input: @context.workflow_input,
-          completed_jobs: @completed_jobs.to_a,
-          failed_jobs: @failed_jobs.to_a,
-        }
-
-        @dead_letter_queue.add(work, error, context: context,
-                               metadata: metadata)
-
-        log_added_to_dlq(job, error) if @context.logger
-      end
-
-      def log_added_to_dlq(job, error)
-        @context.logger.warn(
-          "Work added to Dead Letter Queue",
-          job: job.name,
-          error: error.class.name,
-          message: error.message,
-          dlq_size: @dead_letter_queue.size,
-        )
-      end
-
-      def build_result(start_time, end_time)
-        # Find the output from the end job
-        output = find_workflow_output
-
-        WorkflowResult.new(
-          workflow_name: workflow.class.workflow_name,
-          output: output,
-          completed_jobs: @completed_jobs.to_a,
-          failed_jobs: @failed_jobs.to_a,
-          execution_time: end_time - start_time,
-          success: @failed_jobs.empty?,
-          trace: @trace,
-          correlation_id: @context.correlation_id,
-        )
+        @job_executor.send(:add_to_dead_letter_queue, job, error, retry_state)
      end
 
-
-
-
-
-
-
-
-        end
-      end
-
-      # Fallback: return output from the first end job that completed
-      workflow.class.end_job_names.each do |end_job_spec|
-        job_name = end_job_spec[:name]
-        if @completed_jobs.include?(job_name)
-          output = @context.job_output(job_name)
-          puts "Using end job '#{job_name}' output: #{output.class}" if ENV["FRACTOR_DEBUG"]
-          return output
-        end
-      end
+      # Backward compatibility: Execute jobs in parallel.
+      # This is used by ExecutionStrategy classes.
+      # Note: Current implementation executes jobs sequentially to avoid Ractor issues.
+      #
+      # @param jobs [Array<Job>] Jobs to execute
+      def execute_jobs_parallel(jobs)
+        puts "Executing #{jobs.size} jobs in parallel: #{jobs.map(&:name).join(', ')}" if ENV["FRACTOR_DEBUG"]
 
-
-
+        # Execute sequentially for now (parallel execution with Ractors has issues)
+        jobs.each { |job| execute_job(job) }
      end
    end
  end
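The removed build_result method above shows the fields that the extracted ResultBuilder presumably assembles into a WorkflowResult. A stand-in sketch of that assembly (the real class receives the workflow, context, and job sets, as the added call in the execute path shows; this version takes already-extracted values and omits the trace field so it stays self-contained):

    # Stand-in result type mirroring the fields used by the removed build_result.
    WorkflowResult = Struct.new(:workflow_name, :output, :completed_jobs, :failed_jobs,
                                :execution_time, :success, :correlation_id, keyword_init: true)

    def build_result(workflow_name:, output:, completed_jobs:, failed_jobs:,
                     correlation_id:, start_time:, end_time:)
      WorkflowResult.new(
        workflow_name: workflow_name,
        output: output,
        completed_jobs: completed_jobs.to_a,
        failed_jobs: failed_jobs.to_a,
        execution_time: end_time - start_time,
        success: failed_jobs.empty?, # a workflow succeeds when no job failed
        correlation_id: correlation_id
      )
    end

    started = Time.now
    result = build_result(workflow_name: "demo", output: { rows: 42 },
                          completed_jobs: ["fetch", "parse"], failed_jobs: [],
                          correlation_id: "req-1234",
                          start_time: started, end_time: Time.now)
    p result.success # => true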