fractor 0.1.4 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop-https---raw-githubusercontent-com-riboseinc-oss-guides-main-ci-rubocop-yml +552 -0
- data/.rubocop.yml +14 -8
- data/.rubocop_todo.yml +284 -43
- data/README.adoc +111 -950
- data/docs/.lycheeignore +16 -0
- data/docs/Gemfile +24 -0
- data/docs/README.md +157 -0
- data/docs/_config.yml +151 -0
- data/docs/_features/error-handling.adoc +1192 -0
- data/docs/_features/index.adoc +80 -0
- data/docs/_features/monitoring.adoc +589 -0
- data/docs/_features/signal-handling.adoc +202 -0
- data/docs/_features/workflows.adoc +1235 -0
- data/docs/_guides/continuous-mode.adoc +736 -0
- data/docs/_guides/cookbook.adoc +1133 -0
- data/docs/_guides/index.adoc +55 -0
- data/docs/_guides/pipeline-mode.adoc +730 -0
- data/docs/_guides/troubleshooting.adoc +358 -0
- data/docs/_pages/architecture.adoc +1390 -0
- data/docs/_pages/core-concepts.adoc +1392 -0
- data/docs/_pages/design-principles.adoc +862 -0
- data/docs/_pages/getting-started.adoc +290 -0
- data/docs/_pages/installation.adoc +143 -0
- data/docs/_reference/api.adoc +1080 -0
- data/docs/_reference/error-reporting.adoc +670 -0
- data/docs/_reference/examples.adoc +181 -0
- data/docs/_reference/index.adoc +96 -0
- data/docs/_reference/troubleshooting.adoc +862 -0
- data/docs/_tutorials/complex-workflows.adoc +1022 -0
- data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
- data/docs/_tutorials/first-application.adoc +384 -0
- data/docs/_tutorials/index.adoc +48 -0
- data/docs/_tutorials/long-running-services.adoc +931 -0
- data/docs/assets/images/favicon-16.png +0 -0
- data/docs/assets/images/favicon-32.png +0 -0
- data/docs/assets/images/favicon-48.png +0 -0
- data/docs/assets/images/favicon.ico +0 -0
- data/docs/assets/images/favicon.png +0 -0
- data/docs/assets/images/favicon.svg +45 -0
- data/docs/assets/images/fractor-icon.svg +49 -0
- data/docs/assets/images/fractor-logo.svg +61 -0
- data/docs/index.adoc +131 -0
- data/docs/lychee.toml +39 -0
- data/examples/api_aggregator/README.adoc +627 -0
- data/examples/api_aggregator/api_aggregator.rb +376 -0
- data/examples/auto_detection/README.adoc +407 -29
- data/examples/auto_detection/auto_detection.rb +9 -9
- data/examples/continuous_chat_common/message_protocol.rb +53 -0
- data/examples/continuous_chat_fractor/README.adoc +217 -0
- data/examples/continuous_chat_fractor/chat_client.rb +303 -0
- data/examples/continuous_chat_fractor/chat_common.rb +83 -0
- data/examples/continuous_chat_fractor/chat_server.rb +167 -0
- data/examples/continuous_chat_fractor/simulate.rb +345 -0
- data/examples/continuous_chat_server/README.adoc +135 -0
- data/examples/continuous_chat_server/chat_client.rb +303 -0
- data/examples/continuous_chat_server/chat_server.rb +359 -0
- data/examples/continuous_chat_server/simulate.rb +343 -0
- data/examples/error_reporting.rb +207 -0
- data/examples/file_processor/README.adoc +170 -0
- data/examples/file_processor/file_processor.rb +615 -0
- data/examples/file_processor/sample_files/invalid.csv +1 -0
- data/examples/file_processor/sample_files/orders.xml +24 -0
- data/examples/file_processor/sample_files/products.json +23 -0
- data/examples/file_processor/sample_files/users.csv +6 -0
- data/examples/hierarchical_hasher/README.adoc +629 -41
- data/examples/hierarchical_hasher/hierarchical_hasher.rb +12 -8
- data/examples/image_processor/README.adoc +610 -0
- data/examples/image_processor/image_processor.rb +349 -0
- data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
- data/examples/image_processor/test_images/sample_1.png +1 -0
- data/examples/image_processor/test_images/sample_10.png +1 -0
- data/examples/image_processor/test_images/sample_2.png +1 -0
- data/examples/image_processor/test_images/sample_3.png +1 -0
- data/examples/image_processor/test_images/sample_4.png +1 -0
- data/examples/image_processor/test_images/sample_5.png +1 -0
- data/examples/image_processor/test_images/sample_6.png +1 -0
- data/examples/image_processor/test_images/sample_7.png +1 -0
- data/examples/image_processor/test_images/sample_8.png +1 -0
- data/examples/image_processor/test_images/sample_9.png +1 -0
- data/examples/log_analyzer/README.adoc +662 -0
- data/examples/log_analyzer/log_analyzer.rb +579 -0
- data/examples/log_analyzer/sample_logs/apache.log +20 -0
- data/examples/log_analyzer/sample_logs/json.log +15 -0
- data/examples/log_analyzer/sample_logs/nginx.log +15 -0
- data/examples/log_analyzer/sample_logs/rails.log +29 -0
- data/examples/multi_work_type/README.adoc +576 -26
- data/examples/multi_work_type/multi_work_type.rb +30 -29
- data/examples/performance_monitoring.rb +120 -0
- data/examples/pipeline_processing/README.adoc +740 -26
- data/examples/pipeline_processing/pipeline_processing.rb +16 -16
- data/examples/priority_work_example.rb +155 -0
- data/examples/producer_subscriber/README.adoc +889 -46
- data/examples/producer_subscriber/producer_subscriber.rb +20 -16
- data/examples/scatter_gather/README.adoc +829 -27
- data/examples/scatter_gather/scatter_gather.rb +29 -28
- data/examples/simple/README.adoc +347 -0
- data/examples/simple/sample.rb +5 -5
- data/examples/specialized_workers/README.adoc +622 -26
- data/examples/specialized_workers/specialized_workers.rb +88 -45
- data/examples/stream_processor/README.adoc +206 -0
- data/examples/stream_processor/stream_processor.rb +284 -0
- data/examples/web_scraper/README.adoc +625 -0
- data/examples/web_scraper/web_scraper.rb +285 -0
- data/examples/workflow/README.adoc +406 -0
- data/examples/workflow/circuit_breaker/README.adoc +360 -0
- data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
- data/examples/workflow/conditional/README.adoc +483 -0
- data/examples/workflow/conditional/conditional_workflow.rb +215 -0
- data/examples/workflow/dead_letter_queue/README.adoc +374 -0
- data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
- data/examples/workflow/fan_out/README.adoc +381 -0
- data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
- data/examples/workflow/retry/README.adoc +248 -0
- data/examples/workflow/retry/retry_workflow.rb +195 -0
- data/examples/workflow/simple_linear/README.adoc +267 -0
- data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
- data/examples/workflow/simplified/README.adoc +329 -0
- data/examples/workflow/simplified/simplified_workflow.rb +222 -0
- data/exe/fractor +10 -0
- data/lib/fractor/cli.rb +288 -0
- data/lib/fractor/configuration.rb +307 -0
- data/lib/fractor/continuous_server.rb +183 -0
- data/lib/fractor/error_formatter.rb +72 -0
- data/lib/fractor/error_report_generator.rb +152 -0
- data/lib/fractor/error_reporter.rb +244 -0
- data/lib/fractor/error_statistics.rb +147 -0
- data/lib/fractor/execution_tracer.rb +162 -0
- data/lib/fractor/logger.rb +230 -0
- data/lib/fractor/main_loop_handler.rb +406 -0
- data/lib/fractor/main_loop_handler3.rb +135 -0
- data/lib/fractor/main_loop_handler4.rb +299 -0
- data/lib/fractor/performance_metrics_collector.rb +181 -0
- data/lib/fractor/performance_monitor.rb +215 -0
- data/lib/fractor/performance_report_generator.rb +202 -0
- data/lib/fractor/priority_work.rb +93 -0
- data/lib/fractor/priority_work_queue.rb +189 -0
- data/lib/fractor/result_aggregator.rb +33 -1
- data/lib/fractor/shutdown_handler.rb +168 -0
- data/lib/fractor/signal_handler.rb +80 -0
- data/lib/fractor/supervisor.rb +430 -144
- data/lib/fractor/supervisor_logger.rb +88 -0
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/work.rb +12 -0
- data/lib/fractor/work_distribution_manager.rb +151 -0
- data/lib/fractor/work_queue.rb +88 -0
- data/lib/fractor/work_result.rb +181 -9
- data/lib/fractor/worker.rb +75 -1
- data/lib/fractor/workflow/builder.rb +210 -0
- data/lib/fractor/workflow/chain_builder.rb +169 -0
- data/lib/fractor/workflow/circuit_breaker.rb +183 -0
- data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
- data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
- data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
- data/lib/fractor/workflow/execution_hooks.rb +39 -0
- data/lib/fractor/workflow/execution_strategy.rb +225 -0
- data/lib/fractor/workflow/execution_trace.rb +134 -0
- data/lib/fractor/workflow/helpers.rb +191 -0
- data/lib/fractor/workflow/job.rb +290 -0
- data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
- data/lib/fractor/workflow/logger.rb +110 -0
- data/lib/fractor/workflow/pre_execution_context.rb +193 -0
- data/lib/fractor/workflow/retry_config.rb +156 -0
- data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
- data/lib/fractor/workflow/retry_strategy.rb +93 -0
- data/lib/fractor/workflow/structured_logger.rb +30 -0
- data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
- data/lib/fractor/workflow/visualizer.rb +211 -0
- data/lib/fractor/workflow/workflow_context.rb +132 -0
- data/lib/fractor/workflow/workflow_executor.rb +669 -0
- data/lib/fractor/workflow/workflow_result.rb +55 -0
- data/lib/fractor/workflow/workflow_validator.rb +295 -0
- data/lib/fractor/workflow.rb +333 -0
- data/lib/fractor/wrapped_ractor.rb +66 -91
- data/lib/fractor/wrapped_ractor3.rb +161 -0
- data/lib/fractor/wrapped_ractor4.rb +242 -0
- data/lib/fractor.rb +93 -3
- metadata +192 -6
- data/tests/sample.rb.bak +0 -309
- data/tests/sample_working.rb.bak +0 -209
|
@@ -0,0 +1,1192 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Error Handling
|
|
4
|
+
nav_order: 4
|
|
5
|
+
---
|
|
6
|
+
== Error Reporting and Analytics
|
|
7
|
+
|
|
8
|
+
== Overview
|
|
9
|
+
|
|
10
|
+
Fractor provides comprehensive error reporting and analytics through the `ErrorReporter` class. This system aggregates errors, tracks statistics, detects trends, and provides actionable insights into application health.
|
|
11
|
+
|
|
12
|
+
== Purpose
|
|
13
|
+
|
|
14
|
+
The ErrorReporter helps you:
|
|
15
|
+
|
|
16
|
+
* **Monitor error patterns** across your application
|
|
17
|
+
* **Identify problematic jobs** with high error rates
|
|
18
|
+
* **Detect trending issues** before they become critical
|
|
19
|
+
* **Track error severity** and categorization
|
|
20
|
+
* **Export metrics** to monitoring systems
|
|
21
|
+
* **Respond to critical errors** in real-time
|
|
22
|
+
|
|
23
|
+
== Basic Usage
|
|
24
|
+
|
|
25
|
+
=== Setup
|
|
26
|
+
|
|
27
|
+
[source,ruby]
|
|
28
|
+
----
|
|
29
|
+
require 'fractor'
|
|
30
|
+
|
|
31
|
+
# Create an error reporter instance
|
|
32
|
+
reporter = Fractor::ErrorReporter.new
|
|
33
|
+
----
|
|
34
|
+
|
|
35
|
+
=== Recording Work Results
|
|
36
|
+
|
|
37
|
+
The ErrorReporter tracks both successes and failures:
|
|
38
|
+
|
|
39
|
+
[source,ruby]
|
|
40
|
+
----
|
|
41
|
+
# Record a successful result
|
|
42
|
+
work_result = Fractor::WorkResult.new(result: "Success")
|
|
43
|
+
reporter.record(work_result)
|
|
44
|
+
|
|
45
|
+
# Record an error result
|
|
46
|
+
work_result = Fractor::WorkResult.new(
|
|
47
|
+
error: StandardError.new("Connection failed"),
|
|
48
|
+
error_code: :connection_failed,
|
|
49
|
+
error_category: :network,
|
|
50
|
+
error_severity: :error
|
|
51
|
+
)
|
|
52
|
+
reporter.record(work_result, job_name: "fetch_data")
|
|
53
|
+
----
|
|
54
|
+
|
|
55
|
+
=== Viewing Statistics
|
|
56
|
+
|
|
57
|
+
[source,ruby]
|
|
58
|
+
----
|
|
59
|
+
# Overall statistics
|
|
60
|
+
puts "Total Errors: #{reporter.total_errors}"
|
|
61
|
+
puts "Total Successes: #{reporter.total_successes}"
|
|
62
|
+
puts "Error Rate: #{reporter.overall_error_rate}%"
|
|
63
|
+
|
|
64
|
+
# Top error categories
|
|
65
|
+
reporter.top_categories.each do |category, count|
|
|
66
|
+
puts "#{category}: #{count} errors"
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# Top error jobs
|
|
70
|
+
reporter.top_jobs.each do |job, count|
|
|
71
|
+
puts "#{job}: #{count} errors"
|
|
72
|
+
end
|
|
73
|
+
----
|
|
74
|
+
|
|
75
|
+
== Error Categorization
|
|
76
|
+
|
|
77
|
+
Fractor automatically categorizes errors based on their type:
|
|
78
|
+
|
|
79
|
+
[cols="1,2,3"]
|
|
80
|
+
|===
|
|
81
|
+
|Category |Error Types |Description
|
|
82
|
+
|
|
83
|
+
|`:validation`
|
|
84
|
+
|`ArgumentError`, `TypeError`
|
|
85
|
+
|Input validation errors
|
|
86
|
+
|
|
87
|
+
|`:timeout`
|
|
88
|
+
|`Timeout::Error`
|
|
89
|
+
|Operation timeout errors
|
|
90
|
+
|
|
91
|
+
|`:network`
|
|
92
|
+
|`SocketError`, `Errno::ECONNREFUSED`, `Errno::ETIMEDOUT`
|
|
93
|
+
|Network-related errors
|
|
94
|
+
|
|
95
|
+
|`:resource`
|
|
96
|
+
|`Errno::ENOMEM`, `Errno::ENOSPC`
|
|
97
|
+
|Resource exhaustion errors
|
|
98
|
+
|
|
99
|
+
|`:system`
|
|
100
|
+
|`SystemCallError`, `SystemStackError`
|
|
101
|
+
|System-level errors
|
|
102
|
+
|
|
103
|
+
|`:business`
|
|
104
|
+
|Custom business logic errors
|
|
105
|
+
|Application-specific errors
|
|
106
|
+
|
|
107
|
+
|`:unknown`
|
|
108
|
+
|Other errors
|
|
109
|
+
|Uncategorized errors
|
|
110
|
+
|===
|
|
111
|
+
|
|
112
|
+
== Error Severity Levels
|
|
113
|
+
|
|
114
|
+
Errors are assigned severity levels:
|
|
115
|
+
|
|
116
|
+
[cols="1,3"]
|
|
117
|
+
|===
|
|
118
|
+
|Severity |Description
|
|
119
|
+
|
|
120
|
+
|`:critical`
|
|
121
|
+
|System-breaking errors requiring immediate attention
|
|
122
|
+
|
|
123
|
+
|`:error`
|
|
124
|
+
|Standard errors that prevent operation completion
|
|
125
|
+
|
|
126
|
+
|`:warning`
|
|
127
|
+
|Non-fatal issues that may need investigation
|
|
128
|
+
|
|
129
|
+
|`:info`
|
|
130
|
+
|Informational messages
|
|
131
|
+
|===
|
|
132
|
+
|
|
133
|
+
== Real-Time Error Handlers
|
|
134
|
+
|
|
135
|
+
Register callbacks to respond to errors as they occur:
|
|
136
|
+
|
|
137
|
+
=== Basic Handler
|
|
138
|
+
|
|
139
|
+
[source,ruby]
|
|
140
|
+
----
|
|
141
|
+
reporter.on_error do |work_result, job_name|
|
|
142
|
+
puts "Error in #{job_name}: #{work_result.error.message}"
|
|
143
|
+
end
|
|
144
|
+
----
|
|
145
|
+
|
|
146
|
+
=== Critical Error Alerts
|
|
147
|
+
|
|
148
|
+
[source,ruby]
|
|
149
|
+
----
|
|
150
|
+
reporter.on_error do |work_result, job_name|
|
|
151
|
+
if work_result.critical?
|
|
152
|
+
# Send alert to operations team
|
|
153
|
+
AlertService.notify(
|
|
154
|
+
severity: "critical",
|
|
155
|
+
job: job_name,
|
|
156
|
+
error: work_result.error.message,
|
|
157
|
+
context: work_result.error_context
|
|
158
|
+
)
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
----
|
|
162
|
+
|
|
163
|
+
=== Multiple Handlers
|
|
164
|
+
|
|
165
|
+
You can register multiple handlers for different purposes:
|
|
166
|
+
|
|
167
|
+
[source,ruby]
|
|
168
|
+
----
|
|
169
|
+
# Handler 1: Log all errors
|
|
170
|
+
reporter.on_error do |work_result, job_name|
|
|
171
|
+
Logger.error("Job #{job_name} failed: #{work_result.error.message}")
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
# Handler 2: Send metrics
|
|
175
|
+
reporter.on_error do |work_result, job_name|
|
|
176
|
+
Metrics.increment("errors.#{work_result.error_category}")
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
# Handler 3: Alert on critical errors
|
|
180
|
+
reporter.on_error do |work_result, job_name|
|
|
181
|
+
AlertService.notify(work_result) if work_result.critical?
|
|
182
|
+
end
|
|
183
|
+
----
|
|
184
|
+
|
|
185
|
+
== Generating Reports
|
|
186
|
+
|
|
187
|
+
=== Formatted Text Report
|
|
188
|
+
|
|
189
|
+
Generate a human-readable report:
|
|
190
|
+
|
|
191
|
+
[source,ruby]
|
|
192
|
+
----
|
|
193
|
+
puts reporter.formatted_report
|
|
194
|
+
----
|
|
195
|
+
|
|
196
|
+
Output example:
|
|
197
|
+
|
|
198
|
+
[source]
|
|
199
|
+
----
|
|
200
|
+
================================================================================
|
|
201
|
+
ERROR REPORT
|
|
202
|
+
================================================================================
|
|
203
|
+
|
|
204
|
+
SUMMARY
|
|
205
|
+
--------------------------------------------------------------------------------
|
|
206
|
+
Uptime: 127.45s
|
|
207
|
+
Total Errors: 15
|
|
208
|
+
Total Successes: 85
|
|
209
|
+
Error Rate: 15.0%
|
|
210
|
+
|
|
211
|
+
Errors by Severity:
|
|
212
|
+
critical : 1
|
|
213
|
+
error : 12
|
|
214
|
+
warning : 2
|
|
215
|
+
|
|
216
|
+
TOP ERROR CATEGORIES
|
|
217
|
+
--------------------------------------------------------------------------------
|
|
218
|
+
network : 8 errors
|
|
219
|
+
validation : 5 errors
|
|
220
|
+
timeout : 2 errors
|
|
221
|
+
|
|
222
|
+
TOP ERROR JOBS
|
|
223
|
+
--------------------------------------------------------------------------------
|
|
224
|
+
fetch_data : 8 errors
|
|
225
|
+
process_data : 5 errors
|
|
226
|
+
validate_input : 2 errors
|
|
227
|
+
|
|
228
|
+
CRITICAL ERRORS
|
|
229
|
+
--------------------------------------------------------------------------------
|
|
230
|
+
Category: system
|
|
231
|
+
Count: 1
|
|
232
|
+
Recent errors:
|
|
233
|
+
- [2025-01-15 10:30:45] SystemStackError: Stack overflow
|
|
234
|
+
|
|
235
|
+
TRENDING ERRORS (Increasing)
|
|
236
|
+
--------------------------------------------------------------------------------
|
|
237
|
+
Category: network
|
|
238
|
+
Total Count: 8
|
|
239
|
+
Error Rate: 0.06/s
|
|
240
|
+
Trend: increasing
|
|
241
|
+
================================================================================
|
|
242
|
+
----
|
|
243
|
+
|
|
244
|
+
=== Programmatic Access
|
|
245
|
+
|
|
246
|
+
[source,ruby]
|
|
247
|
+
----
|
|
248
|
+
report = reporter.report
|
|
249
|
+
|
|
250
|
+
# Access specific sections
|
|
251
|
+
summary = report[:summary]
|
|
252
|
+
puts "Uptime: #{summary[:uptime]}s"
|
|
253
|
+
puts "Error Rate: #{summary[:error_rate]}%"
|
|
254
|
+
|
|
255
|
+
# Critical errors
|
|
256
|
+
report[:critical_errors].each do |error_info|
|
|
257
|
+
puts "Critical in #{error_info[:category]}: #{error_info[:count]} errors"
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
# Trending errors
|
|
261
|
+
report[:trending_errors].each do |trend|
|
|
262
|
+
puts "Trending: #{trend[:category]}"
|
|
263
|
+
end
|
|
264
|
+
----
|
|
265
|
+
|
|
266
|
+
== Exporting Metrics
|
|
267
|
+
|
|
268
|
+
=== Prometheus Format
|
|
269
|
+
|
|
270
|
+
Export metrics for Prometheus monitoring:
|
|
271
|
+
|
|
272
|
+
[source,ruby]
|
|
273
|
+
----
|
|
274
|
+
# Write to file
|
|
275
|
+
File.write("metrics.txt", reporter.to_prometheus)
|
|
276
|
+
|
|
277
|
+
# Or serve via HTTP endpoint
|
|
278
|
+
get '/metrics' do
|
|
279
|
+
content_type 'text/plain'
|
|
280
|
+
reporter.to_prometheus
|
|
281
|
+
end
|
|
282
|
+
----
|
|
283
|
+
|
|
284
|
+
Example output:
|
|
285
|
+
|
|
286
|
+
[source]
|
|
287
|
+
----
|
|
288
|
+
# HELP fractor_errors_total Total number of errors
|
|
289
|
+
# TYPE fractor_errors_total counter
|
|
290
|
+
fractor_errors_total 15
|
|
291
|
+
|
|
292
|
+
# HELP fractor_successes_total Total number of successes
|
|
293
|
+
# TYPE fractor_successes_total counter
|
|
294
|
+
fractor_successes_total 85
|
|
295
|
+
|
|
296
|
+
# HELP fractor_error_rate Error rate percentage
|
|
297
|
+
# TYPE fractor_error_rate gauge
|
|
298
|
+
fractor_error_rate 15.0
|
|
299
|
+
|
|
300
|
+
# HELP fractor_errors_by_severity Errors by severity level
|
|
301
|
+
# TYPE fractor_errors_by_severity gauge
|
|
302
|
+
fractor_errors_by_severity{severity="critical"} 1
|
|
303
|
+
fractor_errors_by_severity{severity="error"} 12
|
|
304
|
+
fractor_errors_by_severity{severity="warning"} 2
|
|
305
|
+
|
|
306
|
+
# HELP fractor_errors_by_category Errors by category
|
|
307
|
+
# TYPE fractor_errors_by_category gauge
|
|
308
|
+
fractor_errors_by_category{category="network"} 8
|
|
309
|
+
fractor_errors_by_category{category="validation"} 5
|
|
310
|
+
fractor_errors_by_category{category="timeout"} 2
|
|
311
|
+
----
|
|
312
|
+
|
|
313
|
+
=== JSON Format
|
|
314
|
+
|
|
315
|
+
Export as JSON for programmatic consumption:
|
|
316
|
+
|
|
317
|
+
[source,ruby]
|
|
318
|
+
----
|
|
319
|
+
# Write to file
|
|
320
|
+
File.write("error_report.json", reporter.to_json)
|
|
321
|
+
|
|
322
|
+
# Or serve via API
|
|
323
|
+
get '/api/errors' do
|
|
324
|
+
content_type 'application/json'
|
|
325
|
+
reporter.to_json
|
|
326
|
+
end
|
|
327
|
+
----
|
|
328
|
+
|
|
329
|
+
== Job-Specific Statistics
|
|
330
|
+
|
|
331
|
+
Get detailed statistics for a specific job:
|
|
332
|
+
|
|
333
|
+
[source,ruby]
|
|
334
|
+
----
|
|
335
|
+
stats = reporter.job_stats("fetch_data")
|
|
336
|
+
|
|
337
|
+
puts "Job: fetch_data"
|
|
338
|
+
puts "Total Errors: #{stats[:total_count]}"
|
|
339
|
+
puts "Error Rate: #{stats[:error_rate]}/s"
|
|
340
|
+
puts "Most Common Error: #{stats[:most_common_code]}"
|
|
341
|
+
puts "Highest Severity: #{stats[:highest_severity]}"
|
|
342
|
+
puts "Trend: #{stats[:trending]}"
|
|
343
|
+
----
|
|
344
|
+
|
|
345
|
+
== Category-Specific Statistics
|
|
346
|
+
|
|
347
|
+
Get detailed statistics for an error category:
|
|
348
|
+
|
|
349
|
+
[source,ruby]
|
|
350
|
+
----
|
|
351
|
+
stats = reporter.category_stats(:network)
|
|
352
|
+
|
|
353
|
+
puts "Category: network"
|
|
354
|
+
puts "Total Count: #{stats[:total_count]}"
|
|
355
|
+
puts "Error Rate: #{stats[:error_rate]}/s"
|
|
356
|
+
puts "By Severity: #{stats[:by_severity]}"
|
|
357
|
+
puts "By Code: #{stats[:by_code]}"
|
|
358
|
+
puts "Trending: #{stats[:trending]}"
|
|
359
|
+
----
|
|
360
|
+
|
|
361
|
+
== Detecting Trends
|
|
362
|
+
|
|
363
|
+
The ErrorReporter automatically detects increasing error rates:
|
|
364
|
+
|
|
365
|
+
[source,ruby]
|
|
366
|
+
----
|
|
367
|
+
# Get all trending errors
|
|
368
|
+
trending = reporter.trending_errors
|
|
369
|
+
|
|
370
|
+
trending.each do |trend|
|
|
371
|
+
category = trend[:category]
|
|
372
|
+
stats = trend[:stats]
|
|
373
|
+
|
|
374
|
+
puts "⚠️ #{category} errors are increasing!"
|
|
375
|
+
puts " Count: #{stats[:total_count]}"
|
|
376
|
+
puts " Rate: #{stats[:error_rate]}/s"
|
|
377
|
+
end
|
|
378
|
+
----
|
|
379
|
+
|
|
380
|
+
== Integration Examples
|
|
381
|
+
|
|
382
|
+
=== With Supervisor
|
|
383
|
+
|
|
384
|
+
[source,ruby]
|
|
385
|
+
----
|
|
386
|
+
supervisor = Fractor::Supervisor.new(
|
|
387
|
+
worker_pools: [{ worker_class: MyWorker, count: 4 }]
|
|
388
|
+
)
|
|
389
|
+
|
|
390
|
+
reporter = Fractor::ErrorReporter.new
|
|
391
|
+
|
|
392
|
+
# Record results as they complete
|
|
393
|
+
supervisor.on_result do |result|
|
|
394
|
+
reporter.record(result)
|
|
395
|
+
end
|
|
396
|
+
----
|
|
397
|
+
|
|
398
|
+
=== With Workflows
|
|
399
|
+
|
|
400
|
+
[source,ruby]
|
|
401
|
+
----
|
|
402
|
+
class MyWorkflow < Fractor::Workflow
|
|
403
|
+
workflow "monitored-workflow" do
|
|
404
|
+
job "process" do
|
|
405
|
+
runs_with ProcessWorker
|
|
406
|
+
|
|
407
|
+
on_error do |error, context|
|
|
408
|
+
# Report error
|
|
409
|
+
reporter.record(
|
|
410
|
+
Fractor::WorkResult.new(
|
|
411
|
+
error: error,
|
|
412
|
+
error_context: context
|
|
413
|
+
),
|
|
414
|
+
job_name: "process"
|
|
415
|
+
)
|
|
416
|
+
end
|
|
417
|
+
end
|
|
418
|
+
end
|
|
419
|
+
end
|
|
420
|
+
----
|
|
421
|
+
|
|
422
|
+
=== Periodic Reporting
|
|
423
|
+
|
|
424
|
+
[source,ruby]
|
|
425
|
+
----
|
|
426
|
+
# Report every 5 minutes
|
|
427
|
+
Thread.new do
|
|
428
|
+
loop do
|
|
429
|
+
sleep 300 # 5 minutes
|
|
430
|
+
|
|
431
|
+
# Log summary
|
|
432
|
+
Logger.info("Error Summary: #{reporter.overall_error_rate}% error rate")
|
|
433
|
+
|
|
434
|
+
# Alert on high error rates
|
|
435
|
+
if reporter.overall_error_rate > 10.0
|
|
436
|
+
AlertService.notify("High error rate detected!")
|
|
437
|
+
end
|
|
438
|
+
|
|
439
|
+
# Check for trending errors
|
|
440
|
+
reporter.trending_errors.each do |trend|
|
|
441
|
+
AlertService.notify("Trending: #{trend[:category]}")
|
|
442
|
+
end
|
|
443
|
+
end
|
|
444
|
+
end
|
|
445
|
+
----
|
|
446
|
+
|
|
447
|
+
== Production Best Practices
|
|
448
|
+
|
|
449
|
+
=== 1. Set Up Monitoring
|
|
450
|
+
|
|
451
|
+
[source,ruby]
|
|
452
|
+
----
|
|
453
|
+
# Configure Prometheus scraping
|
|
454
|
+
# In config/prometheus.yml:
|
|
455
|
+
# scrape_configs:
|
|
456
|
+
# - job_name: 'fractor'
|
|
457
|
+
# static_configs:
|
|
458
|
+
# - targets: ['localhost:4567'] # the app serving /metrics (Sinatra's default port), not Prometheus itself on 9090
|
|
459
|
+
|
|
460
|
+
# Serve metrics endpoint
|
|
461
|
+
require 'sinatra'
|
|
462
|
+
|
|
463
|
+
get '/metrics' do
|
|
464
|
+
content_type 'text/plain'
|
|
465
|
+
$error_reporter.to_prometheus
|
|
466
|
+
end
|
|
467
|
+
----
|
|
468
|
+
|
|
469
|
+
=== 2. Configure Alerts
|
|
470
|
+
|
|
471
|
+
[source,ruby]
|
|
472
|
+
----
|
|
473
|
+
reporter.on_error do |work_result, job_name|
|
|
474
|
+
case work_result.error_severity
|
|
475
|
+
when :critical
|
|
476
|
+
PagerDuty.alert(work_result, job_name)
|
|
477
|
+
when :error
|
|
478
|
+
Slack.notify(work_result, job_name) if should_notify?(work_result)
|
|
479
|
+
when :warning
|
|
480
|
+
Logger.warn("#{job_name}: #{work_result.error.message}")
|
|
481
|
+
end
|
|
482
|
+
end
|
|
483
|
+
|
|
484
|
+
def should_notify?(work_result)
|
|
485
|
+
# Only notify for non-retriable errors, or once the failure count exceeds 3
|
|
486
|
+
!work_result.retriable? || failure_count(work_result) > 3
|
|
487
|
+
end
|
|
488
|
+
----
|
|
489
|
+
|
|
490
|
+
=== 3. Regular Health Checks
|
|
491
|
+
|
|
492
|
+
[source,ruby]
|
|
493
|
+
----
|
|
494
|
+
# Run health checks every minute
|
|
495
|
+
Thread.new do
|
|
496
|
+
loop do
|
|
497
|
+
sleep 60
|
|
498
|
+
|
|
499
|
+
# Check critical errors
|
|
500
|
+
critical = reporter.critical_errors
|
|
501
|
+
if critical.any?
|
|
502
|
+
PagerDuty.alert("Critical errors detected: #{critical.size}")
|
|
503
|
+
end
|
|
504
|
+
|
|
505
|
+
# Check error rate
|
|
506
|
+
if reporter.overall_error_rate > 25.0
|
|
507
|
+
Slack.notify("High error rate: #{reporter.overall_error_rate}%")
|
|
508
|
+
end
|
|
509
|
+
|
|
510
|
+
# Check trending
|
|
511
|
+
if reporter.trending_errors.any?
|
|
512
|
+
Slack.notify("Trending errors detected")
|
|
513
|
+
end
|
|
514
|
+
end
|
|
515
|
+
end
|
|
516
|
+
----
|
|
517
|
+
|
|
518
|
+
=== 4. Data Retention
|
|
519
|
+
|
|
520
|
+
[source,ruby]
|
|
521
|
+
----
|
|
522
|
+
# Reset statistics daily to prevent unbounded memory growth
|
|
523
|
+
Thread.new do
|
|
524
|
+
loop do
|
|
525
|
+
sleep 86400 # 24 hours
|
|
526
|
+
|
|
527
|
+
# Archive current stats (Date.today needs `require 'date'`)
|
|
528
|
+
File.write(
|
|
529
|
+
"error_report_#{Date.today}.json",
|
|
530
|
+
reporter.to_json
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
# Reset for new day
|
|
534
|
+
reporter.reset
|
|
535
|
+
Logger.info("Error reporter statistics reset")
|
|
536
|
+
end
|
|
537
|
+
end
|
|
538
|
+
----
|
|
539
|
+
|
|
540
|
+
== Advanced Features
|
|
541
|
+
|
|
542
|
+
=== Custom Error Categorization
|
|
543
|
+
|
|
544
|
+
Override the default categorization:
|
|
545
|
+
|
|
546
|
+
[source,ruby]
|
|
547
|
+
----
|
|
548
|
+
class MyCustomError < StandardError; end
|
|
549
|
+
|
|
550
|
+
# In your worker
|
|
551
|
+
def process(work)
|
|
552
|
+
raise MyCustomError, "Custom error"
|
|
553
|
+
rescue MyCustomError => e
|
|
554
|
+
Fractor::WorkResult.new(
|
|
555
|
+
error: e,
|
|
556
|
+
error_category: :business, # Custom category
|
|
557
|
+
error_code: :custom_failure,
|
|
558
|
+
error_severity: :error,
|
|
559
|
+
work: work
|
|
560
|
+
)
|
|
561
|
+
end
|
|
562
|
+
----
|
|
563
|
+
|
|
564
|
+
=== Error Context Enrichment
|
|
565
|
+
|
|
566
|
+
Add contextual information to errors:
|
|
567
|
+
|
|
568
|
+
[source,ruby]
|
|
569
|
+
----
|
|
570
|
+
def process(work)
|
|
571
|
+
start_time = Time.now
|
|
572
|
+
# ... processing ...
|
|
573
|
+
rescue => e
|
|
574
|
+
Fractor::WorkResult.new(
|
|
575
|
+
error: e,
|
|
576
|
+
error_context: {
|
|
577
|
+
duration: Time.now - start_time,
|
|
578
|
+
input_size: work.input.size,
|
|
579
|
+
memory_used: get_memory_usage,
|
|
580
|
+
retry_count: work.retry_count,
|
|
581
|
+
worker_id: Thread.current.object_id
|
|
582
|
+
},
|
|
583
|
+
work: work
|
|
584
|
+
)
|
|
585
|
+
end
|
|
586
|
+
----
|
|
587
|
+
|
|
588
|
+
=== Filtering and Analysis
|
|
589
|
+
|
|
590
|
+
[source,ruby]
|
|
591
|
+
----
|
|
592
|
+
# Get errors by specific criteria
|
|
593
|
+
report = reporter.report
|
|
594
|
+
|
|
595
|
+
# High-severity errors
|
|
596
|
+
high_severity = report[:category_breakdown].select do |category, stats|
|
|
597
|
+
stats[:highest_severity] == :critical ||
|
|
598
|
+
stats[:highest_severity] == :error
|
|
599
|
+
end
|
|
600
|
+
|
|
601
|
+
# Categories with high error rates
|
|
602
|
+
high_rate = report[:category_breakdown].select do |category, stats|
|
|
603
|
+
stats[:error_rate] > 1.0 # More than 1 error per second
|
|
604
|
+
end
|
|
605
|
+
|
|
606
|
+
# Recent spikes
|
|
607
|
+
recent_spikes = report[:trending_errors].select do |trend|
|
|
608
|
+
trend[:stats][:trending] == "increasing"
|
|
609
|
+
end
|
|
610
|
+
----
|
|
611
|
+
|
|
612
|
+
== Orchestrators
|
|
613
|
+
|
|
614
|
+
Fractor provides two powerful orchestrator patterns for resilient error handling: retry logic with configurable backoff strategies, and the circuit breaker pattern for preventing cascading failures.
|
|
615
|
+
|
|
616
|
+
=== Retry Orchestrator
|
|
617
|
+
|
|
618
|
+
The `RetryOrchestrator` manages retry logic with configurable backoff strategies. It tracks retry attempts, calculates delays, and provides detailed state information.
|
|
619
|
+
|
|
620
|
+
==== Purpose
|
|
621
|
+
|
|
622
|
+
The RetryOrchestrator helps you:
|
|
623
|
+
|
|
624
|
+
* **Handle transient failures** with automatic retries
|
|
625
|
+
* **Configure backoff strategies** (exponential, linear, constant)
|
|
626
|
+
* **Track all retry attempts** for debugging and analysis
|
|
627
|
+
* **Control retry behavior** with max attempts and retryable error types
|
|
628
|
+
|
|
629
|
+
==== Basic Usage
|
|
630
|
+
|
|
631
|
+
[source,ruby]
|
|
632
|
+
----
|
|
633
|
+
require 'fractor'
|
|
634
|
+
|
|
635
|
+
# Create a retry configuration with exponential backoff
|
|
636
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
637
|
+
backoff: :exponential,
|
|
638
|
+
base_delay: 1, # 1 second base
|
|
639
|
+
max_delay: 60, # max 60 seconds
|
|
640
|
+
multiplier: 2, # double each time
|
|
641
|
+
max_attempts: 5,
|
|
642
|
+
retryable_errors: [Timeout::Error, Errno::ECONNREFUSED]
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
# Create orchestrator
|
|
646
|
+
orchestrator = Fractor::Workflow::RetryOrchestrator.new(
|
|
647
|
+
config,
|
|
648
|
+
debug: true
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
# Execute with retry logic
|
|
652
|
+
result = orchestrator.execute_with_retry(job) do |job|
|
|
653
|
+
# Job execution logic
|
|
654
|
+
execute_job(job)
|
|
655
|
+
end
|
|
656
|
+
----
|
|
657
|
+
|
|
658
|
+
==== Retry Strategies
|
|
659
|
+
|
|
660
|
+
===== Exponential Backoff (Default)
|
|
661
|
+
|
|
662
|
+
Exponential backoff increases delay between retries exponentially:
|
|
663
|
+
|
|
664
|
+
[source,ruby]
|
|
665
|
+
----
|
|
666
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
667
|
+
backoff: :exponential,
|
|
668
|
+
base_delay: 1, # Start at 1 second
|
|
669
|
+
max_delay: 60, # Cap at 60 seconds
|
|
670
|
+
multiplier: 2 # Double each retry
|
|
671
|
+
)
|
|
672
|
+
# Delays: 1s, 2s, 4s, 8s, 16s, 32s, 60s, 60s, ...
|
|
673
|
+
----
|
|
674
|
+
|
|
675
|
+
===== Linear Backoff
|
|
676
|
+
|
|
677
|
+
Linear backoff increases delay by a fixed amount each retry:
|
|
678
|
+
|
|
679
|
+
[source,ruby]
|
|
680
|
+
----
|
|
681
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
682
|
+
backoff: :linear,
|
|
683
|
+
base_delay: 5, # Start at 5 seconds
|
|
684
|
+
increment: 5 # Add 5 seconds each retry
|
|
685
|
+
)
|
|
686
|
+
# Delays: 5s, 10s, 15s, 20s, 25s, ...
|
|
687
|
+
----
|
|
688
|
+
|
|
689
|
+
===== Constant Delay
|
|
690
|
+
|
|
691
|
+
Constant delay uses the same delay for all retries:
|
|
692
|
+
|
|
693
|
+
[source,ruby]
|
|
694
|
+
----
|
|
695
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
696
|
+
backoff: :constant,
|
|
697
|
+
delay: 10 # Always wait 10 seconds
|
|
698
|
+
)
|
|
699
|
+
# Delays: 10s, 10s, 10s, 10s, ...
|
|
700
|
+
----
|
|
701
|
+
|
|
702
|
+
===== Custom Backoff Strategy
|
|
703
|
+
|
|
704
|
+
Provide your own backoff calculation:
|
|
705
|
+
|
|
706
|
+
[source,ruby]
|
|
707
|
+
----
|
|
708
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
709
|
+
backoff: :custom,
|
|
710
|
+
calculator: ->(attempt) { attempt ** 3 } # Cubic backoff
|
|
711
|
+
)
|
|
712
|
+
# Delays: 1s, 8s, 27s, 64s, 125s, ...
|
|
713
|
+
----
|
|
714
|
+
|
|
715
|
+
==== Controlling Retryable Errors
|
|
716
|
+
|
|
717
|
+
By default, all `StandardError` exceptions are retried. Customize which errors are retryable:
|
|
718
|
+
|
|
719
|
+
[source,ruby]
|
|
720
|
+
----
|
|
721
|
+
# Only retry specific errors
|
|
722
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
723
|
+
retryable_errors: [Timeout::Error, Errno::ECONNREFUSED, Errno::ETIMEDOUT]
|
|
724
|
+
)
|
|
725
|
+
|
|
726
|
+
# Use a proc for complex logic
|
|
727
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
728
|
+
retryable_errors: ->(error) {
|
|
729
|
+
# Retry on network errors with specific messages
|
|
730
|
+
error.is_a?(Errno::ECONNREFUSED) && error.message.include?("temporary")
|
|
731
|
+
}
|
|
732
|
+
)
|
|
733
|
+
|
|
734
|
+
# Don't retry on specific errors
|
|
735
|
+
config = Fractor::Workflow::RetryConfig.from_options(
|
|
736
|
+
non_retryable_errors: [ArgumentError, ValidationError]
|
|
737
|
+
)
|
|
738
|
+
----
|
|
739
|
+
|
|
740
|
+
==== Accessing Retry State
|
|
741
|
+
|
|
742
|
+
Get detailed information about retry attempts:
|
|
743
|
+
|
|
744
|
+
[source,ruby]
|
|
745
|
+
----
|
|
746
|
+
state = orchestrator.state
|
|
747
|
+
# => {
|
|
748
|
+
# attempts: 3,
|
|
749
|
+
# max_attempts: 5,
|
|
750
|
+
# last_error: "Timeout::Error",
|
|
751
|
+
# exhausted: false,
|
|
752
|
+
# all_errors: [
|
|
753
|
+
# { attempt: 1, error_class: "Timeout::Error", error_message: "...", timestamp: ... },
|
|
754
|
+
# { attempt: 2, error_class: "Timeout::Error", error_message: "...", timestamp: ... }
|
|
755
|
+
# ],
|
|
756
|
+
# total_time: 12.5
|
|
757
|
+
# }
|
|
758
|
+
|
|
759
|
+
puts "Attempt #{state[:attempts]} of #{state[:max_attempts]}"
|
|
760
|
+
puts "Last error: #{state[:last_error]}"
|
|
761
|
+
puts "All errors: #{state[:all_errors].inspect}"
|
|
762
|
+
puts "Total time: #{state[:total_time]}s"
|
|
763
|
+
----
|
|
764
|
+
|
|
765
|
+
==== Integration with Workflows
|
|
766
|
+
|
|
767
|
+
[source,ruby]
|
|
768
|
+
----
|
|
769
|
+
class APIWorkflow < Fractor::Workflow
|
|
770
|
+
workflow "api-workflow" do
|
|
771
|
+
job "fetch_data" do
|
|
772
|
+
runs_with APIWorker
|
|
773
|
+
|
|
774
|
+
# Enable retry with exponential backoff
|
|
775
|
+
retry_config(
|
|
776
|
+
backoff: :exponential,
|
|
777
|
+
base_delay: 1,
|
|
778
|
+
max_delay: 30,
|
|
779
|
+
multiplier: 2,
|
|
780
|
+
max_attempts: 3
|
|
781
|
+
)
|
|
782
|
+
retry_on [Timeout::Error, Errno::ECONNREFUSED]
|
|
783
|
+
end
|
|
784
|
+
end
|
|
785
|
+
end
|
|
786
|
+
----
|
|
787
|
+
|
|
788
|
+
=== Circuit Breaker Orchestrator
|
|
789
|
+
|
|
790
|
+
The `CircuitBreakerOrchestrator` implements the circuit breaker pattern to prevent cascading failures. It automatically opens the circuit when the failure count reaches a threshold, blocking requests to failing services.
|
|
791
|
+
|
|
792
|
+
==== Purpose
|
|
793
|
+
|
|
794
|
+
The CircuitBreakerOrchestrator helps you:
|
|
795
|
+
|
|
796
|
+
* **Prevent cascading failures** by stopping requests to failing services
|
|
797
|
+
* **Enable automatic recovery** by testing service health periodically
|
|
798
|
+
* **Protect downstream services** from overload
|
|
799
|
+
* **Maintain system responsiveness** during partial outages
|
|
800
|
+
|
|
801
|
+
==== How It Works
|
|
802
|
+
|
|
803
|
+
The circuit breaker has three states:
|
|
804
|
+
|
|
805
|
+
[cols="1,2"]
|
|
806
|
+
|===
|
|
807
|
+
|State |Description
|
|
808
|
+
|
|
809
|
+
|`:closed` (default)
|
|
810
|
+
|Normal operation - requests pass through to the service
|
|
811
|
+
|
|
812
|
+
|`:open`
|
|
813
|
+
|Circuit is open - requests are immediately rejected without calling the service
|
|
814
|
+
|
|
815
|
+
|`:half_open`
|
|
816
|
+
|Testing recovery - limited requests are allowed to test if the service has recovered
|
|
817
|
+
|===
|
|
818
|
+
|
|
819
|
+
[source]
|
|
820
|
+
----
|
|
821
|
+
CLOSED ──(threshold failures)──> OPEN ──(timeout)──> HALF_OPEN ──(success)──> CLOSED
|
|
822
|
+
                                  ^                      |
|
|
823
|
+
                                  |                      v
|
|
824
|
+
                                  └──────(failure)───────┘
|
|
825
|
+
----
|
|
826
|
+
|
|
827
|
+
==== Basic Usage
|
|
828
|
+
|
|
829
|
+
[source,ruby]
|
|
830
|
+
----
|
|
831
|
+
require 'fractor'
|
|
832
|
+
|
|
833
|
+
# Create a circuit breaker orchestrator
|
|
834
|
+
breaker = Fractor::Workflow::CircuitBreakerOrchestrator.new(
|
|
835
|
+
threshold: 5, # Open after 5 failures
|
|
836
|
+
timeout: 60, # Try recovery after 60 seconds
|
|
837
|
+
half_open_calls: 3, # Allow 3 test calls when half-open
|
|
838
|
+
job_name: "api_call",
|
|
839
|
+
debug: true
|
|
840
|
+
)
|
|
841
|
+
|
|
842
|
+
# Execute with circuit breaker protection
|
|
843
|
+
begin
|
|
844
|
+
result = breaker.execute_with_breaker(job) do
|
|
845
|
+
# Call external service
|
|
846
|
+
ExternalAPI.call
|
|
847
|
+
end
|
|
848
|
+
rescue Fractor::Workflow::CircuitOpenError => e
|
|
849
|
+
# Circuit is open - request rejected
|
|
850
|
+
logger.warn("Circuit breaker open: #{e.message}")
|
|
851
|
+
# Return cached result or fallback
|
|
852
|
+
cached_response
|
|
853
|
+
end
|
|
854
|
+
----
|
|
855
|
+
|
|
856
|
+
==== Configuration Options
|
|
857
|
+
|
|
858
|
+
[cols="1,1,3"]
|
|
859
|
+
|===
|
|
860
|
+
|Parameter |Default |Description
|
|
861
|
+
|
|
862
|
+
|`threshold`
|
|
863
|
+
|5
|
|
864
|
+
|Number of failures before opening circuit
|
|
865
|
+
|
|
866
|
+
|`timeout`
|
|
867
|
+
|60
|
|
868
|
+
|Seconds to wait before attempting recovery (moving to half-open)
|
|
869
|
+
|
|
870
|
+
|`half_open_calls`
|
|
871
|
+
|3
|
|
872
|
+
|Number of successful calls needed to close circuit when half-open
|
|
873
|
+
|
|
874
|
+
|`job_name`
|
|
875
|
+
|nil
|
|
876
|
+
|Optional job name for logging/debugging
|
|
877
|
+
|
|
878
|
+
|`debug`
|
|
879
|
+
|false
|
|
880
|
+
|Enable debug logging
|
|
881
|
+
|===
|
|
882
|
+
|
|
883
|
+
==== Monitoring Circuit State
|
|
884
|
+
|
|
885
|
+
[source,ruby]
|
|
886
|
+
----
|
|
887
|
+
# Check current state
|
|
888
|
+
puts breaker.state # => :closed, :open, or :half_open
|
|
889
|
+
puts breaker.open? # => true if open
|
|
890
|
+
puts breaker.closed? # => true if closed
|
|
891
|
+
puts breaker.half_open? # => true if half-open
|
|
892
|
+
|
|
893
|
+
# Get detailed statistics
|
|
894
|
+
stats = breaker.stats
|
|
895
|
+
# => {
|
|
896
|
+
# state: :closed,
|
|
897
|
+
# failure_count: 2,
|
|
898
|
+
# threshold: 5,
|
|
899
|
+
# last_failure_time: nil,
|
|
900
|
+
# execution_count: 100,
|
|
901
|
+
# success_count: 98,
|
|
902
|
+
# blocked_count: 0
|
|
903
|
+
# }
|
|
904
|
+
|
|
905
|
+
# Get human-readable description
|
|
906
|
+
puts breaker.state_description
|
|
907
|
+
# => "CLOSED (normal operation)"
|
|
908
|
+
# => "OPEN (blocking requests, 5/5 failures)"
|
|
909
|
+
# => "HALF_OPEN (testing recovery, 2/3 successes)"
|
|
910
|
+
----
|
|
911
|
+
|
|
912
|
+
==== Manual Control
|
|
913
|
+
|
|
914
|
+
[source,ruby]
|
|
915
|
+
----
|
|
916
|
+
# Manually open circuit (emergency/maintenance)
|
|
917
|
+
breaker.open_circuit!
|
|
918
|
+
|
|
919
|
+
# Manually close circuit (forced recovery)
|
|
920
|
+
breaker.close_circuit!
|
|
921
|
+
|
|
922
|
+
# Reset all statistics
|
|
923
|
+
breaker.reset!
|
|
924
|
+
----
|
|
925
|
+
|
|
926
|
+
==== Bypassing the Circuit Breaker
|
|
927
|
+
|
|
928
|
+
Execute a call even when the circuit is open:
|
|
929
|
+
|
|
930
|
+
[source,ruby]
|
|
931
|
+
----
|
|
932
|
+
# Execute regardless of circuit state
|
|
933
|
+
result = breaker.execute_bypassing_breaker(job) do
|
|
934
|
+
# This will execute even if circuit is open
|
|
935
|
+
# Still tracks results for circuit breaker state
|
|
936
|
+
ExternalAPI.call
|
|
937
|
+
end
|
|
938
|
+
----
|
|
939
|
+
|
|
940
|
+
==== Integration with Workflows
|
|
941
|
+
|
|
942
|
+
[source,ruby]
|
|
943
|
+
----
|
|
944
|
+
class ExternalAPIWorkflow < Fractor::Workflow
|
|
945
|
+
workflow "external-api-workflow" do
|
|
946
|
+
job "fetch_from_api" do
|
|
947
|
+
runs_with ExternalAPIWorker
|
|
948
|
+
|
|
949
|
+
# Enable circuit breaker
|
|
950
|
+
circuit_breaker(
|
|
951
|
+
key: "external_api",
|
|
952
|
+
threshold: 5,
|
|
953
|
+
timeout: 60,
|
|
954
|
+
half_open_calls: 3
|
|
955
|
+
)
|
|
956
|
+
|
|
957
|
+
# Optional: fallback job when circuit is open
|
|
958
|
+
fallback_to "use_cache"
|
|
959
|
+
end
|
|
960
|
+
|
|
961
|
+
job "use_cache" do
|
|
962
|
+
runs_with CacheWorker
|
|
963
|
+
inputs_from_job "fetch_from_api"
|
|
964
|
+
end
|
|
965
|
+
end
|
|
966
|
+
end
|
|
967
|
+
----
|
|
968
|
+
|
|
969
|
+
==== Shared Circuit Breakers
|
|
970
|
+
|
|
971
|
+
Multiple jobs can share a circuit breaker to protect a single service:
|
|
972
|
+
|
|
973
|
+
[source,ruby]
|
|
974
|
+
----
|
|
975
|
+
class MultiStepAPIWorkflow < Fractor::Workflow
|
|
976
|
+
workflow "multi-step-api" do
|
|
977
|
+
# All three jobs share the same circuit breaker
|
|
978
|
+
job "fetch_users" do
|
|
979
|
+
runs_with UsersAPIWorker
|
|
980
|
+
circuit_breaker(key: "shared_api", threshold: 10)
|
|
981
|
+
end
|
|
982
|
+
|
|
983
|
+
job "fetch_products" do
|
|
984
|
+
runs_with ProductsAPIWorker
|
|
985
|
+
circuit_breaker(key: "shared_api", threshold: 10)
|
|
986
|
+
end
|
|
987
|
+
|
|
988
|
+
job "fetch_orders" do
|
|
989
|
+
runs_with OrdersAPIWorker
|
|
990
|
+
circuit_breaker(key: "shared_api", threshold: 10)
|
|
991
|
+
end
|
|
992
|
+
end
|
|
993
|
+
end
|
|
994
|
+
----
|
|
995
|
+
|
|
996
|
+
All failures across all three jobs count toward the shared threshold. When the circuit opens, all three jobs are blocked.
|
|
997
|
+
|
|
998
|
+
=== Combining Retry and Circuit Breaker
|
|
999
|
+
|
|
1000
|
+
For maximum resilience, combine both patterns:
|
|
1001
|
+
|
|
1002
|
+
[source,ruby]
|
|
1003
|
+
----
|
|
1004
|
+
class ResilientWorkflow < Fractor::Workflow
|
|
1005
|
+
workflow "resilient-workflow" do
|
|
1006
|
+
job "call_external_service" do
|
|
1007
|
+
runs_with ExternalServiceWorker
|
|
1008
|
+
|
|
1009
|
+
# Configure retry for transient failures
|
|
1010
|
+
retry_config(
|
|
1011
|
+
backoff: :exponential,
|
|
1012
|
+
base_delay: 1,
|
|
1013
|
+
max_delay: 10,
|
|
1014
|
+
max_attempts: 3
|
|
1015
|
+
)
|
|
1016
|
+
|
|
1017
|
+
# Configure circuit breaker for persistent failures
|
|
1018
|
+
circuit_breaker(
|
|
1019
|
+
threshold: 5,
|
|
1020
|
+
timeout: 60,
|
|
1021
|
+
half_open_calls: 2
|
|
1022
|
+
)
|
|
1023
|
+
|
|
1024
|
+
# Fallback when all retries and circuit breaker fail
|
|
1025
|
+
fallback_to "use_fallback"
|
|
1026
|
+
end
|
|
1027
|
+
|
|
1028
|
+
job "use_fallback" do
|
|
1029
|
+
runs_with FallbackWorker
|
|
1030
|
+
end
|
|
1031
|
+
end
|
|
1032
|
+
end
|
|
1033
|
+
----
|
|
1034
|
+
|
|
1035
|
+
**Execution flow:**
|
|
1036
|
+
|
|
1037
|
+
. Retry attempts 1-3 with exponential backoff (1s, then 2s delay between attempts)
|
|
1038
|
+
. If all retries fail, increment circuit breaker failure count
|
|
1039
|
+
. After 5 circuit breaker failures, circuit opens
|
|
1040
|
+
. Subsequent calls immediately fail with `CircuitOpenError`
|
|
1041
|
+
. Fallback job executes
|
|
1042
|
+
. After 60 seconds, circuit moves to half-open
|
|
1043
|
+
. If 2 calls succeed, circuit closes and normal operation resumes
|
|
1044
|
+
|
|
1045
|
+
=== Dead Letter Queue Integration
|
|
1046
|
+
|
|
1047
|
+
Both orchestrators integrate with the Dead Letter Queue (DLQ) for failed work:
|
|
1048
|
+
|
|
1049
|
+
[source,ruby]
|
|
1050
|
+
----
|
|
1051
|
+
class DLQWorkflow < Fractor::Workflow
|
|
1052
|
+
workflow "dlq-workflow" do
|
|
1053
|
+
# Configure dead letter queue
|
|
1054
|
+
configure_dead_letter_queue(
|
|
1055
|
+
max_size: 1000,
|
|
1056
|
+
on_add: ->(entry) {
|
|
1057
|
+
# Notify team of failed work
|
|
1058
|
+
AlertService.notify("Work added to DLQ", entry)
|
|
1059
|
+
}
|
|
1060
|
+
)
|
|
1061
|
+
|
|
1062
|
+
job "risky_operation" do
|
|
1063
|
+
runs_with RiskyWorker
|
|
1064
|
+
|
|
1065
|
+
retry_config(max_attempts: 3)
|
|
1066
|
+
circuit_breaker(threshold: 5)
|
|
1067
|
+
end
|
|
1068
|
+
end
|
|
1069
|
+
end
|
|
1070
|
+
----
|
|
1071
|
+
|
|
1072
|
+
When a job exhausts all retries, it's added to the DLQ with:
|
|
1073
|
+
|
|
1074
|
+
* Original work item
|
|
1075
|
+
* All errors from each retry attempt
|
|
1076
|
+
* Retry metadata (attempts, max attempts, total time)
|
|
1077
|
+
* Job and workflow context
|
|
1078
|
+
|
|
1079
|
+
=== Production Example
|
|
1080
|
+
|
|
1081
|
+
Complete example with monitoring and alerting:
|
|
1082
|
+
|
|
1083
|
+
[source,ruby]
|
|
1084
|
+
----
|
|
1085
|
+
class ProductionWorkflow < Fractor::Workflow
|
|
1086
|
+
workflow "production-api" do
|
|
1087
|
+
configure_dead_letter_queue(max_size: 5000)
|
|
1088
|
+
|
|
1089
|
+
job "primary_api_call" do
|
|
1090
|
+
runs_with PrimaryAPIWorker
|
|
1091
|
+
|
|
1092
|
+
# Retry configuration
|
|
1093
|
+
retry_config(
|
|
1094
|
+
backoff: :exponential,
|
|
1095
|
+
base_delay: 2,
|
|
1096
|
+
max_delay: 60,
|
|
1097
|
+
max_attempts: 5
|
|
1098
|
+
)
|
|
1099
|
+
retry_on [Timeout::Error, Errno::ECONNREFUSED, Errno::ETIMEDOUT]
|
|
1100
|
+
|
|
1101
|
+
# Circuit breaker configuration
|
|
1102
|
+
circuit_breaker(
|
|
1103
|
+
key: "production_api",
|
|
1104
|
+
threshold: 10,
|
|
1105
|
+
timeout: 120,
|
|
1106
|
+
half_open_calls: 5
|
|
1107
|
+
)
|
|
1108
|
+
|
|
1109
|
+
# Fallback to secondary service
|
|
1110
|
+
fallback_to "secondary_api_call"
|
|
1111
|
+
|
|
1112
|
+
# Real-time error monitoring
|
|
1113
|
+
on_error do |error, context|
|
|
1114
|
+
ErrorReporter.record(
|
|
1115
|
+
Fractor::WorkResult.new(error: error),
|
|
1116
|
+
job_name: "primary_api_call"
|
|
1117
|
+
)
|
|
1118
|
+
end
|
|
1119
|
+
end
|
|
1120
|
+
|
|
1121
|
+
job "secondary_api_call" do
|
|
1122
|
+
runs_with SecondaryAPIWorker
|
|
1123
|
+
|
|
1124
|
+
retry_config(max_attempts: 2)
|
|
1125
|
+
fallback_to "cached_response"
|
|
1126
|
+
end
|
|
1127
|
+
|
|
1128
|
+
job "cached_response" do
|
|
1129
|
+
runs_with CacheWorker
|
|
1130
|
+
end
|
|
1131
|
+
end
|
|
1132
|
+
end
|
|
1133
|
+
----
|
|
1134
|
+
|
|
1135
|
+
== Troubleshooting
|
|
1136
|
+
|
|
1137
|
+
=== Memory Usage
|
|
1138
|
+
|
|
1139
|
+
The ErrorReporter keeps the last 100 errors per category. For high-volume applications, reset the reporter periodically or implement custom cleanup:
|
|
1140
|
+
|
|
1141
|
+
[source,ruby]
|
|
1142
|
+
----
|
|
1143
|
+
# Reset periodically
|
|
1144
|
+
reporter.reset
|
|
1145
|
+
|
|
1146
|
+
# Or implement custom cleanup
|
|
1147
|
+
class CustomErrorReporter < Fractor::ErrorReporter
|
|
1148
|
+
def record(work_result, job_name: nil)
|
|
1149
|
+
super
|
|
1150
|
+
cleanup_if_needed
|
|
1151
|
+
end
|
|
1152
|
+
|
|
1153
|
+
private
|
|
1154
|
+
|
|
1155
|
+
def cleanup_if_needed
|
|
1156
|
+
# Custom cleanup logic
|
|
1157
|
+
if total_errors > 10_000
|
|
1158
|
+
@mutex.synchronize do
|
|
1159
|
+
# Keep only recent categories
|
|
1160
|
+
@by_category.select! do |_, stats|
|
|
1161
|
+
stats.recent_errors.any? { |e| e[:timestamp] > Time.now - 3600 } # seen within the last hour
|
|
1162
|
+
end
|
|
1163
|
+
end
|
|
1164
|
+
end
|
|
1165
|
+
end
|
|
1166
|
+
end
|
|
1167
|
+
----
|
|
1168
|
+
|
|
1169
|
+
=== Thread Safety
|
|
1170
|
+
|
|
1171
|
+
All ErrorReporter operations are thread-safe. No additional synchronization is needed:
|
|
1172
|
+
|
|
1173
|
+
[source,ruby]
|
|
1174
|
+
----
|
|
1175
|
+
# Safe to use from multiple threads
|
|
1176
|
+
threads = 10.times.map do
|
|
1177
|
+
Thread.new do
|
|
1178
|
+
100.times do |i|
|
|
1179
|
+
result = process_work(i)
|
|
1180
|
+
reporter.record(result) # Thread-safe
|
|
1181
|
+
end
|
|
1182
|
+
end
|
|
1183
|
+
end
|
|
1184
|
+
|
|
1185
|
+
threads.each(&:join)
|
|
1186
|
+
----
|
|
1187
|
+
|
|
1188
|
+
== See Also
|
|
1189
|
+
|
|
1190
|
+
* link:../reference/api/[API Reference] - Complete API documentation
|
|
1191
|
+
* link:../pages/core-concepts/[Core Concepts] - Understanding WorkResult
|
|
1192
|
+
* link:../workflows/[Workflows] - Workflow error handling
|