fractor 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +227 -102
- data/README.adoc +113 -1940
- data/docs/.lycheeignore +16 -0
- data/docs/Gemfile +24 -0
- data/docs/README.md +157 -0
- data/docs/_config.yml +151 -0
- data/docs/_features/error-handling.adoc +1192 -0
- data/docs/_features/index.adoc +80 -0
- data/docs/_features/monitoring.adoc +589 -0
- data/docs/_features/signal-handling.adoc +202 -0
- data/docs/_features/workflows.adoc +1235 -0
- data/docs/_guides/continuous-mode.adoc +736 -0
- data/docs/_guides/cookbook.adoc +1133 -0
- data/docs/_guides/index.adoc +55 -0
- data/docs/_guides/pipeline-mode.adoc +730 -0
- data/docs/_guides/troubleshooting.adoc +358 -0
- data/docs/_pages/architecture.adoc +1390 -0
- data/docs/_pages/core-concepts.adoc +1392 -0
- data/docs/_pages/design-principles.adoc +862 -0
- data/docs/_pages/getting-started.adoc +290 -0
- data/docs/_pages/installation.adoc +143 -0
- data/docs/_reference/api.adoc +1080 -0
- data/docs/_reference/error-reporting.adoc +670 -0
- data/docs/_reference/examples.adoc +181 -0
- data/docs/_reference/index.adoc +96 -0
- data/docs/_reference/troubleshooting.adoc +862 -0
- data/docs/_tutorials/complex-workflows.adoc +1022 -0
- data/docs/_tutorials/data-processing-pipeline.adoc +740 -0
- data/docs/_tutorials/first-application.adoc +384 -0
- data/docs/_tutorials/index.adoc +48 -0
- data/docs/_tutorials/long-running-services.adoc +931 -0
- data/docs/assets/images/favicon-16.png +0 -0
- data/docs/assets/images/favicon-32.png +0 -0
- data/docs/assets/images/favicon-48.png +0 -0
- data/docs/assets/images/favicon.ico +0 -0
- data/docs/assets/images/favicon.png +0 -0
- data/docs/assets/images/favicon.svg +45 -0
- data/docs/assets/images/fractor-icon.svg +49 -0
- data/docs/assets/images/fractor-logo.svg +61 -0
- data/docs/index.adoc +131 -0
- data/docs/lychee.toml +39 -0
- data/examples/api_aggregator/README.adoc +627 -0
- data/examples/api_aggregator/api_aggregator.rb +376 -0
- data/examples/auto_detection/README.adoc +407 -29
- data/examples/continuous_chat_common/message_protocol.rb +1 -1
- data/examples/error_reporting.rb +207 -0
- data/examples/file_processor/README.adoc +170 -0
- data/examples/file_processor/file_processor.rb +615 -0
- data/examples/file_processor/sample_files/invalid.csv +1 -0
- data/examples/file_processor/sample_files/orders.xml +24 -0
- data/examples/file_processor/sample_files/products.json +23 -0
- data/examples/file_processor/sample_files/users.csv +6 -0
- data/examples/hierarchical_hasher/README.adoc +629 -41
- data/examples/image_processor/README.adoc +610 -0
- data/examples/image_processor/image_processor.rb +349 -0
- data/examples/image_processor/processed_images/sample_10_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_1_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_2_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_3_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_4_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_5_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_6_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_7_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_8_processed.jpg.json +12 -0
- data/examples/image_processor/processed_images/sample_9_processed.jpg.json +12 -0
- data/examples/image_processor/test_images/sample_1.png +1 -0
- data/examples/image_processor/test_images/sample_10.png +1 -0
- data/examples/image_processor/test_images/sample_2.png +1 -0
- data/examples/image_processor/test_images/sample_3.png +1 -0
- data/examples/image_processor/test_images/sample_4.png +1 -0
- data/examples/image_processor/test_images/sample_5.png +1 -0
- data/examples/image_processor/test_images/sample_6.png +1 -0
- data/examples/image_processor/test_images/sample_7.png +1 -0
- data/examples/image_processor/test_images/sample_8.png +1 -0
- data/examples/image_processor/test_images/sample_9.png +1 -0
- data/examples/log_analyzer/README.adoc +662 -0
- data/examples/log_analyzer/log_analyzer.rb +579 -0
- data/examples/log_analyzer/sample_logs/apache.log +20 -0
- data/examples/log_analyzer/sample_logs/json.log +15 -0
- data/examples/log_analyzer/sample_logs/nginx.log +15 -0
- data/examples/log_analyzer/sample_logs/rails.log +29 -0
- data/examples/multi_work_type/README.adoc +576 -26
- data/examples/performance_monitoring.rb +120 -0
- data/examples/pipeline_processing/README.adoc +740 -26
- data/examples/pipeline_processing/pipeline_processing.rb +2 -2
- data/examples/priority_work_example.rb +155 -0
- data/examples/producer_subscriber/README.adoc +889 -46
- data/examples/scatter_gather/README.adoc +829 -27
- data/examples/simple/README.adoc +347 -0
- data/examples/specialized_workers/README.adoc +622 -26
- data/examples/specialized_workers/specialized_workers.rb +44 -8
- data/examples/stream_processor/README.adoc +206 -0
- data/examples/stream_processor/stream_processor.rb +284 -0
- data/examples/web_scraper/README.adoc +625 -0
- data/examples/web_scraper/web_scraper.rb +285 -0
- data/examples/workflow/README.adoc +406 -0
- data/examples/workflow/circuit_breaker/README.adoc +360 -0
- data/examples/workflow/circuit_breaker/circuit_breaker_workflow.rb +225 -0
- data/examples/workflow/conditional/README.adoc +483 -0
- data/examples/workflow/conditional/conditional_workflow.rb +215 -0
- data/examples/workflow/dead_letter_queue/README.adoc +374 -0
- data/examples/workflow/dead_letter_queue/dead_letter_queue_workflow.rb +217 -0
- data/examples/workflow/fan_out/README.adoc +381 -0
- data/examples/workflow/fan_out/fan_out_workflow.rb +202 -0
- data/examples/workflow/retry/README.adoc +248 -0
- data/examples/workflow/retry/retry_workflow.rb +195 -0
- data/examples/workflow/simple_linear/README.adoc +267 -0
- data/examples/workflow/simple_linear/simple_linear_workflow.rb +175 -0
- data/examples/workflow/simplified/README.adoc +329 -0
- data/examples/workflow/simplified/simplified_workflow.rb +222 -0
- data/exe/fractor +10 -0
- data/lib/fractor/cli.rb +288 -0
- data/lib/fractor/configuration.rb +307 -0
- data/lib/fractor/continuous_server.rb +60 -65
- data/lib/fractor/error_formatter.rb +72 -0
- data/lib/fractor/error_report_generator.rb +152 -0
- data/lib/fractor/error_reporter.rb +244 -0
- data/lib/fractor/error_statistics.rb +147 -0
- data/lib/fractor/execution_tracer.rb +162 -0
- data/lib/fractor/logger.rb +230 -0
- data/lib/fractor/main_loop_handler.rb +406 -0
- data/lib/fractor/main_loop_handler3.rb +135 -0
- data/lib/fractor/main_loop_handler4.rb +299 -0
- data/lib/fractor/performance_metrics_collector.rb +181 -0
- data/lib/fractor/performance_monitor.rb +215 -0
- data/lib/fractor/performance_report_generator.rb +202 -0
- data/lib/fractor/priority_work.rb +93 -0
- data/lib/fractor/priority_work_queue.rb +189 -0
- data/lib/fractor/result_aggregator.rb +32 -0
- data/lib/fractor/shutdown_handler.rb +168 -0
- data/lib/fractor/signal_handler.rb +80 -0
- data/lib/fractor/supervisor.rb +382 -269
- data/lib/fractor/supervisor_logger.rb +88 -0
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/work.rb +12 -0
- data/lib/fractor/work_distribution_manager.rb +151 -0
- data/lib/fractor/work_queue.rb +20 -0
- data/lib/fractor/work_result.rb +181 -9
- data/lib/fractor/worker.rb +73 -0
- data/lib/fractor/workflow/builder.rb +210 -0
- data/lib/fractor/workflow/chain_builder.rb +169 -0
- data/lib/fractor/workflow/circuit_breaker.rb +183 -0
- data/lib/fractor/workflow/circuit_breaker_orchestrator.rb +208 -0
- data/lib/fractor/workflow/circuit_breaker_registry.rb +112 -0
- data/lib/fractor/workflow/dead_letter_queue.rb +334 -0
- data/lib/fractor/workflow/execution_hooks.rb +39 -0
- data/lib/fractor/workflow/execution_strategy.rb +225 -0
- data/lib/fractor/workflow/execution_trace.rb +134 -0
- data/lib/fractor/workflow/helpers.rb +191 -0
- data/lib/fractor/workflow/job.rb +290 -0
- data/lib/fractor/workflow/job_dependency_validator.rb +120 -0
- data/lib/fractor/workflow/logger.rb +110 -0
- data/lib/fractor/workflow/pre_execution_context.rb +193 -0
- data/lib/fractor/workflow/retry_config.rb +156 -0
- data/lib/fractor/workflow/retry_orchestrator.rb +184 -0
- data/lib/fractor/workflow/retry_strategy.rb +93 -0
- data/lib/fractor/workflow/structured_logger.rb +30 -0
- data/lib/fractor/workflow/type_compatibility_validator.rb +222 -0
- data/lib/fractor/workflow/visualizer.rb +211 -0
- data/lib/fractor/workflow/workflow_context.rb +132 -0
- data/lib/fractor/workflow/workflow_executor.rb +669 -0
- data/lib/fractor/workflow/workflow_result.rb +55 -0
- data/lib/fractor/workflow/workflow_validator.rb +295 -0
- data/lib/fractor/workflow.rb +333 -0
- data/lib/fractor/wrapped_ractor.rb +66 -101
- data/lib/fractor/wrapped_ractor3.rb +161 -0
- data/lib/fractor/wrapped_ractor4.rb +242 -0
- data/lib/fractor.rb +92 -4
- metadata +179 -6
- data/tests/sample.rb.bak +0 -309
- data/tests/sample_working.rb.bak +0 -209
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Overview
|
|
4
|
+
nav_order: 1
|
|
5
|
+
---
|
|
6
|
+
== Features
|
|
7
|
+
|
|
8
|
+
Fractor provides a rich set of features for building robust parallel processing systems.
|
|
9
|
+
|
|
10
|
+
== Available Features
|
|
11
|
+
|
|
12
|
+
=== Workflows
|
|
13
|
+
|
|
14
|
+
GitHub Actions-style declarative workflows for complex data processing pipelines.
|
|
15
|
+
|
|
16
|
+
* link:../workflows/[Workflows] - Comprehensive workflow system documentation
|
|
17
|
+
|
|
18
|
+
=== Error Handling and Resilience
|
|
19
|
+
|
|
20
|
+
Production-ready error handling with comprehensive analytics and reporting.
|
|
21
|
+
|
|
22
|
+
* link:error-handling/[Error Handling] - Error reporting and analytics
|
|
23
|
+
|
|
24
|
+
=== Monitoring
|
|
25
|
+
|
|
26
|
+
Real-time performance monitoring and metrics collection.
|
|
27
|
+
|
|
28
|
+
* link:../monitoring/[Monitoring] - Performance monitoring and analytics
|
|
29
|
+
|
|
30
|
+
=== Signal Handling
|
|
31
|
+
|
|
32
|
+
Production-ready signal handling for process control and monitoring.
|
|
33
|
+
|
|
34
|
+
* link:signal-handling/[Signal Handling] - Signal handling and process monitoring
|
|
35
|
+
|
|
36
|
+
== Feature Categories
|
|
37
|
+
|
|
38
|
+
=== Workflow System
|
|
39
|
+
|
|
40
|
+
* Declarative DSL for defining workflows
|
|
41
|
+
* Type-safe data flow between jobs
|
|
42
|
+
* Dependency management and topological sorting
|
|
43
|
+
* Multiple execution patterns (linear, fan-out/fan-in, conditional)
|
|
44
|
+
* Simplified syntax with smart defaults
|
|
45
|
+
* Structured logging and execution tracing
|
|
46
|
+
* Workflow visualization (Mermaid, DOT, ASCII)
|
|
47
|
+
|
|
48
|
+
=== Error Handling
|
|
49
|
+
|
|
50
|
+
* Automatic retry with backoff strategies
|
|
51
|
+
* Circuit breaker pattern
|
|
52
|
+
* Dead Letter Queue for failed work
|
|
53
|
+
* Error categorization and severity levels
|
|
54
|
+
* Real-time error handlers and alerts
|
|
55
|
+
* Trending error detection
|
|
56
|
+
* Multiple export formats (text, Prometheus, JSON)
|
|
57
|
+
|
|
58
|
+
=== Monitoring and Analytics
|
|
59
|
+
|
|
60
|
+
* Real-time performance metrics
|
|
61
|
+
* Throughput and latency tracking
|
|
62
|
+
* Worker utilization monitoring
|
|
63
|
+
* Queue depth tracking
|
|
64
|
+
* Prometheus integration
|
|
65
|
+
* JSON export for programmatic access
|
|
66
|
+
* Human-readable reports
|
|
67
|
+
|
|
68
|
+
=== Signal Handling
|
|
69
|
+
|
|
70
|
+
* SIGINT/SIGTERM for graceful shutdown
|
|
71
|
+
* SIGUSR1/SIGBREAK for status monitoring
|
|
72
|
+
* Cross-platform support (Unix/Linux/macOS/Windows)
|
|
73
|
+
* Process monitoring and health checks
|
|
74
|
+
* Logging support
|
|
75
|
+
|
|
76
|
+
== Next Steps
|
|
77
|
+
|
|
78
|
+
* Explore link:../guides/[Guides] for task-oriented documentation
|
|
79
|
+
* See link:../tutorials/[Tutorials] for step-by-step learning
|
|
80
|
+
* Consult the link:../reference/[Reference] for technical specifications
|
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Monitoring
|
|
4
|
+
nav_order: 5
|
|
5
|
+
---
|
|
6
|
+
== Monitoring
|
|
7
|
+
|
|
8
|
+
Fractor provides comprehensive monitoring capabilities for tracking performance, analyzing errors, and benchmarking your parallel processing applications.
|
|
9
|
+
|
|
10
|
+
== Overview
|
|
11
|
+
|
|
12
|
+
The monitoring system helps you:
|
|
13
|
+
|
|
14
|
+
* **Track Performance Metrics** in real time
|
|
15
|
+
* **Analyze Error Patterns** across your application
|
|
16
|
+
* **Benchmark Workers** and optimize resource usage
|
|
17
|
+
* **Export Metrics** to monitoring systems
|
|
18
|
+
* **Respond to Issues** proactively
|
|
19
|
+
|
|
20
|
+
== Performance Monitoring
|
|
21
|
+
|
|
22
|
+
=== Purpose
|
|
23
|
+
|
|
24
|
+
The `PerformanceMonitor` class tracks real-time metrics for supervisors and workflows, providing insights into throughput, latency, worker utilization, and system health.
|
|
25
|
+
|
|
26
|
+
=== Quick Start
|
|
27
|
+
|
|
28
|
+
[source,ruby]
|
|
29
|
+
----
|
|
30
|
+
require 'fractor/performance_monitor'
|
|
31
|
+
|
|
32
|
+
# Create a supervisor
|
|
33
|
+
supervisor = Fractor::Supervisor.new(
|
|
34
|
+
worker_class: DataProcessor,
|
|
35
|
+
num_workers: 4,
|
|
36
|
+
max_queue_size: 100
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Attach performance monitor
|
|
40
|
+
monitor = Fractor::PerformanceMonitor.new(
|
|
41
|
+
supervisor,
|
|
42
|
+
sample_interval: 1.0 # Sample metrics every second
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# Start monitoring
|
|
46
|
+
monitor.start
|
|
47
|
+
|
|
48
|
+
# Add work to supervisor
|
|
49
|
+
100.times do |i|
|
|
50
|
+
supervisor.add_work(Fractor::Work.new(payload: { id: i }))
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Wait for completion
|
|
54
|
+
sleep 5
|
|
55
|
+
|
|
56
|
+
# Get current snapshot
|
|
57
|
+
snapshot = monitor.snapshot
|
|
58
|
+
puts "Jobs processed: #{snapshot[:jobs_processed]}"
|
|
59
|
+
puts "Average latency: #{snapshot[:average_latency]}ms"
|
|
60
|
+
puts "Worker utilization: #{snapshot[:worker_utilization]}%"
|
|
61
|
+
|
|
62
|
+
# Generate human-readable report
|
|
63
|
+
puts monitor.report
|
|
64
|
+
|
|
65
|
+
# Stop monitoring
|
|
66
|
+
monitor.stop
|
|
67
|
+
----
|
|
68
|
+
|
|
69
|
+
=== Available Metrics
|
|
70
|
+
|
|
71
|
+
[cols="1,3"]
|
|
72
|
+
|===
|
|
73
|
+
|Metric |Description
|
|
74
|
+
|
|
75
|
+
|`jobs_processed`
|
|
76
|
+
|Total number of jobs completed
|
|
77
|
+
|
|
78
|
+
|`jobs_succeeded`
|
|
79
|
+
|Number of jobs that completed successfully
|
|
80
|
+
|
|
81
|
+
|`jobs_failed`
|
|
82
|
+
|Number of jobs that failed
|
|
83
|
+
|
|
84
|
+
|`average_latency`
|
|
85
|
+
|Mean job execution time in milliseconds
|
|
86
|
+
|
|
87
|
+
|`p50_latency`
|
|
88
|
+
|50th percentile latency (median) in milliseconds
|
|
89
|
+
|
|
90
|
+
|`p95_latency`
|
|
91
|
+
|95th percentile latency in milliseconds
|
|
92
|
+
|
|
93
|
+
|`p99_latency`
|
|
94
|
+
|99th percentile latency in milliseconds
|
|
95
|
+
|
|
96
|
+
|`throughput`
|
|
97
|
+
|Jobs processed per second
|
|
98
|
+
|
|
99
|
+
|`queue_depth`
|
|
100
|
+
|Current number of pending jobs in queue
|
|
101
|
+
|
|
102
|
+
|`worker_count`
|
|
103
|
+
|Total number of workers
|
|
104
|
+
|
|
105
|
+
|`active_workers`
|
|
106
|
+
|Number of workers currently processing jobs
|
|
107
|
+
|
|
108
|
+
|`worker_utilization`
|
|
109
|
+
|Percentage of workers actively processing (0-100)
|
|
110
|
+
|
|
111
|
+
|`memory_mb`
|
|
112
|
+
|Current process memory usage in megabytes
|
|
113
|
+
|
|
114
|
+
|`uptime`
|
|
115
|
+
|Monitor uptime in seconds
|
|
116
|
+
|===
|
|
117
|
+
|
|
118
|
+
=== Export Formats
|
|
119
|
+
|
|
120
|
+
==== Human-Readable Report
|
|
121
|
+
|
|
122
|
+
[source,ruby]
|
|
123
|
+
----
|
|
124
|
+
puts monitor.report
|
|
125
|
+
|
|
126
|
+
# Output:
|
|
127
|
+
# === Performance Report ===
|
|
128
|
+
# Uptime: 10.5s
|
|
129
|
+
#
|
|
130
|
+
# Jobs:
|
|
131
|
+
# Total: 150
|
|
132
|
+
# Succeeded: 145
|
|
133
|
+
# Failed: 5
|
|
134
|
+
# Success Rate: 96.67%
|
|
135
|
+
#
|
|
136
|
+
# Latency (ms):
|
|
137
|
+
# Average: 23.5
|
|
138
|
+
# p50: 20.0
|
|
139
|
+
# p95: 45.0
|
|
140
|
+
# p99: 67.0
|
|
141
|
+
#
|
|
142
|
+
# Throughput:
|
|
143
|
+
# Current: 14.3 jobs/sec
|
|
144
|
+
#
|
|
145
|
+
# Queue:
|
|
146
|
+
# Depth: 25 jobs
|
|
147
|
+
#
|
|
148
|
+
# Workers:
|
|
149
|
+
# Total: 4
|
|
150
|
+
# Active: 3
|
|
151
|
+
# Utilization: 75.00%
|
|
152
|
+
#
|
|
153
|
+
# Memory:
|
|
154
|
+
# Current: 127.5 MB
|
|
155
|
+
----
|
|
156
|
+
|
|
157
|
+
==== JSON Export
|
|
158
|
+
|
|
159
|
+
[source,ruby]
|
|
160
|
+
----
|
|
161
|
+
json_data = monitor.to_json
|
|
162
|
+
puts json_data
|
|
163
|
+
|
|
164
|
+
# Output:
|
|
165
|
+
# {
|
|
166
|
+
# "jobs_processed": 150,
|
|
167
|
+
# "jobs_succeeded": 145,
|
|
168
|
+
# "jobs_failed": 5,
|
|
169
|
+
# "average_latency": 23.5,
|
|
170
|
+
# "p50_latency": 20.0,
|
|
171
|
+
# "p95_latency": 45.0,
|
|
172
|
+
# "p99_latency": 67.0,
|
|
173
|
+
# "throughput": 14.3,
|
|
174
|
+
# "queue_depth": 25,
|
|
175
|
+
# "worker_count": 4,
|
|
176
|
+
# "active_workers": 3,
|
|
177
|
+
# "worker_utilization": 75.0,
|
|
178
|
+
# "memory_mb": 127.5,
|
|
179
|
+
# "uptime": 10.5
|
|
180
|
+
# }
|
|
181
|
+
----
|
|
182
|
+
|
|
183
|
+
==== Prometheus Format
|
|
184
|
+
|
|
185
|
+
[source,ruby]
|
|
186
|
+
----
|
|
187
|
+
puts monitor.to_prometheus
|
|
188
|
+
|
|
189
|
+
# Output:
|
|
190
|
+
# # HELP fractor_jobs_processed Total number of jobs processed
|
|
191
|
+
# # TYPE fractor_jobs_processed counter
|
|
192
|
+
# fractor_jobs_processed 150
|
|
193
|
+
#
|
|
194
|
+
# # HELP fractor_jobs_succeeded Number of jobs that succeeded
|
|
195
|
+
# # TYPE fractor_jobs_succeeded counter
|
|
196
|
+
# fractor_jobs_succeeded 145
|
|
197
|
+
#
|
|
198
|
+
# # HELP fractor_jobs_failed Number of jobs that failed
|
|
199
|
+
# # TYPE fractor_jobs_failed counter
|
|
200
|
+
# fractor_jobs_failed 5
|
|
201
|
+
#
|
|
202
|
+
# # HELP fractor_latency_average Average job latency in milliseconds
|
|
203
|
+
# # TYPE fractor_latency_average gauge
|
|
204
|
+
# fractor_latency_average 23.5
|
|
205
|
+
# ...
|
|
206
|
+
----
|
|
207
|
+
|
|
208
|
+
=== Integration with Prometheus
|
|
209
|
+
|
|
210
|
+
[source,ruby]
|
|
211
|
+
----
|
|
212
|
+
require 'webrick'
|
|
213
|
+
|
|
214
|
+
# Create monitor
|
|
215
|
+
monitor = Fractor::PerformanceMonitor.new(supervisor)
|
|
216
|
+
monitor.start
|
|
217
|
+
|
|
218
|
+
# Create metrics endpoint
|
|
219
|
+
server = WEBrick::HTTPServer.new(Port: 9090)
|
|
220
|
+
server.mount_proc '/metrics' do |req, res|
|
|
221
|
+
res['Content-Type'] = 'text/plain; version=0.0.4'
|
|
222
|
+
res.body = monitor.to_prometheus
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
# Start server
|
|
226
|
+
trap('INT') { server.shutdown }
|
|
227
|
+
server.start
|
|
228
|
+
----
|
|
229
|
+
|
|
230
|
+
Configure Prometheus to scrape the endpoint:
|
|
231
|
+
|
|
232
|
+
[source,yaml]
|
|
233
|
+
----
|
|
234
|
+
scrape_configs:
|
|
235
|
+
- job_name: 'fractor'
|
|
236
|
+
static_configs:
|
|
237
|
+
- targets: ['localhost:9090']
|
|
238
|
+
scrape_interval: 15s
|
|
239
|
+
----
|
|
240
|
+
|
|
241
|
+
== Error Reporting
|
|
242
|
+
|
|
243
|
+
=== Purpose
|
|
244
|
+
|
|
245
|
+
The `ErrorReporter` class provides comprehensive error analytics, tracking error patterns, detecting trends, and enabling proactive issue resolution.
|
|
246
|
+
|
|
247
|
+
=== Quick Start
|
|
248
|
+
|
|
249
|
+
[source,ruby]
|
|
250
|
+
----
|
|
251
|
+
require 'fractor/error_reporter'
|
|
252
|
+
|
|
253
|
+
# Create an error reporter
|
|
254
|
+
reporter = Fractor::ErrorReporter.new
|
|
255
|
+
|
|
256
|
+
# Register critical error alerts
|
|
257
|
+
reporter.on_error do |work_result, job_name|
|
|
258
|
+
if work_result.critical?
|
|
259
|
+
PagerDuty.alert(work_result, job_name)
|
|
260
|
+
end
|
|
261
|
+
end
|
|
262
|
+
|
|
263
|
+
# Record work results
|
|
264
|
+
reporter.record(work_result, job_name: "process_data")
|
|
265
|
+
|
|
266
|
+
# Generate reports
|
|
267
|
+
puts reporter.formatted_report
|
|
268
|
+
File.write("metrics.txt", reporter.to_prometheus)
|
|
269
|
+
----
|
|
270
|
+
|
|
271
|
+
=== Error Categorization
|
|
272
|
+
|
|
273
|
+
Fractor automatically categorizes errors:
|
|
274
|
+
|
|
275
|
+
[cols="1,2,3"]
|
|
276
|
+
|===
|
|
277
|
+
|Category |Error Types |Description
|
|
278
|
+
|
|
279
|
+
|`:validation`
|
|
280
|
+
|`ArgumentError`, `TypeError`
|
|
281
|
+
|Input validation errors
|
|
282
|
+
|
|
283
|
+
|`:timeout`
|
|
284
|
+
|`Timeout::Error`
|
|
285
|
+
|Operation timeout errors
|
|
286
|
+
|
|
287
|
+
|`:network`
|
|
288
|
+
|`SocketError`, `Errno::ECONNREFUSED`, `Errno::ETIMEDOUT`
|
|
289
|
+
|Network-related errors
|
|
290
|
+
|
|
291
|
+
|`:resource`
|
|
292
|
+
|`Errno::ENOMEM`, `Errno::ENOSPC`
|
|
293
|
+
|Resource exhaustion errors
|
|
294
|
+
|
|
295
|
+
|`:system`
|
|
296
|
+
|`SystemCallError`, `SystemStackError`
|
|
297
|
+
|System-level errors
|
|
298
|
+
|
|
299
|
+
|`:business`
|
|
300
|
+
|Custom business logic errors
|
|
301
|
+
|Application-specific errors
|
|
302
|
+
|
|
303
|
+
|`:unknown`
|
|
304
|
+
|Other errors
|
|
305
|
+
|Uncategorized errors
|
|
306
|
+
|===
|
|
307
|
+
|
|
308
|
+
=== Error Severity Levels
|
|
309
|
+
|
|
310
|
+
[cols="1,3"]
|
|
311
|
+
|===
|
|
312
|
+
|Severity |Description
|
|
313
|
+
|
|
314
|
+
|`:critical`
|
|
315
|
+
|System-breaking errors requiring immediate attention
|
|
316
|
+
|
|
317
|
+
|`:error`
|
|
318
|
+
|Standard errors that prevent operation completion
|
|
319
|
+
|
|
320
|
+
|`:warning`
|
|
321
|
+
|Non-fatal issues that may need investigation
|
|
322
|
+
|
|
323
|
+
|`:info`
|
|
324
|
+
|Informational messages
|
|
325
|
+
|===
|
|
326
|
+
|
|
327
|
+
=== Real-Time Error Handlers
|
|
328
|
+
|
|
329
|
+
[source,ruby]
|
|
330
|
+
----
|
|
331
|
+
reporter.on_error do |work_result, job_name|
|
|
332
|
+
case work_result.error_severity
|
|
333
|
+
when :critical
|
|
334
|
+
PagerDuty.alert(work_result, job_name)
|
|
335
|
+
when :error
|
|
336
|
+
Slack.notify(work_result, job_name)
|
|
337
|
+
when :warning
|
|
338
|
+
warn("#{job_name}: #{work_result.error.message}")
|
|
339
|
+
end
|
|
340
|
+
end
|
|
341
|
+
----
|
|
342
|
+
|
|
343
|
+
=== Trend Detection
|
|
344
|
+
|
|
345
|
+
[source,ruby]
|
|
346
|
+
----
|
|
347
|
+
# Get all trending errors
|
|
348
|
+
trending = reporter.trending_errors
|
|
349
|
+
|
|
350
|
+
trending.each do |trend|
|
|
351
|
+
category = trend[:category]
|
|
352
|
+
stats = trend[:stats]
|
|
353
|
+
|
|
354
|
+
puts "⚠️ #{category} errors are increasing!"
|
|
355
|
+
puts " Count: #{stats[:total_count]}"
|
|
356
|
+
puts " Rate: #{stats[:error_rate]}/s"
|
|
357
|
+
end
|
|
358
|
+
----
|
|
359
|
+
|
|
360
|
+
== Benchmarking
|
|
361
|
+
|
|
362
|
+
=== Worker Performance Benchmarking
|
|
363
|
+
|
|
364
|
+
[source,ruby]
|
|
365
|
+
----
|
|
366
|
+
require 'benchmark'
|
|
367
|
+
|
|
368
|
+
# Benchmark different worker configurations
|
|
369
|
+
Benchmark.bm(20) do |x|
|
|
370
|
+
x.report("2 workers:") do
|
|
371
|
+
supervisor = Fractor::Supervisor.new(
|
|
372
|
+
worker_pools: [{ worker_class: MyWorker, num_workers: 2 }]
|
|
373
|
+
)
|
|
374
|
+
supervisor.add_work_items(work_items)
|
|
375
|
+
supervisor.run
|
|
376
|
+
end
|
|
377
|
+
|
|
378
|
+
x.report("4 workers:") do
|
|
379
|
+
supervisor = Fractor::Supervisor.new(
|
|
380
|
+
worker_pools: [{ worker_class: MyWorker, num_workers: 4 }]
|
|
381
|
+
)
|
|
382
|
+
supervisor.add_work_items(work_items)
|
|
383
|
+
supervisor.run
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
x.report("8 workers:") do
|
|
387
|
+
supervisor = Fractor::Supervisor.new(
|
|
388
|
+
worker_pools: [{ worker_class: MyWorker, num_workers: 8 }]
|
|
389
|
+
)
|
|
390
|
+
supervisor.add_work_items(work_items)
|
|
391
|
+
supervisor.run
|
|
392
|
+
end
|
|
393
|
+
end
|
|
394
|
+
----
|
|
395
|
+
|
|
396
|
+
=== Memory Profiling
|
|
397
|
+
|
|
398
|
+
[source,ruby]
|
|
399
|
+
----
|
|
400
|
+
require 'memory_profiler'
|
|
401
|
+
|
|
402
|
+
report = MemoryProfiler.report do
|
|
403
|
+
supervisor = Fractor::Supervisor.new(
|
|
404
|
+
worker_pools: [{ worker_class: MyWorker, num_workers: 4 }]
|
|
405
|
+
)
|
|
406
|
+
supervisor.add_work_items(work_items)
|
|
407
|
+
supervisor.run
|
|
408
|
+
end
|
|
409
|
+
|
|
410
|
+
report.pretty_print
|
|
411
|
+
----
|
|
412
|
+
|
|
413
|
+
=== Queue Performance Analysis
|
|
414
|
+
|
|
415
|
+
[source,ruby]
|
|
416
|
+
----
|
|
417
|
+
# Monitor queue depth over time
|
|
418
|
+
queue_depths = []
|
|
419
|
+
|
|
420
|
+
Thread.new do
|
|
421
|
+
loop do
|
|
422
|
+
queue_depths << supervisor.work_queue.size
|
|
423
|
+
sleep 0.1
|
|
424
|
+
end
|
|
425
|
+
end
|
|
426
|
+
|
|
427
|
+
supervisor.run
|
|
428
|
+
|
|
429
|
+
# Analyze queue behavior
|
|
430
|
+
puts "Max queue depth: #{queue_depths.max}"
|
|
431
|
+
puts "Average queue depth: #{queue_depths.sum.fdiv(queue_depths.size)}"
|
|
432
|
+
puts "Queue was empty: #{queue_depths.count(0)} times"
|
|
433
|
+
----
|
|
434
|
+
|
|
435
|
+
== Best Practices
|
|
436
|
+
|
|
437
|
+
=== Set Appropriate Sample Intervals
|
|
438
|
+
|
|
439
|
+
[source,ruby]
|
|
440
|
+
----
|
|
441
|
+
# High-frequency monitoring (testing)
|
|
442
|
+
monitor = Fractor::PerformanceMonitor.new(supervisor, sample_interval: 0.1)
|
|
443
|
+
|
|
444
|
+
# Normal monitoring (production)
|
|
445
|
+
monitor = Fractor::PerformanceMonitor.new(supervisor, sample_interval: 1.0)
|
|
446
|
+
|
|
447
|
+
# Low-frequency monitoring (low overhead)
|
|
448
|
+
monitor = Fractor::PerformanceMonitor.new(supervisor, sample_interval: 5.0)
|
|
449
|
+
----
|
|
450
|
+
|
|
451
|
+
=== Monitor Critical Metrics
|
|
452
|
+
|
|
453
|
+
[source,ruby]
|
|
454
|
+
----
|
|
455
|
+
# Set up alerts for critical thresholds
|
|
456
|
+
Thread.new do
|
|
457
|
+
loop do
|
|
458
|
+
sleep 60
|
|
459
|
+
|
|
460
|
+
snapshot = monitor.snapshot
|
|
461
|
+
|
|
462
|
+
# Alert on high error rate
|
|
463
|
+
if snapshot[:jobs_failed].to_f / snapshot[:jobs_processed] > 0.1
|
|
464
|
+
AlertService.notify("High error rate detected!")
|
|
465
|
+
end
|
|
466
|
+
|
|
467
|
+
# Alert on low throughput
|
|
468
|
+
if snapshot[:throughput] < 10.0
|
|
469
|
+
AlertService.notify("Low throughput detected!")
|
|
470
|
+
end
|
|
471
|
+
|
|
472
|
+
# Alert on high latency
|
|
473
|
+
if snapshot[:p95_latency] > 1000.0
|
|
474
|
+
AlertService.notify("High latency detected!")
|
|
475
|
+
end
|
|
476
|
+
end
|
|
477
|
+
end
|
|
478
|
+
----
|
|
479
|
+
|
|
480
|
+
=== Regular Data Cleanup
|
|
481
|
+
|
|
482
|
+
[source,ruby]
|
|
483
|
+
----
|
|
484
|
+
# Reset statistics periodically to prevent unbounded growth
|
|
485
|
+
Thread.new do
|
|
486
|
+
loop do
|
|
487
|
+
sleep 86400 # 24 hours
|
|
488
|
+
|
|
489
|
+
# Archive current stats
|
|
490
|
+
File.write(
|
|
491
|
+
"metrics_#{Date.today}.json",
|
|
492
|
+
monitor.to_json
|
|
493
|
+
)
|
|
494
|
+
|
|
495
|
+
# Reset for new day
|
|
496
|
+
reporter.reset
|
|
497
|
+
puts "Monitoring statistics reset"
|
|
498
|
+
end
|
|
499
|
+
end
|
|
500
|
+
----
|
|
501
|
+
|
|
502
|
+
=== Use Structured Logging
|
|
503
|
+
|
|
504
|
+
[source,ruby]
|
|
505
|
+
----
|
|
506
|
+
require 'json'
|
|
507
|
+
require 'logger'
|
|
508
|
+
|
|
509
|
+
logger = Logger.new(STDOUT)
|
|
510
|
+
logger.formatter = proc do |severity, datetime, progname, msg|
|
|
511
|
+
JSON.generate({
|
|
512
|
+
timestamp: datetime.iso8601,
|
|
513
|
+
severity: severity,
|
|
514
|
+
message: msg,
|
|
515
|
+
metrics: monitor.snapshot
|
|
516
|
+
}) + "\n"
|
|
517
|
+
end
|
|
518
|
+
|
|
519
|
+
logger.info("Processing batch")
|
|
520
|
+
----
|
|
521
|
+
|
|
522
|
+
== Production Integration
|
|
523
|
+
|
|
524
|
+
=== With Supervisors
|
|
525
|
+
|
|
526
|
+
[source,ruby]
|
|
527
|
+
----
|
|
528
|
+
supervisor = Fractor::Supervisor.new(
|
|
529
|
+
worker_pools: [{ worker_class: MyWorker, num_workers: 4 }]
|
|
530
|
+
)
|
|
531
|
+
|
|
532
|
+
monitor = Fractor::PerformanceMonitor.new(supervisor)
|
|
533
|
+
reporter = Fractor::ErrorReporter.new
|
|
534
|
+
|
|
535
|
+
monitor.start
|
|
536
|
+
|
|
537
|
+
# Record all results
|
|
538
|
+
supervisor.on_result do |result|
|
|
539
|
+
reporter.record(result)
|
|
540
|
+
end
|
|
541
|
+
|
|
542
|
+
supervisor.run
|
|
543
|
+
monitor.stop
|
|
544
|
+
|
|
545
|
+
# Generate final reports
|
|
546
|
+
puts monitor.report
|
|
547
|
+
puts reporter.formatted_report
|
|
548
|
+
----
|
|
549
|
+
|
|
550
|
+
=== With Workflows
|
|
551
|
+
|
|
552
|
+
[source,ruby]
|
|
553
|
+
----
|
|
554
|
+
class MonitoredWorkflow < Fractor::Workflow
|
|
555
|
+
workflow "monitored" do
|
|
556
|
+
job "process" do
|
|
557
|
+
runs_with ProcessWorker
|
|
558
|
+
|
|
559
|
+
on_error do |error, context|
|
|
560
|
+
reporter.record(
|
|
561
|
+
Fractor::WorkResult.new(
|
|
562
|
+
error: error,
|
|
563
|
+
error_context: context
|
|
564
|
+
),
|
|
565
|
+
job_name: "process"
|
|
566
|
+
)
|
|
567
|
+
end
|
|
568
|
+
end
|
|
569
|
+
end
|
|
570
|
+
end
|
|
571
|
+
|
|
572
|
+
workflow = MonitoredWorkflow.new(input_data)
|
|
573
|
+
supervisor = workflow.supervisor
|
|
574
|
+
|
|
575
|
+
monitor = Fractor::PerformanceMonitor.new(supervisor)
|
|
576
|
+
monitor.start
|
|
577
|
+
|
|
578
|
+
result = workflow.execute
|
|
579
|
+
monitor.stop
|
|
580
|
+
|
|
581
|
+
puts monitor.report
|
|
582
|
+
----
|
|
583
|
+
|
|
584
|
+
== See Also
|
|
585
|
+
|
|
586
|
+
* link:error-handling/[Error Handling] - Comprehensive error analytics
|
|
587
|
+
* link:../guides/pipeline-mode/[Pipeline Mode] - Batch processing patterns
|
|
588
|
+
* link:../guides/continuous-mode/[Continuous Mode] - Long-running servers
|
|
589
|
+
* link:../reference/api/[API Reference] - Complete API documentation
|