async-service-supervisor 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data/bake/async/service/supervisor.rb +45 -0
- data/context/getting-started.md +205 -0
- data/context/index.yaml +20 -0
- data/context/memory-monitor.md +129 -0
- data/context/process-monitor.md +91 -0
- data/lib/async/service/supervisor/client.rb +22 -0
- data/lib/async/service/supervisor/endpoint.rb +21 -0
- data/lib/async/service/supervisor/environment.rb +75 -0
- data/lib/async/service/supervisor/loop.rb +37 -0
- data/lib/async/service/supervisor/memory_monitor.rb +139 -0
- data/lib/async/service/supervisor/process_monitor.rb +88 -0
- data/lib/async/service/supervisor/server.rb +120 -0
- data/lib/async/service/supervisor/service.rb +74 -0
- data/lib/async/service/supervisor/supervised.rb +45 -0
- data/lib/async/service/supervisor/supervisor_controller.rb +127 -0
- data/lib/async/service/supervisor/version.rb +16 -0
- data/lib/async/service/supervisor/worker.rb +58 -0
- data/lib/async/service/supervisor/worker_controller.rb +107 -0
- data/lib/async/service/supervisor.rb +18 -0
- data/license.md +21 -0
- data/readme.md +92 -0
- data/releases.md +48 -0
- data.tar.gz.sig +2 -0
- metadata +174 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: e5058893b1b7ea249bcd4e273f169fb9a4649523380d545c08a281f20d6f56c0
|
|
4
|
+
data.tar.gz: d17a7bc6d6443ffd962118ae45d1dbfa6cf7a890d5b91ca1d74c9f050d45177b
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 344766f9eb24ca6fae65844a8e5654fb197b829efe89363b0bdff1e9bfd322fcc1e0f1b121ef1ae2579d03174b14226c0a2f76a5de3a7e3ceff4ff6f48416366
|
|
7
|
+
data.tar.gz: 2efa90fef449b5a0b52ea738a4f078a32c9df70d41a2d42dcf3e7e0d02b1f67c6290bca1b5c073ef838e482b56abd97d9d2e1fe1019e383c62659946df52cee1
|
checksums.yaml.gz.sig
ADDED
|
Binary file
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Released under the MIT License.
|
|
4
|
+
# Copyright, 2025, by Samuel Williams.
|
|
5
|
+
|
|
6
|
+
# Set up the task context, forwarding every argument to the parent initializer.
#
# The supervisor library is required here, inside the initializer, rather than at the top of the file.
def initialize(...)
	super(...)
	
	require "async/service/supervisor"
end
|
|
11
|
+
|
|
12
|
+
# Ask the supervisor to restart the container.
#
# This typically causes the container to exit; the parent process is then expected to start it again.
def restart
	client do |connection|
		connection[:supervisor].restart
	end
end
|
|
19
|
+
|
|
20
|
+
# Gracefully reload the services so they can reconfigure without dropping connections.
#
# This is requested as a supervisor restart using the `HUP` signal.
def reload
	client do |connection|
		connection[:supervisor].restart(signal: :HUP)
	end
end
|
|
27
|
+
|
|
28
|
+
# Query the supervisor for its status.
def status
	client {|connection| connection[:supervisor].status}
end
|
|
34
|
+
|
|
35
|
+
private
|
|
36
|
+
|
|
37
|
+
# The endpoint used to reach the supervisor.
#
# Delegates to `Async::Service::Supervisor.endpoint` without arguments, so that method's default path applies.
def endpoint
	Async::Service::Supervisor.endpoint
end
|
|
40
|
+
|
|
41
|
+
# Connect to the supervisor and hand the given block to the client's `connect`.
#
# The client is built for {endpoint} and the connection is made inside a `Sync` block.
def client(&block)
	Sync do
		supervisor_client = Async::Service::Supervisor::Client.new(endpoint: self.endpoint)
		supervisor_client.connect(&block)
	end
end
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# Getting Started
|
|
2
|
+
|
|
3
|
+
This guide explains how to get started with `async-service-supervisor` to supervise and monitor worker processes in your Ruby applications.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Add the gem to your project:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
$ bundle add async-service-supervisor
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Core Concepts
|
|
14
|
+
|
|
15
|
+
`async-service-supervisor` provides a robust process supervision system built on top of {ruby Async::Service::Generic}. The key components are:
|
|
16
|
+
|
|
17
|
+
- {ruby Async::Service::Supervisor::Environment}: An environment mixin that sets up a supervisor service in your application.
|
|
18
|
+
- {ruby Async::Service::Supervisor::Supervised}: An environment mixin that enables workers to connect to and be supervised by the supervisor.
|
|
19
|
+
- {ruby Async::Service::Supervisor::Server}: The server that handles communication with workers and performs monitoring.
|
|
20
|
+
- {ruby Async::Service::Supervisor::Worker}: A client that connects workers to the supervisor for health monitoring and diagnostics.
|
|
21
|
+
|
|
22
|
+
### Process Architecture
|
|
23
|
+
|
|
24
|
+
The supervisor operates as a multi-process architecture with three layers:
|
|
25
|
+
|
|
26
|
+
```mermaid
|
|
27
|
+
graph TD
|
|
28
|
+
Controller[Async::Container::Controller<br/>Root Process]
|
|
29
|
+
|
|
30
|
+
Controller -->|spawns & manages| Supervisor[Supervisor Process<br/>async-service-supervisor]
|
|
31
|
+
Controller -->|spawns & manages| Worker1[Worker Process 1]
|
|
32
|
+
Controller -->|spawns & manages| Worker2[Worker Process 2]
|
|
33
|
+
Controller -->|spawns & manages| WorkerN[Worker Process N...]
|
|
34
|
+
|
|
35
|
+
Worker1 -.->|connects via IPC| Supervisor
|
|
36
|
+
Worker2 -.->|connects via IPC| Supervisor
|
|
37
|
+
WorkerN -.->|connects via IPC| Supervisor
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Important:** The supervisor process is itself just another process managed by the root controller. If the supervisor crashes, the controller will restart it, and all worker processes will automatically reconnect to the new supervisor. This design ensures high availability and fault tolerance.
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
To use the supervisor, you need to define two services: one for the supervisor itself and one for your workers that will be supervised.
|
|
45
|
+
|
|
46
|
+
### Basic Example
|
|
47
|
+
|
|
48
|
+
Create a service configuration file (e.g., `service.rb`):
|
|
49
|
+
|
|
50
|
+
```ruby
|
|
51
|
+
#!/usr/bin/env async-service
|
|
52
|
+
# frozen_string_literal: true
|
|
53
|
+
|
|
54
|
+
require "async/service/supervisor"
|
|
55
|
+
|
|
56
|
+
class MyWorkerService < Async::Service::Generic
|
|
57
|
+
def setup(container)
|
|
58
|
+
super
|
|
59
|
+
|
|
60
|
+
container.run(name: self.class.name, count: 4, restart: true) do |instance|
|
|
61
|
+
Async do
|
|
62
|
+
# Get the environment evaluator:
|
|
63
|
+
evaluator = self.environment.evaluator
|
|
64
|
+
|
|
65
|
+
# Prepare the instance (connects to supervisor if available):
|
|
66
|
+
evaluator.prepare!(instance)
|
|
67
|
+
|
|
68
|
+
# Mark the worker as ready:
|
|
69
|
+
instance.ready!
|
|
70
|
+
|
|
71
|
+
# Your worker logic here:
|
|
72
|
+
loop do
|
|
73
|
+
# Do work...
|
|
74
|
+
sleep 1
|
|
75
|
+
|
|
76
|
+
# Periodically update readiness:
|
|
77
|
+
instance.ready!
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# Define the worker service:
|
|
85
|
+
service "worker" do
|
|
86
|
+
service_class MyWorkerService
|
|
87
|
+
|
|
88
|
+
# Enable supervision for this service:
|
|
89
|
+
include Async::Service::Supervisor::Supervised
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
# Define the supervisor service:
|
|
93
|
+
service "supervisor" do
|
|
94
|
+
include Async::Service::Supervisor::Environment
|
|
95
|
+
end
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Running the Service
|
|
99
|
+
|
|
100
|
+
Make the service executable and run it:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
$ chmod +x service.rb
|
|
104
|
+
$ ./service.rb
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
This will start:
|
|
108
|
+
- A supervisor process listening on a Unix socket
|
|
109
|
+
- Four worker processes that connect to the supervisor
|
|
110
|
+
|
|
111
|
+
### Adding Health Monitors
|
|
112
|
+
|
|
113
|
+
You can add monitors to observe worker health and automatically respond to issues. Monitors are useful for:
|
|
114
|
+
|
|
115
|
+
- **Memory leak detection**: Automatically restart workers consuming excessive memory.
|
|
116
|
+
- **Performance monitoring**: Track CPU and memory usage trends.
|
|
117
|
+
- **Capacity planning**: Understand resource requirements.
|
|
118
|
+
|
|
119
|
+
For example, to add monitoring:
|
|
120
|
+
|
|
121
|
+
```ruby
|
|
122
|
+
service "supervisor" do
|
|
123
|
+
include Async::Service::Supervisor::Environment
|
|
124
|
+
|
|
125
|
+
monitors do
|
|
126
|
+
[
|
|
127
|
+
# Log process metrics for observability:
|
|
128
|
+
Async::Service::Supervisor::ProcessMonitor.new(
|
|
129
|
+
interval: 60
|
|
130
|
+
),
|
|
131
|
+
|
|
132
|
+
# Restart workers exceeding memory limits:
|
|
133
|
+
Async::Service::Supervisor::MemoryMonitor.new(
|
|
134
|
+
interval: 10,
|
|
135
|
+
maximum_size_limit: 1024 * 1024 * 500 # 500MB limit per process
|
|
136
|
+
)
|
|
137
|
+
]
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
See the {ruby Async::Service::Supervisor::MemoryMonitor Memory Monitor} and {ruby Async::Service::Supervisor::ProcessMonitor Process Monitor} guides for detailed configuration options and best practices.
|
|
143
|
+
|
|
144
|
+
### Collecting Diagnostics
|
|
145
|
+
|
|
146
|
+
The supervisor can collect various diagnostics from workers on demand:
|
|
147
|
+
|
|
148
|
+
- **Memory dumps**: Full heap dumps for memory analysis via `ObjectSpace.dump_all`.
|
|
149
|
+
- **Memory samples**: Lightweight sampling to identify memory leaks.
|
|
150
|
+
- **Thread dumps**: Stack traces of all threads.
|
|
151
|
+
- **Scheduler dumps**: Async fiber hierarchy.
|
|
152
|
+
- **Garbage collection profiles**: GC performance data.
|
|
153
|
+
|
|
154
|
+
These can be triggered programmatically or via command-line tools (when available).
|
|
155
|
+
|
|
156
|
+
#### Memory Leak Diagnosis
|
|
157
|
+
|
|
158
|
+
To identify memory leaks, you can use the memory sampling feature which is much lighter weight than a full memory dump. It tracks allocations over a time period and focuses on retained objects.
|
|
159
|
+
|
|
160
|
+
**Using the bake task:**
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
# Sample for 30 seconds and print report to console
|
|
164
|
+
$ bake async:container:supervisor:memory_sample duration=30
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
**Programmatically:**
|
|
168
|
+
|
|
169
|
+
```ruby
|
|
170
|
+
# Assuming you have a connection to a worker:
|
|
171
|
+
result = connection.call(do: :memory_sample, duration: 30)
|
|
172
|
+
puts result[:data]
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
This will sample memory allocations for the specified duration, then force a garbage collection and return a JSON report showing what objects were allocated during that period and retained after GC. Late-lifecycle allocations that are retained are likely memory leaks.
|
|
176
|
+
|
|
177
|
+
The JSON report includes:
|
|
178
|
+
- `total_allocated`: Total allocated memory and count
|
|
179
|
+
- `total_retained`: Total retained memory and count
|
|
180
|
+
- `by_gem`: Breakdown by gem/library
|
|
181
|
+
- `by_file`: Breakdown by source file
|
|
182
|
+
- `by_location`: Breakdown by specific file:line locations
|
|
183
|
+
- `by_class`: Breakdown by object class
|
|
184
|
+
- `strings`: String allocation analysis
|
|
185
|
+
|
|
186
|
+
This is much more efficient than `do: :memory_dump` which uses `ObjectSpace.dump_all` and can be slow and blocking on large heaps. The JSON format also makes it easy to integrate with monitoring and analysis tools.
|
|
187
|
+
|
|
188
|
+
## Advanced Usage
|
|
189
|
+
|
|
190
|
+
### Custom Monitors
|
|
191
|
+
|
|
192
|
+
You can create custom monitors by implementing the monitor interface. A monitor should:
|
|
193
|
+
|
|
194
|
+
1. Accept connections and periodically check worker health
|
|
195
|
+
2. Take action (like restarting workers) when unhealthy conditions are detected
|
|
196
|
+
|
|
197
|
+
### Fault Tolerance
|
|
198
|
+
|
|
199
|
+
The supervisor architecture is designed for fault tolerance:
|
|
200
|
+
|
|
201
|
+
- **Supervisor crashes**: When the supervisor process crashes, the root controller automatically restarts it. Workers detect the disconnection and reconnect to the new supervisor.
|
|
202
|
+
- **Worker crashes**: The container automatically restarts crashed workers based on the `restart: true` configuration.
|
|
203
|
+
- **Communication failures**: Workers gracefully handle supervisor unavailability and will attempt to reconnect.
|
|
204
|
+
|
|
205
|
+
This design ensures your application remains operational even when individual processes fail.
|
data/context/index.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Automatically generated context index for Utopia::Project guides.
|
|
2
|
+
# Do not edit the files in this directory directly; instead, edit the guides and then run `bake utopia:project:agent:context:update`.
|
|
3
|
+
---
|
|
4
|
+
description: A supervisor for managing multiple container processes.
|
|
5
|
+
metadata:
|
|
6
|
+
documentation_uri: https://socketry.github.io/async-service-supervisor/
|
|
7
|
+
source_code_uri: https://github.com/socketry/async-service-supervisor.git
|
|
8
|
+
files:
|
|
9
|
+
- path: getting-started.md
|
|
10
|
+
title: Getting Started
|
|
11
|
+
description: This guide explains how to get started with `async-service-supervisor`
|
|
12
|
+
to supervise and monitor worker processes in your Ruby applications.
|
|
13
|
+
- path: memory-monitor.md
|
|
14
|
+
title: Memory Monitor
|
|
15
|
+
description: This guide explains how to use the <code class="language-ruby">Async::Service::Supervisor::MemoryMonitor</code>
|
|
16
|
+
to detect and restart workers that exceed memory limits or develop memory leaks.
|
|
17
|
+
- path: process-monitor.md
|
|
18
|
+
title: Process Monitor
|
|
19
|
+
description: This guide explains how to use the <code class="language-ruby">Async::Service::Supervisor::ProcessMonitor</code>
|
|
20
|
+
to log CPU and memory metrics for your worker processes.
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Memory Monitor
|
|
2
|
+
|
|
3
|
+
This guide explains how to use the {ruby Async::Service::Supervisor::MemoryMonitor} to detect and restart workers that exceed memory limits or develop memory leaks.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Long-running worker processes often accumulate memory over time, either through legitimate growth or memory leaks. Without intervention, workers can consume all available system memory, causing performance degradation or system crashes. The `MemoryMonitor` solves this by automatically detecting and restarting problematic workers before they impact system stability.
|
|
8
|
+
|
|
9
|
+
Use the `MemoryMonitor` when you need:
|
|
10
|
+
|
|
11
|
+
- **Memory leak protection**: Automatically restart workers that continuously accumulate memory.
|
|
12
|
+
- **Resource limits**: Enforce maximum memory usage per worker.
|
|
13
|
+
- **System stability**: Prevent runaway processes from exhausting system memory.
|
|
14
|
+
- **Leak diagnosis**: Capture memory samples when leaks are detected for debugging.
|
|
15
|
+
|
|
16
|
+
The monitor uses the `memory-leak` gem to track process memory usage over time, detecting abnormal growth patterns that indicate leaks.
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
Add a memory monitor to your supervisor service to automatically restart workers that exceed 500MB:
|
|
21
|
+
|
|
22
|
+
```ruby
|
|
23
|
+
service "supervisor" do
|
|
24
|
+
include Async::Service::Supervisor::Environment
|
|
25
|
+
|
|
26
|
+
monitors do
|
|
27
|
+
[
|
|
28
|
+
Async::Service::Supervisor::MemoryMonitor.new(
|
|
29
|
+
# Check worker memory every 10 seconds:
|
|
30
|
+
interval: 10,
|
|
31
|
+
|
|
32
|
+
# Restart workers exceeding 500MB:
|
|
33
|
+
maximum_size_limit: 1024 * 1024 * 500
|
|
34
|
+
)
|
|
35
|
+
]
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
When a worker exceeds the limit:
|
|
41
|
+
1. The monitor logs the leak detection.
|
|
42
|
+
2. Optionally captures a memory sample for debugging.
|
|
43
|
+
3. Sends `SIGINT` to gracefully shut down the worker.
|
|
44
|
+
4. The container automatically spawns a replacement worker.
|
|
45
|
+
|
|
46
|
+
## Configuration Options
|
|
47
|
+
|
|
48
|
+
The `MemoryMonitor` accepts the following options:
|
|
49
|
+
|
|
50
|
+
### `interval`
|
|
51
|
+
|
|
52
|
+
The interval (in seconds) at which to check for memory leaks. Default: `10` seconds.
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
Async::Service::Supervisor::MemoryMonitor.new(interval: 30)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### `maximum_size_limit`
|
|
59
|
+
|
|
60
|
+
The maximum memory size (in bytes) per process. When a process exceeds this limit, it will be restarted.
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
# 500MB limit
|
|
64
|
+
Async::Service::Supervisor::MemoryMonitor.new(maximum_size_limit: 1024 * 1024 * 500)
|
|
65
|
+
|
|
66
|
+
# 1GB limit
|
|
67
|
+
Async::Service::Supervisor::MemoryMonitor.new(maximum_size_limit: 1024 * 1024 * 1024)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### `total_size_limit`
|
|
71
|
+
|
|
72
|
+
The total size limit (in bytes) for all monitored processes combined. If not specified, only per-process limits are enforced.
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
# Total limit of 2GB across all workers
|
|
76
|
+
Async::Service::Supervisor::MemoryMonitor.new(
|
|
77
|
+
maximum_size_limit: 1024 * 1024 * 500, # 500MB per process
|
|
78
|
+
total_size_limit: 1024 * 1024 * 1024 * 2 # 2GB total
|
|
79
|
+
)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### `memory_sample`
|
|
83
|
+
|
|
84
|
+
Options for capturing memory samples when a leak is detected. If `nil`, memory sampling is disabled.
|
|
85
|
+
|
|
86
|
+
Default: `{duration: 30, timeout: 120}`
|
|
87
|
+
|
|
88
|
+
```ruby
|
|
89
|
+
# Customize memory sampling:
|
|
90
|
+
Async::Service::Supervisor::MemoryMonitor.new(
|
|
91
|
+
memory_sample: {
|
|
92
|
+
duration: 60, # Sample for 60 seconds
|
|
93
|
+
timeout: 180 # Timeout after 180 seconds
|
|
94
|
+
}
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Disable memory sampling:
|
|
98
|
+
Async::Service::Supervisor::MemoryMonitor.new(
|
|
99
|
+
memory_sample: nil
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Memory Leak Detection
|
|
104
|
+
|
|
105
|
+
When a memory leak is detected, the monitor will:
|
|
106
|
+
|
|
107
|
+
1. Log the leak detection with process details.
|
|
108
|
+
2. If `memory_sample` is configured, capture a memory sample from the worker.
|
|
109
|
+
3. Send a `SIGINT` signal to gracefully restart the worker.
|
|
110
|
+
4. The container will automatically restart the worker process.
|
|
111
|
+
|
|
112
|
+
### Memory Sampling
|
|
113
|
+
|
|
114
|
+
When a memory leak is detected and `memory_sample` is configured, the monitor requests a lightweight memory sample from the worker. This sample:
|
|
115
|
+
|
|
116
|
+
- Tracks allocations during the sampling period.
|
|
117
|
+
- Forces a garbage collection.
|
|
118
|
+
- Returns a JSON report showing retained objects.
|
|
119
|
+
|
|
120
|
+
The report includes:
|
|
121
|
+
- `total_allocated`: Total allocated memory and object count.
|
|
122
|
+
- `total_retained`: Total retained memory and count after GC.
|
|
123
|
+
- `by_gem`: Breakdown by gem/library.
|
|
124
|
+
- `by_file`: Breakdown by source file.
|
|
125
|
+
- `by_location`: Breakdown by specific file:line locations.
|
|
126
|
+
- `by_class`: Breakdown by object class.
|
|
127
|
+
- `strings`: String allocation analysis.
|
|
128
|
+
|
|
129
|
+
This is much more efficient than a full heap dump using `ObjectSpace.dump_all`.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Process Monitor
|
|
2
|
+
|
|
3
|
+
This guide explains how to use the {ruby Async::Service::Supervisor::ProcessMonitor} to log CPU and memory metrics for your worker processes.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Understanding how your workers consume resources over time is essential for performance optimization, capacity planning, and debugging. Without visibility into CPU and memory usage, you can't identify bottlenecks, plan infrastructure scaling, or diagnose production issues effectively.
|
|
8
|
+
|
|
9
|
+
The `ProcessMonitor` provides this observability by periodically capturing and logging comprehensive metrics for your entire application process tree.
|
|
10
|
+
|
|
11
|
+
Use the `ProcessMonitor` when you need:
|
|
12
|
+
|
|
13
|
+
- **Performance analysis**: Identify which workers consume the most CPU or memory.
|
|
14
|
+
- **Capacity planning**: Determine optimal worker counts and memory requirements.
|
|
15
|
+
- **Trend monitoring**: Track resource usage patterns over time.
|
|
16
|
+
- **Debugging assistance**: Correlate resource usage with application behavior.
|
|
17
|
+
- **Cost optimization**: Right-size infrastructure based on actual usage.
|
|
18
|
+
|
|
19
|
+
Unlike the {ruby Async::Service::Supervisor::MemoryMonitor}, which takes action when limits are exceeded, the `ProcessMonitor` is purely observational - it logs metrics without interfering with worker processes.
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
Add a process monitor to log resource usage every minute:
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
service "supervisor" do
|
|
27
|
+
include Async::Service::Supervisor::Environment
|
|
28
|
+
|
|
29
|
+
monitors do
|
|
30
|
+
[
|
|
31
|
+
# Log CPU and memory metrics for all processes:
|
|
32
|
+
Async::Service::Supervisor::ProcessMonitor.new(
|
|
33
|
+
interval: 60 # Capture metrics every minute
|
|
34
|
+
)
|
|
35
|
+
]
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
The metrics are logged as structured fields, which allows you to easily search and filter by specific fields:
|
|
41
|
+
- `general.process_id = 12347` - Find metrics for a specific process.
|
|
42
|
+
- `general.command = "worker-1"` - Find all metrics for worker processes.
|
|
43
|
+
- `general.processor_utilization > 50` - Find high CPU usage processes.
|
|
44
|
+
- `general.resident_size > 500000` - Find processes using more than 500MB.
|
|
45
|
+
|
|
46
|
+
## Configuration Options
|
|
47
|
+
|
|
48
|
+
### `interval`
|
|
49
|
+
|
|
50
|
+
The interval (in seconds) at which to capture and log process metrics. Default: `60` seconds.
|
|
51
|
+
|
|
52
|
+
```ruby
|
|
53
|
+
# Log every 30 seconds
|
|
54
|
+
Async::Service::Supervisor::ProcessMonitor.new(interval: 30)
|
|
55
|
+
|
|
56
|
+
# Log every 5 minutes
|
|
57
|
+
Async::Service::Supervisor::ProcessMonitor.new(interval: 300)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Captured Metrics
|
|
61
|
+
|
|
62
|
+
The `ProcessMonitor` captures the following metrics for each process:
|
|
63
|
+
|
|
64
|
+
### Core Metrics
|
|
65
|
+
|
|
66
|
+
- **process_id**: Unique identifier for the process.
|
|
67
|
+
- **parent_process_id**: The parent process that spawned this one.
|
|
68
|
+
- **process_group_id**: Process group identifier.
|
|
69
|
+
- **command**: The command name.
|
|
70
|
+
- **processor_utilization**: CPU usage percentage.
|
|
71
|
+
- **resident_size**: Physical memory used (KB).
|
|
72
|
+
- **total_size**: Total memory space including shared memory (KB).
|
|
73
|
+
- **processor_time**: Total CPU time used (seconds).
|
|
74
|
+
- **elapsed_time**: How long the process has been running (seconds).
|
|
75
|
+
|
|
76
|
+
### Detailed Memory Metrics
|
|
77
|
+
|
|
78
|
+
When available (OS-dependent), additional memory details are captured:
|
|
79
|
+
|
|
80
|
+
- **map_count**: Number of memory mappings (stacks, libraries, etc.).
|
|
81
|
+
- **proportional_size**: Memory usage accounting for shared memory (KB).
|
|
82
|
+
- **shared_clean_size**: Unmodified shared memory (KB).
|
|
83
|
+
- **shared_dirty_size**: Modified shared memory (KB).
|
|
84
|
+
- **private_clean_size**: Unmodified private memory (KB).
|
|
85
|
+
- **private_dirty_size**: Modified private memory (KB).
|
|
86
|
+
- **referenced_size**: Active page-cache (KB).
|
|
87
|
+
- **anonymous_size**: Memory not backed by files (KB).
|
|
88
|
+
- **swap_size**: Memory swapped to disk (KB).
|
|
89
|
+
- **proportional_swap_size**: Proportional swap usage (KB).
|
|
90
|
+
- **major_faults**: The number of page faults requiring I/O.
|
|
91
|
+
- **minor_faults**: The number of page faults that don't require I/O (e.g. CoW).
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Released under the MIT License.
|
|
4
|
+
# Copyright, 2025, by Samuel Williams.
|
|
5
|
+
|
|
6
|
+
require "async/bus/client"
|
|
7
|
+
|
|
8
|
+
module Async
	module Service
		module Supervisor
			# Connects to a supervisor server so that operations can be executed against it.
			class Client < Async::Bus::Client
				# Create a client for the given supervisor endpoint.
				#
				# @parameter endpoint [IO::Endpoint] The supervisor endpoint to connect to. Defaults to {Supervisor.endpoint}.
				# @parameter options [Hash] Additional keyword options, passed through to the superclass initializer.
				def initialize(endpoint: Supervisor.endpoint, **options)
					super(endpoint, **options)
				end
			end
		end
	end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Released under the MIT License.
|
|
4
|
+
# Copyright, 2025, by Samuel Williams.
|
|
5
|
+
|
|
6
|
+
require "io/endpoint/unix_endpoint"
|
|
7
|
+
|
|
8
|
+
module Async
	module Service
		module Supervisor
			class << self
				# Build the IPC endpoint used to talk to the supervisor.
				#
				# @parameter path [String] The path of the Unix socket (default: "supervisor.ipc").
				# @returns [IO::Endpoint] A Unix socket endpoint for the given path.
				def endpoint(path = "supervisor.ipc")
					::IO::Endpoint.unix(path)
				end
			end
		end
	end
end
|
|
21
|
+
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Released under the MIT License.
|
|
4
|
+
# Copyright, 2025, by Samuel Williams.
|
|
5
|
+
|
|
6
|
+
require "async/service/environment"
|
|
7
|
+
require "async/service/managed/environment"
|
|
8
|
+
|
|
9
|
+
require_relative "service"
|
|
10
|
+
|
|
11
|
+
module Async
	module Service
		module Supervisor
			# Environment mixin that configures a supervisor service.
			#
			# It supplies the settings used to run the supervisor process that monitors workers.
			module Environment
				include Async::Service::Managed::Environment
				
				# The service class used to run the supervisor.
				# @returns [Class] Always {Supervisor::Service}.
				def service_class
					Supervisor::Service
				end
				
				# The name of the supervisor service.
				# @returns [String] Always `"supervisor"`.
				def name
					"supervisor"
				end
				
				# The path of the IPC socket used to communicate with the supervisor.
				# @returns [String] `supervisor.ipc` expanded relative to `root`.
				def ipc_path
					::File.expand_path("supervisor.ipc", root)
				end
				
				# The endpoint the supervisor will bind to.
				# @returns [::IO::Endpoint::Generic] A Unix socket endpoint for {ipc_path}.
				def endpoint
					::IO::Endpoint.unix(ipc_path)
				end
				
				# The number of supervisor instances.
				# @returns [Integer] Always 1.
				def count
					1
				end
				
				# Options to use when creating the container.
				#
				# Starts from the inherited options, then sets `restart: true` and `count` to {count}.
				# @returns [Hash] The inherited options merged with the supervisor overrides.
				def container_options
					inherited = super
					inherited.merge(restart: true, count: self.count)
				end
				
				# The monitors to run in the supervisor.
				#
				# Override this method to provide custom monitors; by default there are none.
				#
				# @returns [Array] The list of monitor instances.
				def monitors
					[]
				end
				
				# Build the supervisor server for the given endpoint, using the configured {monitors}.
				#
				# @parameter endpoint [::IO::Endpoint::Generic] The endpoint passed to the server.
				# @returns [Server] The supervisor server.
				def make_server(endpoint)
					server_options = {endpoint: endpoint, monitors: self.monitors}
					Server.new(**server_options)
				end
			end
		end
	end
end
|
|
75
|
+
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Async
	module Service
		module Supervisor
			# A helper for running loops at aligned intervals.
			module Loop
				# Run the given block repeatedly, once per interval boundary, forever.
				#
				# Boundaries are aligned to the wall clock: before each execution the loop sleeps until the current time in seconds reaches the next multiple of `interval`.
				#
				# If a `StandardError` is raised by the block, it is logged and the loop continues. The method therefore only stops when the block exits non-locally (e.g. `break` or `throw`) or when something other than a `StandardError` is raised.
				#
				# @parameter interval [Numeric] The interval in seconds between executions of the block. Must be positive.
				# @yields {...} The work to perform at each interval boundary.
				# @raises [ArgumentError] If `interval` is not positive.
				def self.run(interval: 60, &block)
					# A zero or negative interval never produces a positive wait (zero even makes the modulo NaN), which would turn this into a busy loop, so reject it up front:
					unless interval.positive?
						raise ArgumentError, "Interval must be positive, got #{interval.inspect}!"
					end
					
					loop do
						# Compute the wait time to the next interval boundary:
						wait = interval - (Time.now.to_f % interval)
						
						# Sleep until the next interval boundary:
						sleep(wait) if wait.positive?
						
						begin
							yield
						rescue => error
							Console.error(self, "Loop error:", error)
						end
					end
				end
			end
			
			private_constant :Loop
		end
	end
end
|
|
37
|
+
|