async-container-supervisor 0.6.4 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/bake/async/container/supervisor.rb +19 -0
- data/context/getting-started.md +51 -13
- data/context/index.yaml +8 -0
- data/context/memory-monitor.md +129 -0
- data/context/process-monitor.md +91 -0
- data/lib/async/container/supervisor/client.rb +3 -0
- data/lib/async/container/supervisor/connection.rb +117 -0
- data/lib/async/container/supervisor/dispatchable.rb +8 -0
- data/lib/async/container/supervisor/endpoint.rb +4 -0
- data/lib/async/container/supervisor/environment.rb +11 -0
- data/lib/async/container/supervisor/memory_monitor.rb +25 -1
- data/lib/async/container/supervisor/process_monitor.rb +89 -0
- data/lib/async/container/supervisor/server.rb +78 -1
- data/lib/async/container/supervisor/service.rb +11 -0
- data/lib/async/container/supervisor/supervised.rb +7 -0
- data/lib/async/container/supervisor/version.rb +4 -1
- data/lib/async/container/supervisor/worker.rb +86 -2
- data/lib/async/container/supervisor.rb +1 -10
- data/readme.md +13 -0
- data/releases.md +9 -0
- data.tar.gz.sig +0 -0
- metadata +32 -1
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 39abccaf400a7b793d8f0094e32ccee9a4a9fdad6c6f570e361cace376ebd611
|
|
4
|
+
data.tar.gz: 2f135ee3b0979a16a899a07c760e8aeb46f2474635f9f2862c4eef43b7744961
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ffe7ddc8855501a0c30e35e925596a2aaf262d34a373608e16882295603bb1137f30be80f533adcfc480c957208c96e42907006d0564270bf209763b7d3d81a5
|
|
7
|
+
data.tar.gz: 7b71f2cdcf3f75973fffdaf676f430e34270a6b1be3b684439e50f202854bffddf9ca8d4983a441d9e4fb01319414e80343e9f2a78e4698e964b01f94c71c58e
|
checksums.yaml.gz.sig
CHANGED
|
Binary file
|
|
@@ -29,6 +29,25 @@ def status
|
|
|
29
29
|
end
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
+
# Sample memory allocations from a worker over a time period.
|
|
33
|
+
#
|
|
34
|
+
# This is useful for identifying memory leaks by tracking allocations
|
|
35
|
+
# that are retained after garbage collection.
|
|
36
|
+
#
|
|
37
|
+
# @parameter duration [Integer] The duration in seconds to sample for (default: 10).
|
|
38
|
+
# @parameter connection_id [String] The connection ID to target a specific worker.
|
|
39
|
+
def memory_sample(duration: 10, connection_id:)
|
|
40
|
+
client do |connection|
|
|
41
|
+
Console.info(self, "Sampling memory from worker...", duration: duration, connection_id: connection_id)
|
|
42
|
+
|
|
43
|
+
# Build the operation request:
|
|
44
|
+
operation = {do: :memory_sample, duration: duration}
|
|
45
|
+
|
|
46
|
+
# Use the forward operation to proxy the request to a worker:
|
|
47
|
+
return connection.call(do: :forward, operation: operation, connection_id: connection_id)
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
32
51
|
private
|
|
33
52
|
|
|
34
53
|
def endpoint
|
data/context/getting-started.md
CHANGED
|
@@ -35,12 +35,6 @@ graph TD
|
|
|
35
35
|
Worker1 -.->|connects via IPC| Supervisor
|
|
36
36
|
Worker2 -.->|connects via IPC| Supervisor
|
|
37
37
|
WorkerN -.->|connects via IPC| Supervisor
|
|
38
|
-
|
|
39
|
-
style Controller fill:#e1f5ff
|
|
40
|
-
style Supervisor fill:#fff4e1
|
|
41
|
-
style Worker1 fill:#e8f5e9
|
|
42
|
-
style Worker2 fill:#e8f5e9
|
|
43
|
-
style WorkerN fill:#e8f5e9
|
|
44
38
|
```
|
|
45
39
|
|
|
46
40
|
**Important:** The supervisor process is itself just another process managed by the root controller. If the supervisor crashes, the controller will restart it, and all worker processes will automatically reconnect to the new supervisor. This design ensures high availability and fault tolerance.
|
|
@@ -115,7 +109,13 @@ This will start:
|
|
|
115
109
|
|
|
116
110
|
### Adding Health Monitors
|
|
117
111
|
|
|
118
|
-
You can add monitors to
|
|
112
|
+
You can add monitors to observe worker health and automatically respond to issues. Monitors are useful for:
|
|
113
|
+
|
|
114
|
+
- **Memory leak detection**: Automatically restart workers consuming excessive memory.
|
|
115
|
+
- **Performance monitoring**: Track CPU and memory usage trends.
|
|
116
|
+
- **Capacity planning**: Understand resource requirements.
|
|
117
|
+
|
|
118
|
+
For example, to add monitoring:
|
|
119
119
|
|
|
120
120
|
```ruby
|
|
121
121
|
service "supervisor" do
|
|
@@ -123,29 +123,67 @@ service "supervisor" do
|
|
|
123
123
|
|
|
124
124
|
monitors do
|
|
125
125
|
[
|
|
126
|
-
#
|
|
126
|
+
# Log process metrics for observability:
|
|
127
|
+
Async::Container::Supervisor::ProcessMonitor.new(
|
|
128
|
+
interval: 60
|
|
129
|
+
),
|
|
130
|
+
|
|
131
|
+
# Restart workers exceeding memory limits:
|
|
127
132
|
Async::Container::Supervisor::MemoryMonitor.new(
|
|
128
|
-
interval: 10,
|
|
129
|
-
|
|
133
|
+
interval: 10,
|
|
134
|
+
maximum_size_limit: 1024 * 1024 * 500 # 500MB limit per process
|
|
130
135
|
)
|
|
131
136
|
]
|
|
132
137
|
end
|
|
133
138
|
end
|
|
134
139
|
```
|
|
135
140
|
|
|
136
|
-
|
|
141
|
+
See the {ruby Async::Container::Supervisor::MemoryMonitor Memory Monitor} and {ruby Async::Container::Supervisor::ProcessMonitor Process Monitor} guides for detailed configuration options and best practices.
|
|
137
142
|
|
|
138
143
|
### Collecting Diagnostics
|
|
139
144
|
|
|
140
145
|
The supervisor can collect various diagnostics from workers on demand:
|
|
141
146
|
|
|
142
|
-
- **Memory dumps**: Full heap dumps for memory analysis
|
|
143
|
-
- **
|
|
147
|
+
- **Memory dumps**: Full heap dumps for memory analysis via `ObjectSpace.dump_all`.
|
|
148
|
+
- **Memory samples**: Lightweight sampling to identify memory leaks.
|
|
149
|
+
- **Thread dumps**: Stack traces of all threads.
|
|
144
150
|
- **Scheduler dumps**: Async fiber hierarchy.
|
|
145
151
|
- **Garbage collection profiles**: GC performance data.
|
|
146
152
|
|
|
147
153
|
These can be triggered programmatically or via command-line tools (when available).
|
|
148
154
|
|
|
155
|
+
#### Memory Leak Diagnosis
|
|
156
|
+
|
|
157
|
+
To identify memory leaks, you can use the memory sampling feature which is much lighter weight than a full memory dump. It tracks allocations over a time period and focuses on retained objects.
|
|
158
|
+
|
|
159
|
+
**Using the bake task:**
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Sample for 30 seconds and print report to console
|
|
163
|
+
$ bake async:container:supervisor:memory_sample duration=30 connection_id=CONNECTION_ID
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Programmatically:**
|
|
167
|
+
|
|
168
|
+
```ruby
|
|
169
|
+
# Assuming you have a connection to a worker:
|
|
170
|
+
result = connection.call(do: :memory_sample, duration: 30)
|
|
171
|
+
puts result[:data]
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
This will sample memory allocations for the specified duration, then force a garbage collection and return a JSON report showing what objects were allocated during that period and retained after GC. Late-lifecycle allocations that are retained are likely memory leaks.
|
|
175
|
+
|
|
176
|
+
The JSON report includes:
|
|
177
|
+
- `total_allocated`: Total allocated memory and count
|
|
178
|
+
- `total_retained`: Total retained memory and count
|
|
179
|
+
- `by_gem`: Breakdown by gem/library
|
|
180
|
+
- `by_file`: Breakdown by source file
|
|
181
|
+
- `by_location`: Breakdown by specific file:line locations
|
|
182
|
+
- `by_class`: Breakdown by object class
|
|
183
|
+
- `strings`: String allocation analysis
|
|
184
|
+
|
|
185
|
+
This is much more efficient than `do: :memory_dump` which uses `ObjectSpace.dump_all` and can be slow and blocking on large heaps. The JSON format also makes it easy to integrate with monitoring and analysis tools.
|
|
186
|
+
|
|
149
187
|
## Advanced Usage
|
|
150
188
|
|
|
151
189
|
### Custom Monitors
|
data/context/index.yaml
CHANGED
|
@@ -10,3 +10,11 @@ files:
|
|
|
10
10
|
title: Getting Started
|
|
11
11
|
description: This guide explains how to get started with `async-container-supervisor`
|
|
12
12
|
to supervise and monitor worker processes in your Ruby applications.
|
|
13
|
+
- path: memory-monitor.md
|
|
14
|
+
title: Memory Monitor
|
|
15
|
+
description: This guide explains how to use the <code class="language-ruby">Async::Container::Supervisor::MemoryMonitor</code>
|
|
16
|
+
to detect and restart workers that exceed memory limits or develop memory leaks.
|
|
17
|
+
- path: process-monitor.md
|
|
18
|
+
title: Process Monitor
|
|
19
|
+
description: This guide explains how to use the <code class="language-ruby">Async::Container::Supervisor::ProcessMonitor</code>
|
|
20
|
+
to log CPU and memory metrics for your worker processes.
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Memory Monitor
|
|
2
|
+
|
|
3
|
+
This guide explains how to use the {ruby Async::Container::Supervisor::MemoryMonitor} to detect and restart workers that exceed memory limits or develop memory leaks.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Long-running worker processes often accumulate memory over time, either through legitimate growth or memory leaks. Without intervention, workers can consume all available system memory, causing performance degradation or system crashes. The `MemoryMonitor` solves this by automatically detecting and restarting problematic workers before they impact system stability.
|
|
8
|
+
|
|
9
|
+
Use the `MemoryMonitor` when you need:
|
|
10
|
+
|
|
11
|
+
- **Memory leak protection**: Automatically restart workers that continuously accumulate memory.
|
|
12
|
+
- **Resource limits**: Enforce maximum memory usage per worker.
|
|
13
|
+
- **System stability**: Prevent runaway processes from exhausting system memory.
|
|
14
|
+
- **Leak diagnosis**: Capture memory samples when leaks are detected for debugging.
|
|
15
|
+
|
|
16
|
+
The monitor uses the `memory-leak` gem to track process memory usage over time, detecting abnormal growth patterns that indicate leaks.
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
Add a memory monitor to your supervisor service to automatically restart workers that exceed 500MB:
|
|
21
|
+
|
|
22
|
+
```ruby
|
|
23
|
+
service "supervisor" do
|
|
24
|
+
include Async::Container::Supervisor::Environment
|
|
25
|
+
|
|
26
|
+
monitors do
|
|
27
|
+
[
|
|
28
|
+
Async::Container::Supervisor::MemoryMonitor.new(
|
|
29
|
+
# Check worker memory every 10 seconds:
|
|
30
|
+
interval: 10,
|
|
31
|
+
|
|
32
|
+
# Restart workers exceeding 500MB:
|
|
33
|
+
maximum_size_limit: 1024 * 1024 * 500
|
|
34
|
+
)
|
|
35
|
+
]
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
When a worker exceeds the limit:
|
|
41
|
+
1. The monitor logs the leak detection.
|
|
42
|
+
2. Optionally captures a memory sample for debugging.
|
|
43
|
+
3. Sends `SIGINT` to gracefully shut down the worker.
|
|
44
|
+
4. The container automatically spawns a replacement worker.
|
|
45
|
+
|
|
46
|
+
## Configuration Options
|
|
47
|
+
|
|
48
|
+
The `MemoryMonitor` accepts the following options:
|
|
49
|
+
|
|
50
|
+
### `interval`
|
|
51
|
+
|
|
52
|
+
The interval (in seconds) at which to check for memory leaks. Default: `10` seconds.
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
Async::Container::Supervisor::MemoryMonitor.new(interval: 30)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### `maximum_size_limit`
|
|
59
|
+
|
|
60
|
+
The maximum memory size (in bytes) per process. When a process exceeds this limit, it will be restarted.
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
# 500MB limit
|
|
64
|
+
Async::Container::Supervisor::MemoryMonitor.new(maximum_size_limit: 1024 * 1024 * 500)
|
|
65
|
+
|
|
66
|
+
# 1GB limit
|
|
67
|
+
Async::Container::Supervisor::MemoryMonitor.new(maximum_size_limit: 1024 * 1024 * 1024)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### `total_size_limit`
|
|
71
|
+
|
|
72
|
+
The total size limit (in bytes) for all monitored processes combined. If not specified, only per-process limits are enforced.
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
# Total limit of 2GB across all workers
|
|
76
|
+
Async::Container::Supervisor::MemoryMonitor.new(
|
|
77
|
+
maximum_size_limit: 1024 * 1024 * 500, # 500MB per process
|
|
78
|
+
total_size_limit: 1024 * 1024 * 1024 * 2 # 2GB total
|
|
79
|
+
)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### `memory_sample`
|
|
83
|
+
|
|
84
|
+
Options for capturing memory samples when a leak is detected. If `nil`, memory sampling is disabled.
|
|
85
|
+
|
|
86
|
+
Default: `{duration: 30, timeout: 120}`
|
|
87
|
+
|
|
88
|
+
```ruby
|
|
89
|
+
# Customize memory sampling:
|
|
90
|
+
Async::Container::Supervisor::MemoryMonitor.new(
|
|
91
|
+
memory_sample: {
|
|
92
|
+
duration: 60, # Sample for 60 seconds
|
|
93
|
+
timeout: 180 # Timeout after 180 seconds
|
|
94
|
+
}
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Disable memory sampling:
|
|
98
|
+
Async::Container::Supervisor::MemoryMonitor.new(
|
|
99
|
+
memory_sample: nil
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Memory Leak Detection
|
|
104
|
+
|
|
105
|
+
When a memory leak is detected, the monitor will:
|
|
106
|
+
|
|
107
|
+
1. Log the leak detection with process details.
|
|
108
|
+
2. If `memory_sample` is configured, capture a memory sample from the worker.
|
|
109
|
+
3. Send a `SIGINT` signal to gracefully restart the worker.
|
|
110
|
+
4. The container will automatically restart the worker process.
|
|
111
|
+
|
|
112
|
+
### Memory Sampling
|
|
113
|
+
|
|
114
|
+
When a memory leak is detected and `memory_sample` is configured, the monitor requests a lightweight memory sample from the worker. This sample:
|
|
115
|
+
|
|
116
|
+
- Tracks allocations during the sampling period.
|
|
117
|
+
- Forces a garbage collection.
|
|
118
|
+
- Returns a JSON report showing retained objects.
|
|
119
|
+
|
|
120
|
+
The report includes:
|
|
121
|
+
- `total_allocated`: Total allocated memory and object count.
|
|
122
|
+
- `total_retained`: Total retained memory and count after GC.
|
|
123
|
+
- `by_gem`: Breakdown by gem/library.
|
|
124
|
+
- `by_file`: Breakdown by source file.
|
|
125
|
+
- `by_location`: Breakdown by specific file:line locations.
|
|
126
|
+
- `by_class`: Breakdown by object class.
|
|
127
|
+
- `strings`: String allocation analysis.
|
|
128
|
+
|
|
129
|
+
This is much more efficient than a full heap dump using `ObjectSpace.dump_all`.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Process Monitor
|
|
2
|
+
|
|
3
|
+
This guide explains how to use the {ruby Async::Container::Supervisor::ProcessMonitor} to log CPU and memory metrics for your worker processes.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Understanding how your workers consume resources over time is essential for performance optimization, capacity planning, and debugging. Without visibility into CPU and memory usage, you can't identify bottlenecks, plan infrastructure scaling, or diagnose production issues effectively.
|
|
8
|
+
|
|
9
|
+
The `ProcessMonitor` provides this observability by periodically capturing and logging comprehensive metrics for your entire application process tree.
|
|
10
|
+
|
|
11
|
+
Use the `ProcessMonitor` when you need:
|
|
12
|
+
|
|
13
|
+
- **Performance analysis**: Identify which workers consume the most CPU or memory.
|
|
14
|
+
- **Capacity planning**: Determine optimal worker counts and memory requirements.
|
|
15
|
+
- **Trend monitoring**: Track resource usage patterns over time.
|
|
16
|
+
- **Debugging assistance**: Correlate resource usage with application behavior.
|
|
17
|
+
- **Cost optimization**: Right-size infrastructure based on actual usage.
|
|
18
|
+
|
|
19
|
+
Unlike the {ruby Async::Container::Supervisor::MemoryMonitor}, which takes action when limits are exceeded, the `ProcessMonitor` is purely observational - it logs metrics without interfering with worker processes.
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
Add a process monitor to log resource usage every minute:
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
service "supervisor" do
|
|
27
|
+
include Async::Container::Supervisor::Environment
|
|
28
|
+
|
|
29
|
+
monitors do
|
|
30
|
+
[
|
|
31
|
+
# Log CPU and memory metrics for all processes:
|
|
32
|
+
Async::Container::Supervisor::ProcessMonitor.new(
|
|
33
|
+
interval: 60 # Capture metrics every minute
|
|
34
|
+
)
|
|
35
|
+
]
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Each process's metrics are logged under named fields (such as `general.process_id`), which allows you to easily search and filter your logs by specific fields:
|
|
41
|
+
- `general.process_id = 12347` - Find metrics for a specific process.
|
|
42
|
+
- `general.command = "worker-1"` - Find all metrics for worker processes.
|
|
43
|
+
- `general.processor_utilization > 50` - Find high CPU usage processes.
|
|
44
|
+
- `general.resident_size > 500000` - Find processes using more than 500MB.
|
|
45
|
+
|
|
46
|
+
## Configuration Options
|
|
47
|
+
|
|
48
|
+
### `interval`
|
|
49
|
+
|
|
50
|
+
The interval (in seconds) at which to capture and log process metrics. Default: `60` seconds.
|
|
51
|
+
|
|
52
|
+
```ruby
|
|
53
|
+
# Log every 30 seconds
|
|
54
|
+
Async::Container::Supervisor::ProcessMonitor.new(interval: 30)
|
|
55
|
+
|
|
56
|
+
# Log every 5 minutes
|
|
57
|
+
Async::Container::Supervisor::ProcessMonitor.new(interval: 300)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Captured Metrics
|
|
61
|
+
|
|
62
|
+
The `ProcessMonitor` captures the following metrics for each process:
|
|
63
|
+
|
|
64
|
+
### Core Metrics
|
|
65
|
+
|
|
66
|
+
- **process_id**: Unique identifier for the process.
|
|
67
|
+
- **parent_process_id**: The parent process that spawned this one.
|
|
68
|
+
- **process_group_id**: Process group identifier.
|
|
69
|
+
- **command**: The command name.
|
|
70
|
+
- **processor_utilization**: CPU usage percentage.
|
|
71
|
+
- **resident_size**: Physical memory used (KB).
|
|
72
|
+
- **total_size**: Total memory space including shared memory (KB).
|
|
73
|
+
- **processor_time**: Total CPU time used (seconds).
|
|
74
|
+
- **elapsed_time**: How long the process has been running (seconds).
|
|
75
|
+
|
|
76
|
+
### Detailed Memory Metrics
|
|
77
|
+
|
|
78
|
+
When available (OS-dependent), additional memory details are captured:
|
|
79
|
+
|
|
80
|
+
- **map_count**: Number of memory mappings (stacks, libraries, etc.).
|
|
81
|
+
- **proportional_size**: Memory usage accounting for shared memory (KB).
|
|
82
|
+
- **shared_clean_size**: Unmodified shared memory (KB).
|
|
83
|
+
- **shared_dirty_size**: Modified shared memory (KB).
|
|
84
|
+
- **private_clean_size**: Unmodified private memory (KB).
|
|
85
|
+
- **private_dirty_size**: Modified private memory (KB).
|
|
86
|
+
- **referenced_size**: Active page-cache (KB).
|
|
87
|
+
- **anonymous_size**: Memory not backed by files (KB).
|
|
88
|
+
- **swap_size**: Memory swapped to disk (KB).
|
|
89
|
+
- **proportional_swap_size**: Proportional swap usage (KB).
|
|
90
|
+
- **major_faults**: The number of page faults requiring I/O.
|
|
91
|
+
- **minor_faults**: The number of page faults that don't require I/O (e.g. CoW).
|
|
@@ -11,6 +11,9 @@ module Async
|
|
|
11
11
|
module Supervisor
|
|
12
12
|
# A client provides a mechanism to connect to a supervisor server in order to execute operations.
|
|
13
13
|
class Client
|
|
14
|
+
# Initialize a new client.
|
|
15
|
+
#
|
|
16
|
+
# @parameter endpoint [IO::Endpoint] The supervisor endpoint to connect to.
|
|
14
17
|
def initialize(endpoint: Supervisor.endpoint)
|
|
15
18
|
@endpoint = endpoint
|
|
16
19
|
end
|
|
@@ -8,8 +8,19 @@ require "json"
|
|
|
8
8
|
module Async
|
|
9
9
|
module Container
|
|
10
10
|
module Supervisor
|
|
11
|
+
# Represents a bidirectional communication channel between supervisor and worker.
|
|
12
|
+
#
|
|
13
|
+
# Handles message passing, call/response patterns, and connection lifecycle.
|
|
11
14
|
class Connection
|
|
15
|
+
# Represents a remote procedure call over a connection.
|
|
16
|
+
#
|
|
17
|
+
# Manages the call lifecycle, response queueing, and completion signaling.
|
|
12
18
|
class Call
|
|
19
|
+
# Initialize a new call.
|
|
20
|
+
#
|
|
21
|
+
# @parameter connection [Connection] The connection this call belongs to.
|
|
22
|
+
# @parameter id [Integer] The unique call identifier.
|
|
23
|
+
# @parameter message [Hash] The call message/parameters.
|
|
13
24
|
def initialize(connection, id, message)
|
|
14
25
|
@connection = connection
|
|
15
26
|
@id = id
|
|
@@ -18,10 +29,16 @@ module Async
|
|
|
18
29
|
@queue = ::Thread::Queue.new
|
|
19
30
|
end
|
|
20
31
|
|
|
32
|
+
# Convert the call to a JSON-compatible hash.
|
|
33
|
+
#
|
|
34
|
+
# @returns [Hash] The message hash.
|
|
21
35
|
def as_json(...)
|
|
22
36
|
@message
|
|
23
37
|
end
|
|
24
38
|
|
|
39
|
+
# Convert the call to a JSON string.
|
|
40
|
+
#
|
|
41
|
+
# @returns [String] The JSON representation.
|
|
25
42
|
def to_json(...)
|
|
26
43
|
as_json.to_json(...)
|
|
27
44
|
end
|
|
@@ -32,14 +49,24 @@ module Async
|
|
|
32
49
|
# @attribute [Hash] The message that initiated the call.
|
|
33
50
|
attr :message
|
|
34
51
|
|
|
52
|
+
# Access a parameter from the call message.
|
|
53
|
+
#
|
|
54
|
+
# @parameter key [Symbol] The parameter name.
|
|
55
|
+
# @returns [Object] The parameter value.
|
|
35
56
|
def [] key
|
|
36
57
|
@message[key]
|
|
37
58
|
end
|
|
38
59
|
|
|
60
|
+
# Push a response into the call's queue.
|
|
61
|
+
#
|
|
62
|
+
# @parameter response [Hash] The response data to push.
|
|
39
63
|
def push(**response)
|
|
40
64
|
@queue.push(response)
|
|
41
65
|
end
|
|
42
66
|
|
|
67
|
+
# Pop a response from the call's queue.
|
|
68
|
+
#
|
|
69
|
+
# @returns [Hash, nil] The next response or nil if queue is closed.
|
|
43
70
|
def pop(...)
|
|
44
71
|
@queue.pop(...)
|
|
45
72
|
end
|
|
@@ -49,12 +76,20 @@ module Async
|
|
|
49
76
|
@queue.close
|
|
50
77
|
end
|
|
51
78
|
|
|
79
|
+
# Iterate over all responses from the call.
|
|
80
|
+
#
|
|
81
|
+
# @yields {|response| ...} Each response from the queue.
|
|
52
82
|
def each(&block)
|
|
53
83
|
while response = self.pop
|
|
54
84
|
yield response
|
|
55
85
|
end
|
|
56
86
|
end
|
|
57
87
|
|
|
88
|
+
# Finish the call with a final response.
|
|
89
|
+
#
|
|
90
|
+
# Closes the response queue after pushing the final response.
|
|
91
|
+
#
|
|
92
|
+
# @parameter response [Hash] The final response data.
|
|
58
93
|
def finish(**response)
|
|
59
94
|
# If the remote end has already closed the connection, we don't need to send a finished message:
|
|
60
95
|
unless @queue.closed?
|
|
@@ -63,14 +98,51 @@ module Async
|
|
|
63
98
|
end
|
|
64
99
|
end
|
|
65
100
|
|
|
101
|
+
# Finish the call with a failure response.
|
|
102
|
+
#
|
|
103
|
+
# @parameter response [Hash] The error response data.
|
|
66
104
|
def fail(**response)
|
|
67
105
|
self.finish(failed: true, **response)
|
|
68
106
|
end
|
|
69
107
|
|
|
108
|
+
# Check if the call's queue is closed.
|
|
109
|
+
#
|
|
110
|
+
# @returns [Boolean] True if the queue is closed.
|
|
70
111
|
def closed?
|
|
71
112
|
@queue.closed?
|
|
72
113
|
end
|
|
73
114
|
|
|
115
|
+
# Forward this call to another connection, proxying all responses back.
|
|
116
|
+
#
|
|
117
|
+
# This provides true streaming forwarding - intermediate responses flow through
|
|
118
|
+
# in real-time rather than being buffered. The forwarding runs asynchronously
|
|
119
|
+
# to avoid blocking the dispatcher.
|
|
120
|
+
#
|
|
121
|
+
# @parameter target_connection [Connection] The connection to forward the call to.
|
|
122
|
+
# @parameter operation [Hash] The operation request to forward (must include :do key).
|
|
123
|
+
def forward(target_connection, operation)
|
|
124
|
+
# Forward the operation in an async task to avoid blocking
|
|
125
|
+
Async do
|
|
126
|
+
# Make the call to the target connection and stream responses back:
|
|
127
|
+
Call.call(target_connection, **operation) do |response|
|
|
128
|
+
# Push each response through our queue:
|
|
129
|
+
self.push(**response)
|
|
130
|
+
end
|
|
131
|
+
ensure
|
|
132
|
+
# Close our queue to signal completion:
|
|
133
|
+
@queue.close
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Dispatch a call to a target handler.
|
|
138
|
+
#
|
|
139
|
+
# Creates a call, dispatches it to the target, and streams responses back
|
|
140
|
+
# through the connection.
|
|
141
|
+
#
|
|
142
|
+
# @parameter connection [Connection] The connection to dispatch on.
|
|
143
|
+
# @parameter target [Dispatchable] The target handler.
|
|
144
|
+
# @parameter id [Integer] The call identifier.
|
|
145
|
+
# @parameter message [Hash] The call message.
|
|
74
146
|
def self.dispatch(connection, target, id, message)
|
|
75
147
|
Async do
|
|
76
148
|
call = self.new(connection, id, message)
|
|
@@ -91,6 +163,15 @@ module Async
|
|
|
91
163
|
end
|
|
92
164
|
end
|
|
93
165
|
|
|
166
|
+
# Make a call on a connection and wait for responses.
|
|
167
|
+
#
|
|
168
|
+
# If a block is provided, yields each response. Otherwise, buffers intermediate
|
|
169
|
+
# responses and returns the final response.
|
|
170
|
+
#
|
|
171
|
+
# @parameter connection [Connection] The connection to call on.
|
|
172
|
+
# @parameter message [Hash] The call message/parameters.
|
|
173
|
+
# @yields {|response| ...} Each intermediate response if block given.
|
|
174
|
+
# @returns [Hash, Array] The final response or array of intermediate responses.
|
|
94
175
|
def self.call(connection, **message, &block)
|
|
95
176
|
id = connection.next_id
|
|
96
177
|
call = self.new(connection, id, message)
|
|
@@ -128,6 +209,11 @@ module Async
|
|
|
128
209
|
end
|
|
129
210
|
end
|
|
130
211
|
|
|
212
|
+
# Initialize a new connection.
|
|
213
|
+
#
|
|
214
|
+
# @parameter stream [IO] The underlying IO stream.
|
|
215
|
+
# @parameter id [Integer] The starting call ID (default: 0).
|
|
216
|
+
# @parameter state [Hash] Initial connection state.
|
|
131
217
|
def initialize(stream, id = 0, **state)
|
|
132
218
|
@stream = stream
|
|
133
219
|
@id = id
|
|
@@ -143,15 +229,26 @@ module Async
|
|
|
143
229
|
# @attribute [Hash(Symbol, Object)] State associated with this connection, for example the process ID, etc.
|
|
144
230
|
attr_accessor :state
|
|
145
231
|
|
|
232
|
+
# Generate the next unique call ID.
|
|
233
|
+
#
|
|
234
|
+
# @returns [Integer] The next call identifier.
|
|
146
235
|
def next_id
|
|
147
236
|
@id += 2
|
|
148
237
|
end
|
|
149
238
|
|
|
239
|
+
# Write a message to the connection stream.
|
|
240
|
+
#
|
|
241
|
+
# @parameter message [Hash] The message to write.
|
|
150
242
|
def write(**message)
|
|
151
243
|
@stream.write(JSON.dump(message) << "\n")
|
|
152
244
|
@stream.flush
|
|
153
245
|
end
|
|
154
246
|
|
|
247
|
+
# Make a synchronous call and wait for a single response.
|
|
248
|
+
#
|
|
249
|
+
# @parameter timeout [Numeric, nil] Optional timeout for the call.
|
|
250
|
+
# @parameter message [Hash] The call message.
|
|
251
|
+
# @returns [Hash] The response.
|
|
155
252
|
def call(timeout: nil, **message)
|
|
156
253
|
id = next_id
|
|
157
254
|
calls[id] = ::Thread::Queue.new
|
|
@@ -163,22 +260,34 @@ module Async
|
|
|
163
260
|
calls.delete(id)
|
|
164
261
|
end
|
|
165
262
|
|
|
263
|
+
# Read a message from the connection stream.
|
|
264
|
+
#
|
|
265
|
+
# @returns [Hash, nil] The parsed message or nil if stream is closed.
|
|
166
266
|
def read
|
|
167
267
|
if line = @stream&.gets
|
|
168
268
|
JSON.parse(line, symbolize_names: true)
|
|
169
269
|
end
|
|
170
270
|
end
|
|
171
271
|
|
|
272
|
+
# Iterate over all messages from the connection.
|
|
273
|
+
#
|
|
274
|
+
# @yields {|message| ...} Each message read from the stream.
|
|
172
275
|
def each
|
|
173
276
|
while message = self.read
|
|
174
277
|
yield message
|
|
175
278
|
end
|
|
176
279
|
end
|
|
177
280
|
|
|
281
|
+
# Make a call over this connection by delegating to {Call.call}.
|
|
178
282
|
def call(...)
|
|
179
283
|
Call.call(self, ...)
|
|
180
284
|
end
|
|
181
285
|
|
|
286
|
+
# Run the connection, processing incoming messages.
|
|
287
|
+
#
|
|
288
|
+
# Dispatches incoming calls to the target and routes responses to waiting calls.
|
|
289
|
+
#
|
|
290
|
+
# @parameter target [Dispatchable] The target to dispatch calls to.
|
|
182
291
|
def run(target)
|
|
183
292
|
self.each do |message|
|
|
184
293
|
if id = message.delete(:id)
|
|
@@ -198,12 +307,20 @@ module Async
|
|
|
198
307
|
end
|
|
199
308
|
end
|
|
200
309
|
|
|
310
|
+
# Run the connection in a background task.
|
|
311
|
+
#
|
|
312
|
+
# @parameter target [Dispatchable] The target to dispatch calls to.
|
|
313
|
+
# @parameter parent [Async::Task] The parent task.
|
|
314
|
+
# @returns [Async::Task] The background reader task.
|
|
201
315
|
def run_in_background(target, parent: Task.current)
|
|
202
316
|
@reader ||= parent.async do
|
|
203
317
|
self.run(target)
|
|
204
318
|
end
|
|
205
319
|
end
|
|
206
320
|
|
|
321
|
+
# Close the connection and clean up resources.
|
|
322
|
+
#
|
|
323
|
+
# Stops the background reader, closes the stream, and closes all pending calls.
|
|
207
324
|
def close
|
|
208
325
|
if @reader
|
|
209
326
|
@reader.stop
|
|
@@ -9,7 +9,15 @@ require_relative "endpoint"
|
|
|
9
9
|
module Async
|
|
10
10
|
module Container
|
|
11
11
|
module Supervisor
|
|
12
|
+
# A mixin for objects that can dispatch calls.
|
|
13
|
+
#
|
|
14
|
+
# Provides automatic method dispatch based on the call's `:do` parameter.
|
|
12
15
|
module Dispatchable
|
|
16
|
+
# Dispatch a call to the appropriate method.
|
|
17
|
+
#
|
|
18
|
+
# Routes calls to methods named `do_#{operation}` based on the call's `:do` parameter.
|
|
19
|
+
#
|
|
20
|
+
# @parameter call [Connection::Call] The call to dispatch.
|
|
13
21
|
def dispatch(call)
|
|
14
22
|
method_name = "do_#{call.message[:do]}"
|
|
15
23
|
self.public_send(method_name, call)
|
|
@@ -8,6 +8,10 @@ require "io/endpoint/unix_endpoint"
|
|
|
8
8
|
module Async
|
|
9
9
|
module Container
|
|
10
10
|
module Supervisor
|
|
11
|
+
# Get the supervisor IPC endpoint.
|
|
12
|
+
#
|
|
13
|
+
# @parameter path [String] The path for the Unix socket (default: "supervisor.ipc").
|
|
14
|
+
# @returns [IO::Endpoint] The Unix socket endpoint.
|
|
11
15
|
def self.endpoint(path = "supervisor.ipc")
|
|
12
16
|
::IO::Endpoint.unix(path)
|
|
13
17
|
end
|
|
@@ -10,6 +10,9 @@ require_relative "service"
|
|
|
10
10
|
module Async
|
|
11
11
|
module Container
|
|
12
12
|
module Supervisor
|
|
13
|
+
# An environment mixin for supervisor services.
|
|
14
|
+
#
|
|
15
|
+
# Provides configuration and setup for supervisor processes that monitor workers.
|
|
13
16
|
module Environment
|
|
14
17
|
# The service class to use for the supervisor.
|
|
15
18
|
# @returns [Class]
|
|
@@ -40,10 +43,18 @@ module Async
|
|
|
40
43
|
{restart: true, count: 1, health_check_timeout: 30}
|
|
41
44
|
end
|
|
42
45
|
|
|
46
|
+
# Get the list of monitors to run in the supervisor.
|
|
47
|
+
#
|
|
48
|
+
# Override this method to provide custom monitors.
|
|
49
|
+
#
|
|
50
|
+
# @returns [Array] The list of monitor instances.
|
|
43
51
|
def monitors
|
|
44
52
|
[]
|
|
45
53
|
end
|
|
46
54
|
|
|
55
|
+
# Create the supervisor server instance.
|
|
56
|
+
#
|
|
57
|
+
# @returns [Server] The supervisor server.
|
|
47
58
|
def make_server(endpoint)
|
|
48
59
|
Server.new(endpoint: endpoint, monitors: self.monitors)
|
|
49
60
|
end
|
|
@@ -9,16 +9,23 @@ require "set"
|
|
|
9
9
|
module Async
|
|
10
10
|
module Container
|
|
11
11
|
module Supervisor
|
|
12
|
+
# Monitors worker memory usage and restarts workers that exceed limits.
|
|
13
|
+
#
|
|
14
|
+
# Uses the `memory-leak` gem to track process memory and detect leaks.
|
|
12
15
|
class MemoryMonitor
|
|
16
|
+
MEMORY_SAMPLE = {duration: 30, timeout: 30*4}
|
|
17
|
+
|
|
13
18
|
# Create a new memory monitor.
|
|
14
19
|
#
|
|
15
20
|
# @parameter interval [Integer] The interval at which to check for memory leaks.
|
|
16
21
|
# @parameter total_size_limit [Integer] The total size limit of all processes, or nil for no limit.
|
|
17
22
|
# @parameter options [Hash] Options to pass to the cluster when adding processes.
|
|
18
|
-
def initialize(interval: 10, total_size_limit: nil, **options)
|
|
23
|
+
def initialize(interval: 10, total_size_limit: nil, memory_sample: MEMORY_SAMPLE, **options)
|
|
19
24
|
@interval = interval
|
|
20
25
|
@cluster = Memory::Leak::Cluster.new(total_size_limit: total_size_limit)
|
|
21
26
|
|
|
27
|
+
@memory_sample = memory_sample
|
|
28
|
+
|
|
22
29
|
# We use these options when adding processes to the cluster:
|
|
23
30
|
@options = options
|
|
24
31
|
|
|
@@ -74,6 +81,23 @@ module Async
|
|
|
74
81
|
# @parameter monitor [Memory::Leak::Monitor] The monitor that detected the memory leak.
|
|
75
82
|
# @returns [Boolean] True if the process was killed.
|
|
76
83
|
def memory_leak_detected(process_id, monitor)
|
|
84
|
+
Console.info(self, "Memory leak detected!", child: {process_id: process_id}, monitor: monitor)
|
|
85
|
+
|
|
86
|
+
if @memory_sample
|
|
87
|
+
Console.info(self, "Capturing memory sample...", child: {process_id: process_id}, memory_sample: @memory_sample)
|
|
88
|
+
|
|
89
|
+
# We are tracking multiple connections to the same process:
|
|
90
|
+
connections = @processes[process_id]
|
|
91
|
+
|
|
92
|
+
# Try to capture a memory sample:
|
|
93
|
+
connections.each do |connection|
|
|
94
|
+
result = connection.call(do: :memory_sample, **@memory_sample)
|
|
95
|
+
|
|
96
|
+
Console.info(self, "Memory sample completed:", child: {process_id: process_id}, result: result)
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# Kill the process gently:
|
|
77
101
|
Console.info(self, "Killing process!", child: {process_id: process_id})
|
|
78
102
|
Process.kill(:INT, process_id)
|
|
79
103
|
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Released under the MIT License.
|
|
4
|
+
# Copyright, 2025, by Samuel Williams.
|
|
5
|
+
|
|
6
|
+
require "process/metrics"
|
|
7
|
+
|
|
8
|
+
module Async
|
|
9
|
+
module Container
|
|
10
|
+
module Supervisor
|
|
11
|
+
# Monitors process metrics and logs them periodically.
|
|
12
|
+
#
|
|
13
|
+
# Uses the `process-metrics` gem to capture CPU and memory metrics for a process tree.
|
|
14
|
+
# Unlike {MemoryMonitor}, this monitor captures metrics for the entire process tree
|
|
15
|
+
# by tracking the parent process ID (ppid), which is more efficient than tracking
|
|
16
|
+
# individual processes.
|
|
17
|
+
class ProcessMonitor
|
|
18
|
+
# Create a new process monitor.
|
|
19
|
+
#
|
|
20
|
+
# @parameter interval [Integer] The interval in seconds at which to log process metrics.
|
|
21
|
+
# @parameter ppid [Integer] The parent process ID to monitor. If nil, defaults to `Process.ppid` (the parent of the current process), so that process's children are captured.
|
|
22
|
+
def initialize(interval: 60, ppid: nil)
|
|
23
|
+
@interval = interval
|
|
24
|
+
@ppid = ppid || Process.ppid
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# @attribute [Integer] The parent process ID being monitored.
|
|
28
|
+
attr :ppid
|
|
29
|
+
|
|
30
|
+
# Register a connection with the process monitor.
|
|
31
|
+
#
|
|
32
|
+
# This is provided for consistency with {MemoryMonitor}, but since we monitor
|
|
33
|
+
# the entire process tree via ppid, we don't need to track individual connections.
|
|
34
|
+
#
|
|
35
|
+
# @parameter connection [Connection] The connection to register.
|
|
36
|
+
def register(connection)
|
|
37
|
+
Console.debug(self, "Connection registered.", connection: connection, state: connection.state)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Remove a connection from the process monitor.
|
|
41
|
+
#
|
|
42
|
+
# This is provided for consistency with {MemoryMonitor}, but since we monitor
|
|
43
|
+
# the entire process tree via ppid, we don't need to track individual connections.
|
|
44
|
+
#
|
|
45
|
+
# @parameter connection [Connection] The connection to remove.
|
|
46
|
+
def remove(connection)
|
|
47
|
+
Console.debug(self, "Connection removed.", connection: connection, state: connection.state)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Capture current process metrics for the entire process tree.
|
|
51
|
+
#
|
|
52
|
+
# @returns [Hash] A hash mapping process IDs to their metrics.
|
|
53
|
+
def metrics
|
|
54
|
+
Process::Metrics::General.capture(ppid: @ppid)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Dump the current status of the process monitor.
|
|
58
|
+
#
|
|
59
|
+
# @parameter call [Connection::Call] The call to respond to.
|
|
60
|
+
def status(call)
|
|
61
|
+
metrics = self.metrics
|
|
62
|
+
|
|
63
|
+
call.push(process_monitor: {ppid: @ppid, metrics: metrics})
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Run the process monitor.
|
|
67
|
+
#
|
|
68
|
+
# Periodically captures and logs process metrics for the entire process tree.
|
|
69
|
+
#
|
|
70
|
+
# @returns [Async::Task] The task that is running the process monitor.
|
|
71
|
+
def run
|
|
72
|
+
Async do
|
|
73
|
+
while true
|
|
74
|
+
metrics = self.metrics
|
|
75
|
+
|
|
76
|
+
# Log each process individually for better searchability in log platforms:
|
|
77
|
+
metrics.each do |process_id, general|
|
|
78
|
+
Console.info(self, "Process metrics captured.", general: general)
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
sleep(@interval)
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
# Released under the MIT License.
|
|
4
4
|
# Copyright, 2025, by Samuel Williams.
|
|
5
5
|
|
|
6
|
+
require "securerandom"
|
|
7
|
+
|
|
6
8
|
require_relative "connection"
|
|
7
9
|
require_relative "endpoint"
|
|
8
10
|
require_relative "dispatchable"
|
|
@@ -14,18 +16,36 @@ module Async
|
|
|
14
16
|
#
|
|
15
17
|
# There are various tasks that can be executed by the server, such as restarting the process group, and querying the status of the processes. The server is also responsible for managing the lifecycle of the monitors, which can be used to monitor the status of the connected workers.
|
|
16
18
|
class Server
|
|
19
|
+
# Initialize a new supervisor server.
|
|
20
|
+
#
|
|
21
|
+
# @parameter monitors [Array] The monitors to run.
|
|
22
|
+
# @parameter endpoint [IO::Endpoint] The endpoint to listen on.
|
|
17
23
|
def initialize(monitors: [], endpoint: Supervisor.endpoint)
|
|
18
24
|
@monitors = monitors
|
|
19
25
|
@endpoint = endpoint
|
|
26
|
+
|
|
27
|
+
@connections = {}
|
|
20
28
|
end
|
|
21
29
|
|
|
22
30
|
attr :monitors
|
|
31
|
+
attr :connections
|
|
23
32
|
|
|
24
33
|
include Dispatchable
|
|
25
34
|
|
|
35
|
+
# Register a worker connection with the supervisor.
|
|
36
|
+
#
|
|
37
|
+
# Assigns a unique connection ID and notifies all monitors of the new connection.
|
|
38
|
+
#
|
|
39
|
+
# @parameter call [Connection::Call] The registration call.
|
|
40
|
+
# @parameter call[:state] [Hash] The worker state to merge (e.g. process_id).
|
|
26
41
|
def do_register(call)
|
|
27
42
|
call.connection.state.merge!(call.message[:state])
|
|
28
43
|
|
|
44
|
+
connection_id = SecureRandom.uuid
|
|
45
|
+
call.connection.state[:connection_id] = connection_id
|
|
46
|
+
|
|
47
|
+
@connections[connection_id] = call.connection
|
|
48
|
+
|
|
29
49
|
@monitors.each do |monitor|
|
|
30
50
|
monitor.register(call.connection)
|
|
31
51
|
rescue => error
|
|
@@ -35,6 +55,35 @@ module Async
|
|
|
35
55
|
call.finish
|
|
36
56
|
end
|
|
37
57
|
|
|
58
|
+
# Forward an operation to a worker connection.
|
|
59
|
+
#
|
|
60
|
+
# This allows clients to invoke operations on specific worker processes by
|
|
61
|
+
# providing a connection_id. The operation is proxied through to the worker
|
|
62
|
+
# and responses are streamed back to the client.
|
|
63
|
+
#
|
|
64
|
+
# @parameter call [Connection::Call] The call to handle.
|
|
65
|
+
# @parameter call[:operation] [Hash] The operation to forward, must include :do key.
|
|
66
|
+
# @parameter call[:connection_id] [String] The connection ID to target.
|
|
67
|
+
def do_forward(call)
|
|
68
|
+
operation = call[:operation]
|
|
69
|
+
connection_id = call[:connection_id]
|
|
70
|
+
|
|
71
|
+
unless connection_id
|
|
72
|
+
call.fail(error: "Missing 'connection_id' parameter")
|
|
73
|
+
return
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
connection = @connections[connection_id]
|
|
77
|
+
|
|
78
|
+
unless connection
|
|
79
|
+
call.fail(error: "Connection not found", connection_id: connection_id)
|
|
80
|
+
return
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Forward the call to the target connection
|
|
84
|
+
call.forward(connection, operation)
|
|
85
|
+
end
|
|
86
|
+
|
|
38
87
|
# Restart the current process group, usually including the supervisor and any other processes.
|
|
39
88
|
#
|
|
40
89
|
# @parameter signal [Symbol] The signal to send to the process group.
|
|
@@ -47,15 +96,38 @@ module Async
|
|
|
47
96
|
::Process.kill(signal, ::Process.ppid)
|
|
48
97
|
end
|
|
49
98
|
|
|
99
|
+
# Query the status of the supervisor and all connected workers.
|
|
100
|
+
#
|
|
101
|
+
# Returns information about all registered connections and delegates to
|
|
102
|
+
# monitors to provide additional status information.
|
|
103
|
+
#
|
|
104
|
+
# @parameter call [Connection::Call] The status call.
|
|
50
105
|
def do_status(call)
|
|
106
|
+
connections = @connections.map do |connection_id, connection|
|
|
107
|
+
{
|
|
108
|
+
connection_id: connection_id,
|
|
109
|
+
process_id: connection.state[:process_id],
|
|
110
|
+
state: connection.state,
|
|
111
|
+
}
|
|
112
|
+
end
|
|
113
|
+
|
|
51
114
|
@monitors.each do |monitor|
|
|
52
115
|
monitor.status(call)
|
|
53
116
|
end
|
|
54
117
|
|
|
55
|
-
call.finish
|
|
118
|
+
call.finish(connections: connections)
|
|
56
119
|
end
|
|
57
120
|
|
|
121
|
+
# Remove a worker connection from the supervisor.
|
|
122
|
+
#
|
|
123
|
+
# Notifies all monitors and removes the connection from tracking.
|
|
124
|
+
#
|
|
125
|
+
# @parameter connection [Connection] The connection to remove.
|
|
58
126
|
def remove(connection)
|
|
127
|
+
if connection_id = connection.state[:connection_id]
|
|
128
|
+
@connections.delete(connection_id)
|
|
129
|
+
end
|
|
130
|
+
|
|
59
131
|
@monitors.each do |monitor|
|
|
60
132
|
monitor.remove(connection)
|
|
61
133
|
rescue => error
|
|
@@ -63,6 +135,11 @@ module Async
|
|
|
63
135
|
end
|
|
64
136
|
end
|
|
65
137
|
|
|
138
|
+
# Run the supervisor server.
|
|
139
|
+
#
|
|
140
|
+
# Starts all monitors and accepts connections from workers.
|
|
141
|
+
#
|
|
142
|
+
# @parameter parent [Async::Task] The parent task to run under.
|
|
66
143
|
def run(parent: Task.current)
|
|
67
144
|
parent.async do |task|
|
|
68
145
|
@monitors.each do |monitor|
|
|
@@ -10,6 +10,9 @@ require "io/endpoint/bound_endpoint"
|
|
|
10
10
|
module Async
|
|
11
11
|
module Container
|
|
12
12
|
module Supervisor
|
|
13
|
+
# The supervisor service implementation.
|
|
14
|
+
#
|
|
15
|
+
# Manages the lifecycle of the supervisor server and its monitors.
|
|
13
16
|
class Service < Async::Service::Generic
|
|
14
17
|
# Initialize the supervisor using the given environment.
|
|
15
18
|
# @parameter environment [Build::Environment]
|
|
@@ -32,10 +35,18 @@ module Async
|
|
|
32
35
|
super
|
|
33
36
|
end
|
|
34
37
|
|
|
38
|
+
# Get the name of the supervisor service.
|
|
39
|
+
#
|
|
40
|
+
# @returns [String] The service name.
|
|
35
41
|
def name
|
|
36
42
|
@evaluator.name
|
|
37
43
|
end
|
|
38
44
|
|
|
45
|
+
# Set up the supervisor service in the container.
|
|
46
|
+
#
|
|
47
|
+
# Creates and runs the supervisor server with configured monitors.
|
|
48
|
+
#
|
|
49
|
+
# @parameter container [Async::Container::Generic] The container to set up in.
|
|
39
50
|
def setup(container)
|
|
40
51
|
container_options = @evaluator.container_options
|
|
41
52
|
health_check_timeout = container_options[:health_check_timeout]
|
|
@@ -8,6 +8,9 @@ require "async/service/environment"
|
|
|
8
8
|
module Async
|
|
9
9
|
module Container
|
|
10
10
|
module Supervisor
|
|
11
|
+
# An environment mixin for supervised worker services.
|
|
12
|
+
#
|
|
13
|
+
# Enables workers to connect to and be supervised by the supervisor.
|
|
11
14
|
module Supervised
|
|
12
15
|
# The IPC path to use for communication with the supervisor.
|
|
13
16
|
# @returns [String]
|
|
@@ -21,6 +24,10 @@ module Async
|
|
|
21
24
|
::IO::Endpoint.unix(supervisor_ipc_path)
|
|
22
25
|
end
|
|
23
26
|
|
|
27
|
+
# Create a supervised worker for the given instance.
|
|
28
|
+
#
|
|
29
|
+
# @parameter instance [Async::Container::Instance] The container instance.
|
|
30
|
+
# @returns [Worker] The worker client.
|
|
24
31
|
def make_supervised_worker(instance)
|
|
25
32
|
Worker.new(instance, endpoint: supervisor_endpoint)
|
|
26
33
|
end
|
|
@@ -13,10 +13,18 @@ module Async
|
|
|
13
13
|
#
|
|
14
14
|
# There are various tasks that can be executed by the worker, such as dumping memory, threads, and garbage collection profiles.
|
|
15
15
|
class Worker < Client
|
|
16
|
+
# Run a worker with the given state.
|
|
17
|
+
#
|
|
18
|
+
# @parameter state [Hash] The worker state (e.g. process_id, instance info).
|
|
19
|
+
# @parameter endpoint [IO::Endpoint] The supervisor endpoint to connect to.
|
|
16
20
|
def self.run(...)
|
|
17
21
|
self.new(...).run
|
|
18
22
|
end
|
|
19
23
|
|
|
24
|
+
# Initialize a new worker.
|
|
25
|
+
#
|
|
26
|
+
# @parameter state [Hash] The worker state to register with the supervisor.
|
|
27
|
+
# @parameter endpoint [IO::Endpoint] The supervisor endpoint to connect to.
|
|
20
28
|
def initialize(state, endpoint: Supervisor.endpoint)
|
|
21
29
|
@state = state
|
|
22
30
|
@endpoint = endpoint
|
|
@@ -39,12 +47,25 @@ module Async
|
|
|
39
47
|
end
|
|
40
48
|
end
|
|
41
49
|
|
|
50
|
+
# Dump the current fiber scheduler hierarchy.
|
|
51
|
+
#
|
|
52
|
+
# Generates a hierarchical view of all running fibers and their relationships.
|
|
53
|
+
#
|
|
54
|
+
# @parameter call [Connection::Call] The call to respond to.
|
|
55
|
+
# @parameter call[:path] [String] Optional file path to save the dump.
|
|
42
56
|
def do_scheduler_dump(call)
|
|
43
57
|
dump(call) do |file|
|
|
44
58
|
Fiber.scheduler.print_hierarchy(file)
|
|
45
59
|
end
|
|
46
60
|
end
|
|
47
61
|
|
|
62
|
+
# Dump the entire object space to a file.
|
|
63
|
+
#
|
|
64
|
+
# This is a heavyweight operation that dumps all objects in the heap.
|
|
65
|
+
# Consider using {do_memory_sample} for lighter weight memory leak detection.
|
|
66
|
+
#
|
|
67
|
+
# @parameter call [Connection::Call] The call to respond to.
|
|
68
|
+
# @parameter call[:path] [String] Optional file path to save the dump.
|
|
48
69
|
def do_memory_dump(call)
|
|
49
70
|
require "objspace"
|
|
50
71
|
|
|
@@ -53,6 +74,58 @@ module Async
|
|
|
53
74
|
end
|
|
54
75
|
end
|
|
55
76
|
|
|
77
|
+
# Sample memory allocations over a time period to identify potential leaks.
|
|
78
|
+
#
|
|
79
|
+
# This method is much lighter weight than {do_memory_dump} and focuses on
|
|
80
|
+
# retained objects allocated during the sampling period. Late-lifecycle
|
|
81
|
+
# allocations that are retained are likely memory leaks.
|
|
82
|
+
#
|
|
83
|
+
# The method samples allocations for the specified duration, forces a garbage
|
|
84
|
+
# collection, and returns a JSON report showing allocated vs retained memory
|
|
85
|
+
# broken down by gem, file, location, and class.
|
|
86
|
+
#
|
|
87
|
+
# @parameter call [Connection::Call] The call to respond to.
|
|
88
|
+
# @parameter call[:duration] [Numeric] The duration in seconds to sample for. Required and must be positive, otherwise an `ArgumentError` is raised.
|
|
89
|
+
def do_memory_sample(call)
|
|
90
|
+
require "memory"
|
|
91
|
+
|
|
92
|
+
unless duration = call[:duration] and duration.positive?
|
|
93
|
+
raise ArgumentError, "Positive duration is required!"
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
Console.info(self, "Starting memory sampling...", duration: duration)
|
|
97
|
+
|
|
98
|
+
# Create a sampler to track allocations
|
|
99
|
+
sampler = Memory::Sampler.new
|
|
100
|
+
|
|
101
|
+
# Start sampling
|
|
102
|
+
sampler.start
|
|
103
|
+
|
|
104
|
+
# Sample for the specified duration
|
|
105
|
+
sleep(duration)
|
|
106
|
+
|
|
107
|
+
# Stop sampling
|
|
108
|
+
sampler.stop
|
|
109
|
+
|
|
110
|
+
report = sampler.report
|
|
111
|
+
|
|
112
|
+
# This is a temporary log to help with debugging:
|
|
113
|
+
buffer = StringIO.new
|
|
114
|
+
report.print(buffer)
|
|
115
|
+
Console.info(self, "Memory sample completed.", report: buffer.string)
|
|
116
|
+
|
|
117
|
+
# Send the sampler's report back to the caller and finish the call:
|
|
118
|
+
call.finish(report: report)
|
|
119
|
+
ensure
|
|
120
|
+
GC.start
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Dump information about all running threads.
|
|
124
|
+
#
|
|
125
|
+
# Includes thread inspection and backtraces for debugging.
|
|
126
|
+
#
|
|
127
|
+
# @parameter call [Connection::Call] The call to respond to.
|
|
128
|
+
# @parameter call[:path] [String] Optional file path to save the dump.
|
|
56
129
|
def do_thread_dump(call)
|
|
57
130
|
dump(call) do |file|
|
|
58
131
|
Thread.list.each do |thread|
|
|
@@ -62,17 +135,28 @@ module Async
|
|
|
62
135
|
end
|
|
63
136
|
end
|
|
64
137
|
|
|
138
|
+
# Start garbage collection profiling.
|
|
139
|
+
#
|
|
140
|
+
# Enables the GC profiler to track garbage collection performance.
|
|
141
|
+
#
|
|
142
|
+
# @parameter call [Connection::Call] The call to respond to.
|
|
65
143
|
def do_garbage_profile_start(call)
|
|
66
144
|
GC::Profiler.enable
|
|
67
145
|
call.finish(started: true)
|
|
68
146
|
end
|
|
69
147
|
|
|
148
|
+
# Stop garbage collection profiling and return results.
|
|
149
|
+
#
|
|
150
|
+
# Disables the GC profiler and returns collected profiling data.
|
|
151
|
+
#
|
|
152
|
+
# @parameter call [Connection::Call] The call to respond to.
|
|
153
|
+
# @parameter call[:path] [String] Optional file path to save the profile.
|
|
70
154
|
def do_garbage_profile_stop(call)
|
|
71
|
-
GC::Profiler.disable
|
|
72
|
-
|
|
73
155
|
dump(connection, message) do |file|
|
|
74
156
|
file.puts GC::Profiler.result
|
|
75
157
|
end
|
|
158
|
+
ensure
|
|
159
|
+
GC::Profiler.disable
|
|
76
160
|
end
|
|
77
161
|
|
|
78
162
|
protected def connected!(connection)
|
|
@@ -10,16 +10,7 @@ require_relative "supervisor/worker"
|
|
|
10
10
|
require_relative "supervisor/client"
|
|
11
11
|
|
|
12
12
|
require_relative "supervisor/memory_monitor"
|
|
13
|
+
require_relative "supervisor/process_monitor"
|
|
13
14
|
|
|
14
15
|
require_relative "supervisor/environment"
|
|
15
16
|
require_relative "supervisor/supervised"
|
|
16
|
-
|
|
17
|
-
# @namespace
|
|
18
|
-
module Async
|
|
19
|
-
# @namespace
|
|
20
|
-
module Container
|
|
21
|
-
# @namespace
|
|
22
|
-
module Supervisor
|
|
23
|
-
end
|
|
24
|
-
end
|
|
25
|
-
end
|
data/readme.md
CHANGED
|
@@ -18,10 +18,23 @@ Please see the [project documentation](https://socketry.github.io/async-containe
|
|
|
18
18
|
|
|
19
19
|
- [Getting Started](https://socketry.github.io/async-container-supervisor/guides/getting-started/index) - This guide explains how to get started with `async-container-supervisor` to supervise and monitor worker processes in your Ruby applications.
|
|
20
20
|
|
|
21
|
+
- [Memory Monitor](https://socketry.github.io/async-container-supervisor/guides/memory-monitor/index) - This guide explains how to use the <code class="language-ruby">Async::Container::Supervisor::MemoryMonitor</code> to detect and restart workers that exceed memory limits or develop memory leaks.
|
|
22
|
+
|
|
23
|
+
- [Process Monitor](https://socketry.github.io/async-container-supervisor/guides/process-monitor/index) - This guide explains how to use the <code class="language-ruby">Async::Container::Supervisor::ProcessMonitor</code> to log CPU and memory metrics for your worker processes.
|
|
24
|
+
|
|
21
25
|
## Releases
|
|
22
26
|
|
|
23
27
|
Please see the [project releases](https://socketry.github.io/async-container-supervisor/releases/index) for all releases.
|
|
24
28
|
|
|
29
|
+
### v0.8.0
|
|
30
|
+
|
|
31
|
+
- Add `Async::Container::Supervisor::ProcessMonitor` for logging CPU and memory metrics periodically.
|
|
32
|
+
- Fix documentation to use correct `maximum_size_limit:` parameter name for `MemoryMonitor` (was incorrectly documented as `limit:`).
|
|
33
|
+
|
|
34
|
+
### v0.7.0
|
|
35
|
+
|
|
36
|
+
- If a memory leak is detected, sample memory usage for 60 seconds before exiting.
|
|
37
|
+
|
|
25
38
|
### v0.6.4
|
|
26
39
|
|
|
27
40
|
- Make client task (in supervised worker) transient, so that it doesn't keep the reactor alive unnecessarily. It also won't be stopped by default when SIGINT is received, so that the worker will remain connected to the supervisor until the worker is completely terminated.
|
data/releases.md
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
# Releases
|
|
2
2
|
|
|
3
|
+
## v0.8.0
|
|
4
|
+
|
|
5
|
+
- Add `Async::Container::Supervisor::ProcessMonitor` for logging CPU and memory metrics periodically.
|
|
6
|
+
- Fix documentation to use correct `maximum_size_limit:` parameter name for `MemoryMonitor` (was incorrectly documented as `limit:`).
|
|
7
|
+
|
|
8
|
+
## v0.7.0
|
|
9
|
+
|
|
10
|
+
- If a memory leak is detected, sample memory usage for 60 seconds before exiting.
|
|
11
|
+
|
|
3
12
|
## v0.6.4
|
|
4
13
|
|
|
5
14
|
- Make client task (in supervised worker) transient, so that it doesn't keep the reactor alive unnecessarily. It also won't be stopped by default when SIGINT is received, so that the worker will remain connected to the supervisor until the worker is completely terminated.
|
data.tar.gz.sig
CHANGED
|
Binary file
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: async-container-supervisor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.4
|
|
4
|
+
version: 0.8.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Samuel Williams
|
|
@@ -66,6 +66,20 @@ dependencies:
|
|
|
66
66
|
- - ">="
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: '0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: memory
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - "~>"
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '0.7'
|
|
76
|
+
type: :runtime
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - "~>"
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0.7'
|
|
69
83
|
- !ruby/object:Gem::Dependency
|
|
70
84
|
name: memory-leak
|
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -80,6 +94,20 @@ dependencies:
|
|
|
80
94
|
- - "~>"
|
|
81
95
|
- !ruby/object:Gem::Version
|
|
82
96
|
version: '0.5'
|
|
97
|
+
- !ruby/object:Gem::Dependency
|
|
98
|
+
name: process-metrics
|
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
|
100
|
+
requirements:
|
|
101
|
+
- - ">="
|
|
102
|
+
- !ruby/object:Gem::Version
|
|
103
|
+
version: '0'
|
|
104
|
+
type: :runtime
|
|
105
|
+
prerelease: false
|
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - ">="
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: '0'
|
|
83
111
|
executables: []
|
|
84
112
|
extensions: []
|
|
85
113
|
extra_rdoc_files: []
|
|
@@ -87,6 +115,8 @@ files:
|
|
|
87
115
|
- bake/async/container/supervisor.rb
|
|
88
116
|
- context/getting-started.md
|
|
89
117
|
- context/index.yaml
|
|
118
|
+
- context/memory-monitor.md
|
|
119
|
+
- context/process-monitor.md
|
|
90
120
|
- lib/async/container/supervisor.rb
|
|
91
121
|
- lib/async/container/supervisor/client.rb
|
|
92
122
|
- lib/async/container/supervisor/connection.rb
|
|
@@ -94,6 +124,7 @@ files:
|
|
|
94
124
|
- lib/async/container/supervisor/endpoint.rb
|
|
95
125
|
- lib/async/container/supervisor/environment.rb
|
|
96
126
|
- lib/async/container/supervisor/memory_monitor.rb
|
|
127
|
+
- lib/async/container/supervisor/process_monitor.rb
|
|
97
128
|
- lib/async/container/supervisor/server.rb
|
|
98
129
|
- lib/async/container/supervisor/service.rb
|
|
99
130
|
- lib/async/container/supervisor/supervised.rb
|
metadata.gz.sig
CHANGED
|
Binary file
|