crazy-workers 1.4.2__tar.gz → 1.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/PKG-INFO +21 -6
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/README.md +20 -5
- crazy_workers-1.5.1/crazy_workers/__init__.py +24 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/status.py +22 -2
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/engine.py +76 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/__init__.py +2 -2
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/starter.py +10 -5
- crazy_workers-1.5.1/crazy_workers/daemon/reconciler.py +181 -0
- crazy_workers-1.5.1/crazy_workers/params.py +45 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/PKG-INFO +21 -6
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/SOURCES.txt +3 -1
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/pyproject.toml +2 -2
- crazy_workers-1.5.1/tests/test_params.py +51 -0
- crazy_workers-1.4.2/crazy_workers/__init__.py +0 -6
- crazy_workers-1.4.2/crazy_workers/daemon/reconciler.py +0 -118
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/LICENSE +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/_bootstrap.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/__init__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/__main__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/base.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/detect.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/entry.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/orchestrator.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/systemd.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/windows.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/__init__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/__init__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/params.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/starter.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/stopper.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/discovery.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/main.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/ui.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/client.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/__init__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/backend.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/lister.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/recoverer.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/stopper.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/recovery.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/daemon/__init__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/daemon/__main__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/daemon/runner.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/database/__init__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/database/schema.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/database/storage.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/testing/__init__.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/testing/backends.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/testing/polling.py +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/dependency_links.txt +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/entry_points.txt +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/requires.txt +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/top_level.txt +0 -0
- {crazy_workers-1.4.2 → crazy_workers-1.5.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: crazy-workers
|
|
3
|
-
Version: 1.4.2
|
|
3
|
+
Version: 1.5.1
|
|
4
4
|
Summary: A Python library for managing background worker processes with persistent state, automatic recovery, and a CLI.
|
|
5
5
|
Author: GioVanni Colasanto
|
|
6
6
|
License: MIT
|
|
@@ -76,9 +76,11 @@ pip install .
|
|
|
76
76
|
|
|
77
77
|
```python
|
|
78
78
|
# workers/my_worker.py
|
|
79
|
-
import
|
|
79
|
+
import time
|
|
80
80
|
|
|
81
|
-
|
|
81
|
+
from crazy_workers import parse_params
|
|
82
|
+
|
|
83
|
+
params = parse_params()
|
|
82
84
|
duration = params.get('duration', 60)
|
|
83
85
|
|
|
84
86
|
for _ in range(duration):
|
|
@@ -176,15 +178,28 @@ Closes the database connection and clears internal process references. Does **no
|
|
|
176
178
|
|
|
177
179
|
## Worker Script Contract
|
|
178
180
|
|
|
179
|
-
A worker receives its parameters as a JSON string in `sys.argv[1]
|
|
181
|
+
A worker receives its parameters as a JSON string in `sys.argv[1]`. Use
|
|
182
|
+
`parse_params` instead of decoding it by hand:
|
|
180
183
|
|
|
181
184
|
```python
|
|
182
|
-
import
|
|
185
|
+
from crazy_workers import parse_params
|
|
183
186
|
|
|
184
|
-
params =
|
|
187
|
+
params = parse_params()
|
|
185
188
|
# ... do work ...
|
|
186
189
|
```
|
|
187
190
|
|
|
191
|
+
Without arguments it is lenient: a worker launched with no parameters gets
|
|
192
|
+
`{}`. Pass `required=` to abort before any real work when a parameter the
|
|
193
|
+
worker cannot run without is missing or empty — the worker exits with code 1
|
|
194
|
+
and a message on stderr:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
params = parse_params(required=('device_id', 'output_dir'))
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
`parse_params(argv=...)` accepts an explicit argv list, which makes the
|
|
201
|
+
parsing trivially testable without patching `sys.argv`.
|
|
202
|
+
|
|
188
203
|
A worker is a separate process, so it cannot be handed a live object (e.g. a DB
|
|
189
204
|
connection). Pass **configuration** instead: the manager's `worker_env` (and any
|
|
190
205
|
per-call `env`) is injected as environment variables, so a worker reads, say,
|
|
@@ -41,9 +41,11 @@ pip install .
|
|
|
41
41
|
|
|
42
42
|
```python
|
|
43
43
|
# workers/my_worker.py
|
|
44
|
-
import
|
|
44
|
+
import time
|
|
45
45
|
|
|
46
|
-
|
|
46
|
+
from crazy_workers import parse_params
|
|
47
|
+
|
|
48
|
+
params = parse_params()
|
|
47
49
|
duration = params.get('duration', 60)
|
|
48
50
|
|
|
49
51
|
for _ in range(duration):
|
|
@@ -141,15 +143,28 @@ Closes the database connection and clears internal process references. Does **no
|
|
|
141
143
|
|
|
142
144
|
## Worker Script Contract
|
|
143
145
|
|
|
144
|
-
A worker receives its parameters as a JSON string in `sys.argv[1]
|
|
146
|
+
A worker receives its parameters as a JSON string in `sys.argv[1]`. Use
|
|
147
|
+
`parse_params` instead of decoding it by hand:
|
|
145
148
|
|
|
146
149
|
```python
|
|
147
|
-
import
|
|
150
|
+
from crazy_workers import parse_params
|
|
148
151
|
|
|
149
|
-
params =
|
|
152
|
+
params = parse_params()
|
|
150
153
|
# ... do work ...
|
|
151
154
|
```
|
|
152
155
|
|
|
156
|
+
Without arguments it is lenient: a worker launched with no parameters gets
|
|
157
|
+
`{}`. Pass `required=` to abort before any real work when a parameter the
|
|
158
|
+
worker cannot run without is missing or empty — the worker exits with code 1
|
|
159
|
+
and a message on stderr:
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
params = parse_params(required=('device_id', 'output_dir'))
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
`parse_params(argv=...)` accepts an explicit argv list, which makes the
|
|
166
|
+
parsing trivially testable without patching `sys.argv`.
|
|
167
|
+
|
|
153
168
|
A worker is a separate process, so it cannot be handed a live object (e.g. a DB
|
|
154
169
|
connection). Pass **configuration** instead: the manager's `worker_env` (and any
|
|
155
170
|
per-call `env`) is injected as environment variables, so a worker reads, say,
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
__all__ = ['WorkerManager', 'WorkerClient', 'WorkerStatus', 'DesiredStatus', 'parse_params']
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Resolve exports lazily (PEP 562) so a worker doing `from crazy_workers import
|
|
5
|
+
# parse_params` only imports the lightweight params module — not the control
|
|
6
|
+
# plane (WorkerClient/WorkerManager) and its heavy SQLAlchemy dependencies.
|
|
7
|
+
def __getattr__(name):
    """Resolve the package's public names on first access (PEP 562).

    Each export is imported only when it is actually requested, so asking for
    ``parse_params`` pulls in just the ``params`` module and leaves the
    manager/client/schema modules untouched.

    Raises:
        AttributeError: ``name`` is not one of the lazily exported names.
    """
    if name in ('DesiredStatus', 'WorkerStatus'):
        # Both enums live in the same schema module; fetch whichever was asked for.
        from .database import schema

        return getattr(schema, name)

    if name == 'parse_params':
        from .params import parse_params as resolved
    elif name == 'WorkerManager':
        from .core.manager import WorkerManager as resolved
    elif name == 'WorkerClient':
        from .client import WorkerClient as resolved
    else:
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

    return resolved
|
|
@@ -6,12 +6,13 @@ from datetime import datetime
|
|
|
6
6
|
from rich.panel import Panel
|
|
7
7
|
from rich.table import Table
|
|
8
8
|
|
|
9
|
+
from ...core.engine import resolve_system_pid
|
|
9
10
|
from ..ui import console
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
def show_status(client, workers_dir, json_mode=False):
|
|
13
14
|
"""Observability hub: the target state store plus the worker table (desired vs actual)."""
|
|
14
|
-
workers = _merge_with_filesystem(client.list(), workers_dir)
|
|
15
|
+
workers = _with_system_pids(_merge_with_filesystem(client.list(), workers_dir))
|
|
15
16
|
|
|
16
17
|
if json_mode:
|
|
17
18
|
sys.stdout.write(json.dumps({'workers': workers}) + '\n')
|
|
@@ -70,6 +71,25 @@ def _merge_with_filesystem(db_workers, workers_dir):
|
|
|
70
71
|
return results
|
|
71
72
|
|
|
72
73
|
|
|
74
|
+
def _with_system_pids(workers):
    """Return shallow copies of the worker dicts, each with a ``system_pid`` key.

    The input dicts are left untouched. For every copy, ``system_pid`` is set
    to whatever ``resolve_system_pid`` returns for the worker's ``pid`` and
    ``worker_key`` (a missing key is passed as ``None``).
    """

    def _enrich(record):
        copy = dict(record)
        copy['system_pid'] = resolve_system_pid(copy.get('pid'), worker_key=copy.get('worker_key'))
        return copy

    return [_enrich(record) for record in workers]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _format_pid(worker):
    """Build the text shown for a worker's PID.

    Returns ``'-'`` when the worker has no (or a falsy) ``pid``. When a truthy
    ``system_pid`` differs from ``pid`` the system PID is shown first, followed
    by the control PID as a dimmed ``(ns <pid>)`` suffix; otherwise the plain
    ``pid`` is returned as a string.
    """
    control_pid = worker.get('pid')
    native_pid = worker.get('system_pid')

    if control_pid:
        differs = bool(native_pid) and native_pid != control_pid
        return f'{native_pid} [dim](ns {control_pid})[/dim]' if differs else str(control_pid)
    return '-'
|
|
91
|
+
|
|
92
|
+
|
|
73
93
|
def _build_table(workers):
|
|
74
94
|
table = Table(
|
|
75
95
|
title='[bold cyan]Workers — desired vs actual[/bold cyan]', border_style='cyan', header_style='bold magenta'
|
|
@@ -114,7 +134,7 @@ def _build_table(workers):
|
|
|
114
134
|
w['worker_type'],
|
|
115
135
|
f'[{desired_style}]{desired}[/{desired_style}]',
|
|
116
136
|
f'[{status_style}]{status}[/{status_style}]',
|
|
117
|
-
|
|
137
|
+
_format_pid(w),
|
|
118
138
|
last_action,
|
|
119
139
|
params_str,
|
|
120
140
|
)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import os
|
|
2
3
|
import psutil
|
|
3
4
|
import subprocess
|
|
4
5
|
|
|
@@ -64,6 +65,81 @@ def is_process_running(pid):
|
|
|
64
65
|
return False
|
|
65
66
|
|
|
66
67
|
|
|
68
|
+
def resolve_system_pid(pid, worker_key=None):
    """Return the most-native PID visible for ``pid``.

    ``pid`` itself stays the control PID of the current process namespace.
    Resolution order on POSIX:

    1. If ``CRAZY_WORKERS_HOST_PROC`` points at a host procfs mount (for
       example /host/proc in Docker), the host-side PID found there wins.
    2. Otherwise the ``NSpid:`` line of ``/proc/<pid>/status`` is read; its
       first value is the PID in the outermost namespace visible to this
       procfs mount.
    3. If neither yields a value (or the status file cannot be read/parsed),
       ``pid`` is returned unchanged.

    ``None`` is returned for a ``None`` pid, and on non-POSIX platforms the
    input is returned as-is.
    """
    if pid is None or os.name != 'posix':
        return pid

    from_host = _resolve_from_host_proc(pid, worker_key=worker_key)
    if from_host is not None:
        return from_host

    resolved = pid
    try:
        with open(f'/proc/{pid}/status', encoding='utf-8') as status_file:
            nspid_line = next((text for text in status_file if text.startswith('NSpid:')), None)
        if nspid_line is not None:
            # Parse every field so a malformed entry anywhere falls back to ``pid``.
            numbers = [int(token) for token in nspid_line.split()[1:]]
            if numbers:
                resolved = numbers[0]
    except (OSError, ValueError):
        resolved = pid
    return resolved
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _resolve_from_host_proc(pid, worker_key=None):
    """Look ``pid`` up in the procfs named by ``CRAZY_WORKERS_HOST_PROC``.

    Every numeric entry of that directory is inspected: an entry matches when
    the last value of its ``NSpid:`` line equals ``pid`` and, if ``worker_key``
    is given, its command line also matches that key. The first ``NSpid`` value
    of the matching entry is returned.

    The lookup is best-effort. ``None`` is returned when the variable is unset
    or empty, when the path cannot be listed (missing, not a directory, or not
    readable), or when no entry matches. Entries whose status file cannot be
    read or parsed are skipped.
    """
    host_proc = os.environ.get('CRAZY_WORKERS_HOST_PROC')
    if not host_proc:
        return None

    # Listing directly (instead of isdir() followed by listdir()) also covers a
    # mount that is unreadable or disappears between the two calls.
    try:
        entries = os.listdir(host_proc)
    except OSError:
        return None

    for entry in entries:
        if not entry.isdigit():
            continue

        status_path = os.path.join(host_proc, entry, 'status')
        try:
            with open(status_path, encoding='utf-8') as f:
                nspid = _read_nspid(f)
        except (OSError, ValueError):
            # Process exited mid-scan, or the file is unreadable/malformed.
            continue

        if not nspid or nspid[-1] != pid:
            continue
        if worker_key and not _host_proc_cmdline_matches(host_proc, entry, worker_key):
            continue
        return nspid[0]

    return None
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _read_nspid(lines):
    """Return the integers on the first ``NSpid:`` line of ``lines``.

    ``lines`` is any iterable of text lines. ``None`` is returned when no line
    starts with ``NSpid:``; a non-numeric field raises ``ValueError``.
    """
    match = next((text for text in lines if text.startswith('NSpid:')), None)
    if match is None:
        return None
    return [int(token) for token in match.split()[1:]]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _host_proc_cmdline_matches(host_proc, host_pid, worker_key):
    """Tell whether the host process's command line carries the worker's token.

    Reads ``<host_proc>/<host_pid>/cmdline`` as bytes, splits it on NUL into
    its non-empty arguments (decoded with undecodable bytes dropped), and
    checks whether ``worker_key_token(worker_key)`` is one of them. An
    unreadable cmdline file counts as no match.
    """
    cmdline_path = os.path.join(host_proc, host_pid, 'cmdline')
    try:
        with open(cmdline_path, 'rb') as handle:
            payload = handle.read()
    except OSError:
        return False

    arguments = []
    for chunk in payload.split(b'\0'):
        if chunk:
            arguments.append(chunk.decode(errors='ignore'))
    return worker_key_token(worker_key) in arguments
|
|
141
|
+
|
|
142
|
+
|
|
67
143
|
def terminate_process(pid, timeout=5, popen_process=None, exclude_pids=None):
|
|
68
144
|
"""Gracefully terminates a process and its non-managed children.
|
|
69
145
|
|
|
@@ -107,8 +107,8 @@ class WorkerManager:
|
|
|
107
107
|
else:
|
|
108
108
|
self.storage = None
|
|
109
109
|
|
|
110
|
-
def start_worker(self, worker_type, worker_key=None, parameters=None, env=None):
|
|
111
|
-
return start_worker(self, worker_type, worker_key, parameters, env)
|
|
110
|
+
def start_worker(self, worker_type, worker_key=None, parameters=None, env=None, reset_backoff=True):
    """Start a worker through the module-level ``start_worker`` helper.

    This manager is handed to the helper as its ``manager`` argument; every
    other argument, including ``reset_backoff``, is forwarded unchanged and the
    helper's result is returned as-is.
    """
    return start_worker(
        self,
        worker_type,
        worker_key=worker_key,
        parameters=parameters,
        env=env,
        reset_backoff=reset_backoff,
    )
|
|
112
112
|
|
|
113
113
|
def stop_worker(self, worker_key):
|
|
114
114
|
return stop_worker(self, worker_key)
|
|
@@ -19,7 +19,7 @@ logger = logging.getLogger('crazy_workers')
|
|
|
19
19
|
_SAFE_NAME = re.compile(r'[A-Za-z0-9_-]+')
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
def start_worker(manager, worker_type, worker_key=None, parameters=None, env=None):
|
|
22
|
+
def start_worker(manager, worker_type, worker_key=None, parameters=None, env=None, reset_backoff=True):
|
|
23
23
|
if not manager.storage:
|
|
24
24
|
return False, 'System not initialized (database missing)'
|
|
25
25
|
|
|
@@ -43,7 +43,7 @@ def start_worker(manager, worker_type, worker_key=None, parameters=None, env=Non
|
|
|
43
43
|
worker.status = WorkerStatus.STOPPED
|
|
44
44
|
return False, f'Worker file {worker_type}.py not found'
|
|
45
45
|
|
|
46
|
-
return _spawn_worker_process(manager, worker, worker_path, parameters, env, session)
|
|
46
|
+
return _spawn_worker_process(manager, worker, worker_path, parameters, env, session, reset_backoff)
|
|
47
47
|
|
|
48
48
|
|
|
49
49
|
def _validate_inputs(worker_type, worker_key):
|
|
@@ -107,7 +107,7 @@ def _get_worker_script_path(manager, worker_type):
|
|
|
107
107
|
return worker_path
|
|
108
108
|
|
|
109
109
|
|
|
110
|
-
def _spawn_worker_process(manager, worker, worker_path, parameters, env, session):
|
|
110
|
+
def _spawn_worker_process(manager, worker, worker_path, parameters, env, session, reset_backoff=True):
|
|
111
111
|
log_file_path = os.path.join(manager.logs_dir, f'{worker.worker_key}.log')
|
|
112
112
|
|
|
113
113
|
# The manager's worker_env is the baseline for every worker (e.g. the host
|
|
@@ -132,8 +132,13 @@ def _spawn_worker_process(manager, worker, worker_path, parameters, env, session
|
|
|
132
132
|
worker.pid = handle.pid
|
|
133
133
|
worker.status = WorkerStatus.RUNNING
|
|
134
134
|
worker.last_started_at = func.now()
|
|
135
|
-
# A clean start clears the crash backoff so the
|
|
136
|
-
|
|
135
|
+
# A clean start (CLI/client/manual restart) clears the crash backoff so the
|
|
136
|
+
# next failure starts over. An automatic reconciler restart passes
|
|
137
|
+
# reset_backoff=False: a fast crash-loop must keep accumulating restart_count
|
|
138
|
+
# across respawns, otherwise the exponential backoff can never escalate. The
|
|
139
|
+
# reconciler instead resets the counter once the worker is observed healthy.
|
|
140
|
+
if reset_backoff:
|
|
141
|
+
worker.restart_count = 0
|
|
137
142
|
session.commit()
|
|
138
143
|
|
|
139
144
|
manager._active_processes[worker.worker_key] = handle
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
from datetime import datetime, timedelta, timezone
|
|
4
|
+
|
|
5
|
+
from ..database.schema import DesiredStatus, Worker, WorkerStatus
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger('crazy_workers')
|
|
9
|
+
|
|
10
|
+
_BACKOFF_BASE_SECONDS = 1
|
|
11
|
+
_BACKOFF_MAX_SECONDS = 60
|
|
12
|
+
# Cap the exponent so a long-crashed worker doesn't compute an astronomically
|
|
13
|
+
# large intermediate before min() clamps it.
|
|
14
|
+
_BACKOFF_MAX_EXPONENT = 16
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Reconciler:
    """Single-owner loop: drives actual worker state toward desired state.

    Owns every worker process for one workers_dir/DB. Clients never spawn; they
    only set desired_status in the shared DB and this loop makes it so.

    | desired  | alive | observed        | action                                     |
    |----------|-------|-----------------|--------------------------------------------|
    | RUNNING  | no    | RUNNING         | record crash (mark CRASHED, +restart_count)|
    | RUNNING  | no    | other           | start (skipped while in backoff)           |
    | RUNNING  | yes   | RUNNING         | noop (reset backoff once proven healthy)   |
    | RUNNING  | yes   | other           | heal observed status to RUNNING            |
    | STOPPED  | yes   | -               | stop                                       |
    | STOPPED  | no    | RUNNING/CRASHED | heal stale observed status to STOPPED      |
    | STOPPED  | no    | STOPPED         | noop                                       |

    Crash detection is split from the restart on purpose. A worker that dies the
    instant it starts comes back from ``start_worker`` marked RUNNING; if the same
    pass that noticed the dead PID respawned it, its status would flip to RUNNING
    again before the backoff branch ever saw CRASHED, and an instantly-crashing
    worker would respawn at full loop speed forever. Instead the first pass records
    the death (CRASHED + a bumped restart_count), and a later pass restarts it once
    the exponential backoff has elapsed.
    """

    def __init__(self, manager, interval=2.0):
        """Bind the reconciler to ``manager``; ``interval`` is the pause between passes in seconds."""
        self.manager = manager
        self.interval = interval
        self._stop = False

    def run_forever(self):
        """Run reconcile passes until :meth:`stop` is called.

        A failing pass is logged and the loop continues with the next one.
        """
        logger.info('Reconciler started (interval=%ss)', self.interval)
        while not self._stop:
            try:
                self.reconcile_once()
            except Exception:
                logger.exception('Reconcile pass failed; continuing.')
            # Sleep in small slices so a SIGTERM-triggered stop is honoured promptly
            # instead of after a full interval.
            self._interruptible_sleep(self.interval)
        logger.info('Reconciler stopped.')

    def stop(self):
        """Ask the loop to exit; it is honoured at the next sleep slice or pass boundary."""
        self._stop = True

    def _interruptible_sleep(self, seconds):
        """Sleep up to ``seconds`` in slices of at most 0.2s, returning early once stopped."""
        deadline = time.monotonic() + seconds
        while not self._stop:
            # Read the clock once per slice: re-reading it after the loop check could
            # yield a negative remainder, and time.sleep() raises ValueError on that.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(0.2, remaining))

    def reconcile_once(self):
        """One pass over every worker. Returns the actions taken (for tests/observability).

        The result is a list of ``(worker_key, action)`` tuples; workers that
        needed no action are omitted.
        """
        actions = []
        for row in self._load_snapshot():
            action = self._reconcile_worker(row)
            if action:
                actions.append((row['worker_key'], action))
        return actions

    def _load_snapshot(self):
        """Return every Worker row as a plain dict of the fields a pass needs."""
        # Read everything we need into plain dicts and release the session before
        # touching processes — start/stop open their own short-lived sessions.
        with self.manager.storage.session_scope() as session:
            return [
                {
                    'worker_key': w.worker_key,
                    'worker_type': w.worker_type,
                    'parameters': w.parameters,
                    'pid': w.pid,
                    'desired': w.desired_status,
                    'status': w.status,
                    'restart_count': w.restart_count,
                    'last_exit_at': w.last_exit_at,
                }
                for w in session.query(Worker).all()
            ]

    def _reconcile_worker(self, row):
        """Apply the decision table to one snapshot row.

        Returns the action name (``'crashed'``, ``'start'``, ``'stop'``,
        ``'mark_stopped'``, ``'mark_running'``, ``'reset_backoff'``) or ``None``
        when nothing had to be done.
        """
        alive = self.manager.backend.is_alive(pid=row['pid'], worker_key=row['worker_key'])

        if row['desired'] == DesiredStatus.RUNNING and not alive:
            if row['status'] == WorkerStatus.RUNNING:
                # We last saw it RUNNING but the process is gone: it crashed. Record the
                # death now and let a later pass restart it — see the class docstring for
                # why detection and restart are split across passes.
                logger.warning('Reconcile: worker %s died; recording crash', row['worker_key'])
                self._record_crash(row['worker_key'])
                return 'crashed'
            if self._in_backoff(row):
                return None
            logger.info('Reconcile: starting %s', row['worker_key'])
            # reset_backoff=False: an automatic restart must not wipe the crash history
            # a fast crash-loop is accumulating, or the backoff could never escalate.
            outcome = self.manager.start_worker(
                row['worker_type'], row['worker_key'], row['parameters'], reset_backoff=False
            )
            # start_worker reports failures as (False, message); surface them instead
            # of dropping the reason on the floor. The action is still reported as
            # 'start' because a start was attempted.
            if isinstance(outcome, tuple) and outcome and not outcome[0]:
                logger.warning('Reconcile: failed to start %s: %s', row['worker_key'], outcome[-1])
            return 'start'

        if row['desired'] == DesiredStatus.STOPPED:
            if alive:
                logger.info('Reconcile: stopping %s', row['worker_key'])
                self.manager.stop_worker(row['worker_key'])
                return 'stop'
            if row['status'] not in (WorkerStatus.STOPPED, WorkerStatus.NEVER_STARTED):
                # Desired down and the process is already gone, but the observed status is
                # stale — RUNNING left by the last spawn, or CRASHED. Nothing kills the
                # process (it is dead) and stop_worker only handles a RUNNING worker, so
                # converge the observed status here, or the table shows a phantom
                # RUNNING/CRASHED with a dead PID forever.
                self._mark_stopped(row['worker_key'])
                return 'mark_stopped'
            return None

        if row['desired'] == DesiredStatus.RUNNING and alive:
            if row['status'] != WorkerStatus.RUNNING:
                # Process is up but the observed status drifted (e.g. left STARTING). Heal it.
                self._mark_running(row['worker_key'])
                return 'mark_running'
            if row['restart_count']:
                # Alive a full pass after its last spawn — proven healthy. Clear the crash
                # backoff so a future crash starts counting (and backing off) from zero.
                self._reset_backoff(row['worker_key'])
                return 'reset_backoff'
        return None

    def _in_backoff(self, row):
        """Return True while a CRASHED worker is still inside its exponential backoff window.

        The delay is ``_BACKOFF_BASE_SECONDS * 2**restart_count``, capped at
        ``_BACKOFF_MAX_SECONDS``, measured from ``last_exit_at``.
        """
        if not row['last_exit_at'] or row['status'] != WorkerStatus.CRASHED:
            return False
        # restart_count may be NULL in the DB (``_record_crash`` guards for that too);
        # treat it as zero instead of letting min() raise on None.
        exponent = min(row['restart_count'] or 0, _BACKOFF_MAX_EXPONENT)
        delay = min(_BACKOFF_BASE_SECONDS * (2**exponent), _BACKOFF_MAX_SECONDS)
        last_exit = row['last_exit_at']
        # last_exit_at is stored as UTC wall-clock; coerce naive values read back
        # from the DB to aware UTC so the comparison never mixes naive and aware.
        if last_exit.tzinfo is None:
            last_exit = last_exit.replace(tzinfo=timezone.utc)
        return datetime.now(timezone.utc) < last_exit + timedelta(seconds=delay)

    def _mark_running(self, worker_key):
        """Set the worker's observed status to RUNNING, if the row exists."""
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.status = WorkerStatus.RUNNING

    def _record_crash(self, worker_key):
        """Persist a crash: CRASHED status, exit timestamp, bumped restart_count, cleared pid."""
        # Persist the death so the next pass's backoff gate can see it: mark CRASHED,
        # bump the restart counter and stamp the exit time in UTC (Python-side, so the
        # backoff math does not depend on the DB dialect's now()/timezone semantics).
        # pid is cleared so we stop probing a dead (and potentially reused) PID.
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.status = WorkerStatus.CRASHED
                worker.last_exit_at = datetime.now(timezone.utc)
                worker.restart_count = (worker.restart_count or 0) + 1
                worker.pid = None

    def _reset_backoff(self, worker_key):
        """Zero the worker's restart_count, if the row exists."""
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.restart_count = 0

    def _mark_stopped(self, worker_key):
        """Set the worker to STOPPED, clear its pid and stamp last_stopped_at (UTC)."""
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.status = WorkerStatus.STOPPED
                worker.pid = None
                worker.last_stopped_at = datetime.now(timezone.utc)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Worker-side parsing of the JSON parameters the daemon passes on argv.
|
|
2
|
+
|
|
3
|
+
The manager spawns every worker as ``python -m crazy_workers._bootstrap
|
|
4
|
+
<worker_path> <json_params>`` and the bootstrap restores ``sys.argv`` to
|
|
5
|
+
``[worker_path, json_params]``. This module is the worker-side counterpart of
|
|
6
|
+
that contract: call :func:`parse_params` at the top of a worker's ``main()``
|
|
7
|
+
instead of re-implementing the argv/JSON boilerplate in every worker script.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_params(argv=None, required=()):
    """Return the parameters dict the daemon passed to this worker.

    Without ``required`` a worker launched with no parameters gets ``{}`` — the
    lenient mode for workers whose parameters are all optional. With ``required``
    a missing argv, malformed JSON, or a missing/empty required key aborts the
    worker with a message on stderr (``SystemExit``, exit code 1) before it does
    any real work. A key counts as missing when it is absent, ``None``, or an
    empty string; a falsy-but-present value like ``0`` or ``False`` is kept.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv``. Pass a list
            explicitly in tests. The JSON document is read from ``argv[1]``.
        required: Names of the keys that must be present and non-empty. An
            iterable of names, a single name given as a plain string, or
            ``None`` (same as no requirement).

    Returns:
        The decoded JSON object as a dict (``{}`` when no parameters were passed
        and nothing is required).

    Raises:
        SystemExit: Required parameters are missing, the JSON is malformed, or
            the JSON document is not an object.
    """
    argv = sys.argv if argv is None else argv

    # A lone string names ONE key; iterating it would demand each character as a
    # separate parameter. Normalising to a tuple also makes None and one-shot
    # iterables safe to test for emptiness and to iterate.
    if isinstance(required, str):
        required = (required,)
    else:
        required = tuple(required or ())

    if len(argv) < 2:
        if required:
            raise SystemExit(f'Missing required parameters: {", ".join(required)}')
        return {}

    try:
        params = json.loads(argv[1])
    except json.JSONDecodeError as error:
        raise SystemExit(f'Invalid JSON parameters: {error}')

    if not isinstance(params, dict):
        raise SystemExit('Invalid JSON parameters: expected a JSON object')

    missing = [key for key in required if key not in params or params[key] in (None, '')]
    if missing:
        raise SystemExit(f'Missing required parameters: {", ".join(missing)}')

    return params
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: crazy-workers
|
|
3
|
-
Version: 1.4.2
|
|
3
|
+
Version: 1.5.1
|
|
4
4
|
Summary: A Python library for managing background worker processes with persistent state, automatic recovery, and a CLI.
|
|
5
5
|
Author: GioVanni Colasanto
|
|
6
6
|
License: MIT
|
|
@@ -76,9 +76,11 @@ pip install .
|
|
|
76
76
|
|
|
77
77
|
```python
|
|
78
78
|
# workers/my_worker.py
|
|
79
|
-
import
|
|
79
|
+
import time
|
|
80
80
|
|
|
81
|
-
|
|
81
|
+
from crazy_workers import parse_params
|
|
82
|
+
|
|
83
|
+
params = parse_params()
|
|
82
84
|
duration = params.get('duration', 60)
|
|
83
85
|
|
|
84
86
|
for _ in range(duration):
|
|
@@ -176,15 +178,28 @@ Closes the database connection and clears internal process references. Does **no
|
|
|
176
178
|
|
|
177
179
|
## Worker Script Contract
|
|
178
180
|
|
|
179
|
-
A worker receives its parameters as a JSON string in `sys.argv[1]
|
|
181
|
+
A worker receives its parameters as a JSON string in `sys.argv[1]`. Use
|
|
182
|
+
`parse_params` instead of decoding it by hand:
|
|
180
183
|
|
|
181
184
|
```python
|
|
182
|
-
import
|
|
185
|
+
from crazy_workers import parse_params
|
|
183
186
|
|
|
184
|
-
params =
|
|
187
|
+
params = parse_params()
|
|
185
188
|
# ... do work ...
|
|
186
189
|
```
|
|
187
190
|
|
|
191
|
+
Without arguments it is lenient: a worker launched with no parameters gets
|
|
192
|
+
`{}`. Pass `required=` to abort before any real work when a parameter the
|
|
193
|
+
worker cannot run without is missing or empty — the worker exits with code 1
|
|
194
|
+
and a message on stderr:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
params = parse_params(required=('device_id', 'output_dir'))
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
`parse_params(argv=...)` accepts an explicit argv list, which makes the
|
|
201
|
+
parsing trivially testable without patching `sys.argv`.
|
|
202
|
+
|
|
188
203
|
A worker is a separate process, so it cannot be handed a live object (e.g. a DB
|
|
189
204
|
connection). Pass **configuration** instead: the manager's `worker_env` (and any
|
|
190
205
|
per-call `env`) is injected as environment variables, so a worker reads, say,
|
|
@@ -4,6 +4,7 @@ pyproject.toml
|
|
|
4
4
|
crazy_workers/__init__.py
|
|
5
5
|
crazy_workers/_bootstrap.py
|
|
6
6
|
crazy_workers/client.py
|
|
7
|
+
crazy_workers/params.py
|
|
7
8
|
crazy_workers.egg-info/PKG-INFO
|
|
8
9
|
crazy_workers.egg-info/SOURCES.txt
|
|
9
10
|
crazy_workers.egg-info/dependency_links.txt
|
|
@@ -45,4 +46,5 @@ crazy_workers/database/schema.py
|
|
|
45
46
|
crazy_workers/database/storage.py
|
|
46
47
|
crazy_workers/testing/__init__.py
|
|
47
48
|
crazy_workers/testing/backends.py
|
|
48
|
-
crazy_workers/testing/polling.py
|
|
49
|
+
crazy_workers/testing/polling.py
|
|
50
|
+
tests/test_params.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "crazy-workers"
|
|
3
|
-
version = "1.4.2"
|
|
3
|
+
version = "1.5.1"
|
|
4
4
|
description = "A Python library for managing background worker processes with persistent state, automatic recovery, and a CLI."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{ name = "GioVanni Colasanto" }]
|
|
@@ -63,7 +63,7 @@ omit = ["*/venv/*", "*/build/*", "tests/*"]
|
|
|
63
63
|
[tool.coverage.report]
|
|
64
64
|
show_missing = true
|
|
65
65
|
skip_covered = false
|
|
66
|
-
fail_under =
|
|
66
|
+
fail_under = 90
|
|
67
67
|
|
|
68
68
|
[tool.ruff]
|
|
69
69
|
line-length = 120
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from unittest.mock import patch
|
|
3
|
+
|
|
4
|
+
from crazy_workers import parse_params
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TestParseParams(unittest.TestCase):
|
|
8
|
+
def test_no_argv_returns_empty_dict(self):
|
|
9
|
+
self.assertEqual(parse_params(['worker.py']), {})
|
|
10
|
+
|
|
11
|
+
def test_no_argv_with_required_exits(self):
|
|
12
|
+
with self.assertRaises(SystemExit) as ctx:
|
|
13
|
+
parse_params(['worker.py'], required=('device_id', 'output_dir'))
|
|
14
|
+
self.assertEqual(ctx.exception.code, 'Missing required parameters: device_id, output_dir')
|
|
15
|
+
|
|
16
|
+
def test_invalid_json_exits(self):
|
|
17
|
+
with self.assertRaises(SystemExit) as ctx:
|
|
18
|
+
parse_params(['worker.py', 'not json'])
|
|
19
|
+
self.assertIn('Invalid JSON parameters', str(ctx.exception.code))
|
|
20
|
+
|
|
21
|
+
def test_non_object_json_exits(self):
|
|
22
|
+
with self.assertRaises(SystemExit) as ctx:
|
|
23
|
+
parse_params(['worker.py', '[1, 2, 3]'])
|
|
24
|
+
self.assertEqual(ctx.exception.code, 'Invalid JSON parameters: expected a JSON object')
|
|
25
|
+
|
|
26
|
+
def test_returns_parsed_params(self):
|
|
27
|
+
self.assertEqual(parse_params(['worker.py', '{"device_id": "7"}']), {'device_id': '7'})
|
|
28
|
+
|
|
29
|
+
def test_required_present_returns_params(self):
|
|
30
|
+
argv = ['worker.py', '{"device_id": "7", "output_dir": "/out"}']
|
|
31
|
+
params = parse_params(argv, required=('device_id', 'output_dir'))
|
|
32
|
+
self.assertEqual(params, {'device_id': '7', 'output_dir': '/out'})
|
|
33
|
+
|
|
34
|
+
def test_required_missing_exits_naming_keys(self):
|
|
35
|
+
with self.assertRaises(SystemExit) as ctx:
|
|
36
|
+
parse_params(['worker.py', '{"device_id": "7"}'], required=('device_id', 'output_dir'))
|
|
37
|
+
self.assertEqual(ctx.exception.code, 'Missing required parameters: output_dir')
|
|
38
|
+
|
|
39
|
+
def test_required_empty_value_counts_as_missing(self):
|
|
40
|
+
with self.assertRaises(SystemExit) as ctx:
|
|
41
|
+
parse_params(['worker.py', '{"device_id": ""}'], required=('device_id',))
|
|
42
|
+
self.assertEqual(ctx.exception.code, 'Missing required parameters: device_id')
|
|
43
|
+
|
|
44
|
+
def test_required_falsy_but_present_value_is_kept(self):
|
|
45
|
+
argv = ['worker.py', '{"retries": 0, "verbose": false}']
|
|
46
|
+
params = parse_params(argv, required=('retries', 'verbose'))
|
|
47
|
+
self.assertEqual(params, {'retries': 0, 'verbose': False})
|
|
48
|
+
|
|
49
|
+
def test_defaults_to_sys_argv(self):
|
|
50
|
+
with patch('sys.argv', ['worker.py', '{"interval": 5}']):
|
|
51
|
+
self.assertEqual(parse_params(), {'interval': 5})
|
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import time
|
|
3
|
-
from datetime import datetime, timedelta, timezone
|
|
4
|
-
|
|
5
|
-
from ..database.schema import DesiredStatus, Worker, WorkerStatus
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
logger = logging.getLogger('crazy_workers')
|
|
9
|
-
|
|
10
|
-
_BACKOFF_BASE_SECONDS = 1
|
|
11
|
-
_BACKOFF_MAX_SECONDS = 60
|
|
12
|
-
# Cap the exponent so a long-crashed worker doesn't compute an astronomically
|
|
13
|
-
# large intermediate before min() clamps it.
|
|
14
|
-
_BACKOFF_MAX_EXPONENT = 16
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class Reconciler:
|
|
18
|
-
"""Single-owner loop: drives actual worker state toward desired state.
|
|
19
|
-
|
|
20
|
-
Owns every worker process for one workers_dir/DB. Clients never spawn; they
|
|
21
|
-
only set desired_status in the shared DB and this loop makes it so.
|
|
22
|
-
|
|
23
|
-
| desired | alive | action |
|
|
24
|
-
|----------|-------|-------------------------------------|
|
|
25
|
-
| RUNNING | no | start (skipped while in backoff) |
|
|
26
|
-
| RUNNING | yes | noop (reconcile observed status) |
|
|
27
|
-
| STOPPED | yes | stop |
|
|
28
|
-
| STOPPED | no | noop |
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
def __init__(self, manager, interval=2.0):
|
|
32
|
-
self.manager = manager
|
|
33
|
-
self.interval = interval
|
|
34
|
-
self._stop = False
|
|
35
|
-
|
|
36
|
-
def run_forever(self):
|
|
37
|
-
logger.info('Reconciler started (interval=%ss)', self.interval)
|
|
38
|
-
while not self._stop:
|
|
39
|
-
try:
|
|
40
|
-
self.reconcile_once()
|
|
41
|
-
except Exception:
|
|
42
|
-
logger.exception('Reconcile pass failed; continuing.')
|
|
43
|
-
# Sleep in small slices so a SIGTERM-triggered stop is honoured promptly
|
|
44
|
-
# instead of after a full interval.
|
|
45
|
-
self._interruptible_sleep(self.interval)
|
|
46
|
-
logger.info('Reconciler stopped.')
|
|
47
|
-
|
|
48
|
-
def stop(self):
|
|
49
|
-
self._stop = True
|
|
50
|
-
|
|
51
|
-
def _interruptible_sleep(self, seconds):
|
|
52
|
-
deadline = time.monotonic() + seconds
|
|
53
|
-
while not self._stop and time.monotonic() < deadline:
|
|
54
|
-
time.sleep(min(0.2, deadline - time.monotonic()))
|
|
55
|
-
|
|
56
|
-
def reconcile_once(self):
|
|
57
|
-
"""One pass over every worker. Returns the actions taken (for tests/observability)."""
|
|
58
|
-
actions = []
|
|
59
|
-
for row in self._load_snapshot():
|
|
60
|
-
action = self._reconcile_worker(row)
|
|
61
|
-
if action:
|
|
62
|
-
actions.append((row['worker_key'], action))
|
|
63
|
-
return actions
|
|
64
|
-
|
|
65
|
-
def _load_snapshot(self):
|
|
66
|
-
# Read everything we need into plain dicts and release the session before
|
|
67
|
-
# touching processes — start/stop open their own short-lived sessions.
|
|
68
|
-
with self.manager.storage.session_scope() as session:
|
|
69
|
-
return [
|
|
70
|
-
{
|
|
71
|
-
'worker_key': w.worker_key,
|
|
72
|
-
'worker_type': w.worker_type,
|
|
73
|
-
'parameters': w.parameters,
|
|
74
|
-
'pid': w.pid,
|
|
75
|
-
'desired': w.desired_status,
|
|
76
|
-
'status': w.status,
|
|
77
|
-
'restart_count': w.restart_count,
|
|
78
|
-
'last_exit_at': w.last_exit_at,
|
|
79
|
-
}
|
|
80
|
-
for w in session.query(Worker).all()
|
|
81
|
-
]
|
|
82
|
-
|
|
83
|
-
def _reconcile_worker(self, row):
|
|
84
|
-
alive = self.manager.backend.is_alive(pid=row['pid'], worker_key=row['worker_key'])
|
|
85
|
-
|
|
86
|
-
if row['desired'] == DesiredStatus.RUNNING and not alive:
|
|
87
|
-
if self._in_backoff(row):
|
|
88
|
-
return None
|
|
89
|
-
logger.info('Reconcile: starting %s', row['worker_key'])
|
|
90
|
-
self.manager.start_worker(row['worker_type'], row['worker_key'], row['parameters'])
|
|
91
|
-
return 'start'
|
|
92
|
-
if row['desired'] == DesiredStatus.STOPPED and alive:
|
|
93
|
-
logger.info('Reconcile: stopping %s', row['worker_key'])
|
|
94
|
-
self.manager.stop_worker(row['worker_key'])
|
|
95
|
-
return 'stop'
|
|
96
|
-
if row['desired'] == DesiredStatus.RUNNING and alive and row['status'] != WorkerStatus.RUNNING:
|
|
97
|
-
# Process is up but the observed status drifted (e.g. left STARTING). Heal it.
|
|
98
|
-
self._mark_running(row['worker_key'])
|
|
99
|
-
return 'mark_running'
|
|
100
|
-
return None
|
|
101
|
-
|
|
102
|
-
def _in_backoff(self, row):
|
|
103
|
-
if not row['last_exit_at'] or row['status'] != WorkerStatus.CRASHED:
|
|
104
|
-
return False
|
|
105
|
-
exponent = min(row['restart_count'], _BACKOFF_MAX_EXPONENT)
|
|
106
|
-
delay = min(_BACKOFF_BASE_SECONDS * (2**exponent), _BACKOFF_MAX_SECONDS)
|
|
107
|
-
last_exit = row['last_exit_at']
|
|
108
|
-
# last_exit_at is stored as UTC wall-clock; coerce naive values read back
|
|
109
|
-
# from the DB to aware UTC so the comparison never mixes naive and aware.
|
|
110
|
-
if last_exit.tzinfo is None:
|
|
111
|
-
last_exit = last_exit.replace(tzinfo=timezone.utc)
|
|
112
|
-
return datetime.now(timezone.utc) < last_exit + timedelta(seconds=delay)
|
|
113
|
-
|
|
114
|
-
def _mark_running(self, worker_key):
|
|
115
|
-
with self.manager.storage.session_scope() as session:
|
|
116
|
-
worker = session.query(Worker).filter_by(worker_key=worker_key).first()
|
|
117
|
-
if worker:
|
|
118
|
-
worker.status = WorkerStatus.RUNNING
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|