crazy-workers 1.4.2__tar.gz → 1.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/PKG-INFO +21 -6
  2. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/README.md +20 -5
  3. crazy_workers-1.5.1/crazy_workers/__init__.py +24 -0
  4. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/status.py +22 -2
  5. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/engine.py +76 -0
  6. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/__init__.py +2 -2
  7. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/starter.py +10 -5
  8. crazy_workers-1.5.1/crazy_workers/daemon/reconciler.py +181 -0
  9. crazy_workers-1.5.1/crazy_workers/params.py +45 -0
  10. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/PKG-INFO +21 -6
  11. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/SOURCES.txt +3 -1
  12. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/pyproject.toml +2 -2
  13. crazy_workers-1.5.1/tests/test_params.py +51 -0
  14. crazy_workers-1.4.2/crazy_workers/__init__.py +0 -6
  15. crazy_workers-1.4.2/crazy_workers/daemon/reconciler.py +0 -118
  16. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/LICENSE +0 -0
  17. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/_bootstrap.py +0 -0
  18. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/__init__.py +0 -0
  19. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/__main__.py +0 -0
  20. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/base.py +0 -0
  21. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/detect.py +0 -0
  22. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/entry.py +0 -0
  23. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/orchestrator.py +0 -0
  24. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/systemd.py +0 -0
  25. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/boot/windows.py +0 -0
  26. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/__init__.py +0 -0
  27. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/__init__.py +0 -0
  28. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/params.py +0 -0
  29. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/starter.py +0 -0
  30. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/commands/stopper.py +0 -0
  31. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/discovery.py +0 -0
  32. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/main.py +0 -0
  33. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/cli/ui.py +0 -0
  34. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/client.py +0 -0
  35. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/__init__.py +0 -0
  36. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/backend.py +0 -0
  37. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/lister.py +0 -0
  38. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/recoverer.py +0 -0
  39. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/manager/stopper.py +0 -0
  40. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/core/recovery.py +0 -0
  41. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/daemon/__init__.py +0 -0
  42. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/daemon/__main__.py +0 -0
  43. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/daemon/runner.py +0 -0
  44. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/database/__init__.py +0 -0
  45. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/database/schema.py +0 -0
  46. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/database/storage.py +0 -0
  47. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/testing/__init__.py +0 -0
  48. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/testing/backends.py +0 -0
  49. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers/testing/polling.py +0 -0
  50. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/dependency_links.txt +0 -0
  51. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/entry_points.txt +0 -0
  52. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/requires.txt +0 -0
  53. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/crazy_workers.egg-info/top_level.txt +0 -0
  54. {crazy_workers-1.4.2 → crazy_workers-1.5.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crazy-workers
3
- Version: 1.4.2
3
+ Version: 1.5.1
4
4
  Summary: A Python library for managing background worker processes with persistent state, automatic recovery, and a CLI.
5
5
  Author: GioVanni Colasanto
6
6
  License: MIT
@@ -76,9 +76,11 @@ pip install .
76
76
 
77
77
  ```python
78
78
  # workers/my_worker.py
79
- import json, sys, time
79
+ import time
80
80
 
81
- params = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
81
+ from crazy_workers import parse_params
82
+
83
+ params = parse_params()
82
84
  duration = params.get('duration', 60)
83
85
 
84
86
  for _ in range(duration):
@@ -176,15 +178,28 @@ Closes the database connection and clears internal process references. Does **no
176
178
 
177
179
  ## Worker Script Contract
178
180
 
179
- A worker receives its parameters as a JSON string in `sys.argv[1]`:
181
+ A worker receives its parameters as a JSON string in `sys.argv[1]`. Use
182
+ `parse_params` instead of decoding it by hand:
180
183
 
181
184
  ```python
182
- import json, sys
185
+ from crazy_workers import parse_params
183
186
 
184
- params = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
187
+ params = parse_params()
185
188
  # ... do work ...
186
189
  ```
187
190
 
191
+ Without arguments it is lenient: a worker launched with no parameters gets
192
+ `{}`. Pass `required=` to abort before any real work when a parameter the
193
+ worker cannot run without is missing or empty — the worker exits with code 1
194
+ and a message on stderr:
195
+
196
+ ```python
197
+ params = parse_params(required=('device_id', 'output_dir'))
198
+ ```
199
+
200
+ `parse_params(argv=...)` accepts an explicit argv list, which makes the
201
+ parsing trivially testable without patching `sys.argv`.
202
+
188
203
  A worker is a separate process, so it cannot be handed a live object (e.g. a DB
189
204
  connection). Pass **configuration** instead: the manager's `worker_env` (and any
190
205
  per-call `env`) is injected as environment variables, so a worker reads, say,
@@ -41,9 +41,11 @@ pip install .
41
41
 
42
42
  ```python
43
43
  # workers/my_worker.py
44
- import json, sys, time
44
+ import time
45
45
 
46
- params = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
46
+ from crazy_workers import parse_params
47
+
48
+ params = parse_params()
47
49
  duration = params.get('duration', 60)
48
50
 
49
51
  for _ in range(duration):
@@ -141,15 +143,28 @@ Closes the database connection and clears internal process references. Does **no
141
143
 
142
144
  ## Worker Script Contract
143
145
 
144
- A worker receives its parameters as a JSON string in `sys.argv[1]`:
146
+ A worker receives its parameters as a JSON string in `sys.argv[1]`. Use
147
+ `parse_params` instead of decoding it by hand:
145
148
 
146
149
  ```python
147
- import json, sys
150
+ from crazy_workers import parse_params
148
151
 
149
- params = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
152
+ params = parse_params()
150
153
  # ... do work ...
151
154
  ```
152
155
 
156
+ Without arguments it is lenient: a worker launched with no parameters gets
157
+ `{}`. Pass `required=` to abort before any real work when a parameter the
158
+ worker cannot run without is missing or empty — the worker exits with code 1
159
+ and a message on stderr:
160
+
161
+ ```python
162
+ params = parse_params(required=('device_id', 'output_dir'))
163
+ ```
164
+
165
+ `parse_params(argv=...)` accepts an explicit argv list, which makes the
166
+ parsing trivially testable without patching `sys.argv`.
167
+
153
168
  A worker is a separate process, so it cannot be handed a live object (e.g. a DB
154
169
  connection). Pass **configuration** instead: the manager's `worker_env` (and any
155
170
  per-call `env`) is injected as environment variables, so a worker reads, say,
@@ -0,0 +1,24 @@
1
+ __all__ = ['WorkerManager', 'WorkerClient', 'WorkerStatus', 'DesiredStatus', 'parse_params']
2
+
3
+
4
+ # Resolve exports lazily (PEP 562) so a worker doing `from crazy_workers import
5
+ # parse_params` only imports the lightweight params module — not the control
6
+ # plane (WorkerClient/WorkerManager) and its heavy SQLAlchemy dependencies.
7
def __getattr__(name):
    """Resolve the package's public names lazily on attribute access (PEP 562).

    Each export is imported only when it is requested, so a worker that asks
    for ``parse_params`` imports just the ``params`` module and none of the
    client/manager/schema modules.

    Raises:
        AttributeError: ``name`` is not one of the lazily exported names.
    """
    # Reject unknown names up front; everything below is a known export.
    if name not in ('parse_params', 'WorkerManager', 'WorkerClient', 'DesiredStatus', 'WorkerStatus'):
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

    if name in ('DesiredStatus', 'WorkerStatus'):
        # Both enums live in the schema module and are looked up by name.
        from .database import schema

        return getattr(schema, name)

    if name == 'WorkerClient':
        from .client import WorkerClient

        return WorkerClient

    if name == 'WorkerManager':
        from .core.manager import WorkerManager

        return WorkerManager

    from .params import parse_params

    return parse_params
@@ -6,12 +6,13 @@ from datetime import datetime
6
6
  from rich.panel import Panel
7
7
  from rich.table import Table
8
8
 
9
+ from ...core.engine import resolve_system_pid
9
10
  from ..ui import console
10
11
 
11
12
 
12
13
  def show_status(client, workers_dir, json_mode=False):
13
14
  """Observability hub: the target state store plus the worker table (desired vs actual)."""
14
- workers = _merge_with_filesystem(client.list(), workers_dir)
15
+ workers = _with_system_pids(_merge_with_filesystem(client.list(), workers_dir))
15
16
 
16
17
  if json_mode:
17
18
  sys.stdout.write(json.dumps({'workers': workers}) + '\n')
@@ -70,6 +71,25 @@ def _merge_with_filesystem(db_workers, workers_dir):
70
71
  return results
71
72
 
72
73
 
74
def _with_system_pids(workers):
    """Return copies of the worker mappings, each with a ``system_pid`` entry added.

    The input mappings are left untouched: every worker is copied with
    ``dict(worker)`` and the copy gets ``system_pid`` set to whatever
    ``resolve_system_pid`` returns for its ``pid`` and ``worker_key``.
    """

    def _enrich(worker):
        annotated = dict(worker)
        annotated['system_pid'] = resolve_system_pid(
            annotated.get('pid'), worker_key=annotated.get('worker_key')
        )
        return annotated

    return [_enrich(worker) for worker in workers]
81
+
82
+
83
def _format_pid(worker):
    """Render the PID cell for one worker row.

    Returns ``'-'`` when the worker has no (truthy) ``pid``. When a
    ``system_pid`` is present and differs from ``pid``, the system PID is shown
    first with the control PID appended in dim Rich markup. Otherwise the bare
    ``pid`` is returned as a string.
    """
    control_pid = worker.get('pid')
    if not control_pid:
        return '-'

    native_pid = worker.get('system_pid')
    if not native_pid or native_pid == control_pid:
        return str(control_pid)
    return f'{native_pid} [dim](ns {control_pid})[/dim]'
91
+
92
+
73
93
  def _build_table(workers):
74
94
  table = Table(
75
95
  title='[bold cyan]Workers — desired vs actual[/bold cyan]', border_style='cyan', header_style='bold magenta'
@@ -114,7 +134,7 @@ def _build_table(workers):
114
134
  w['worker_type'],
115
135
  f'[{desired_style}]{desired}[/{desired_style}]',
116
136
  f'[{status_style}]{status}[/{status_style}]',
117
- str(w['pid']) if w['pid'] else '-',
137
+ _format_pid(w),
118
138
  last_action,
119
139
  params_str,
120
140
  )
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import os
2
3
  import psutil
3
4
  import subprocess
4
5
 
@@ -64,6 +65,81 @@ def is_process_running(pid):
64
65
  return False
65
66
 
66
67
 
68
def resolve_system_pid(pid, worker_key=None):
    """Return the most-native PID visible for ``pid``.

    ``pid`` remains the control PID used by the current process namespace. On
    Linux, ``CRAZY_WORKERS_HOST_PROC`` can point at a read-only host procfs mount
    (for example /host/proc in Docker) so status can show the host PID. Without
    that mount, /proc exposes NSpid when PID namespaces are visible; its first
    value is the PID in the outermost namespace visible to this procfs mount. On
    Windows and ordinary Linux hosts this is just ``pid``.

    Args:
        pid: Control PID in the current namespace, or ``None``.
        worker_key: Optional worker key, forwarded to the host-procfs lookup to
            disambiguate processes that share the same namespace-local PID.

    Returns:
        ``None`` when ``pid`` is ``None``; otherwise the resolved PID, falling
        back to ``pid`` itself whenever nothing better can be read.
    """
    if pid is None:
        return None
    if os.name != 'posix':
        return pid

    host_pid = _resolve_from_host_proc(pid, worker_key=worker_key)
    if host_pid is not None:
        return host_pid

    # Best effort: an unreadable/missing status file or a malformed NSpid line
    # simply means we report the control PID unchanged.
    try:
        with open(f'/proc/{pid}/status', encoding='utf-8') as status_file:
            # Reuse the shared NSpid parser instead of duplicating it here.
            nspid = _read_nspid(status_file)
    except (OSError, ValueError):
        return pid

    # No NSpid line (None) or an empty one ([]) both fall back to ``pid``.
    return nspid[0] if nspid else pid
97
+
98
+
99
def _resolve_from_host_proc(pid, worker_key=None):
    """Look ``pid`` up in the procfs mount named by ``CRAZY_WORKERS_HOST_PROC``.

    Scans every numeric entry of that directory, reads its ``status`` file and
    selects the first process whose last ``NSpid`` value equals ``pid``. When
    ``worker_key`` is given, the candidate's command line must also match it.

    Args:
        pid: PID as seen from the current namespace.
        worker_key: Optional worker key used to confirm the candidate.

    Returns:
        The first ``NSpid`` value of the matching process, or ``None`` when the
        variable is unset, the mount cannot be listed, or nothing matches.
    """
    host_proc = os.environ.get('CRAZY_WORKERS_HOST_PROC')
    if not host_proc:
        return None

    # This lookup is best effort: a missing path, a non-directory, an
    # unreadable mount or a directory that vanishes mid-call must not crash
    # the caller, so every listing failure is treated as "no host procfs".
    try:
        entries = os.listdir(host_proc)
    except OSError:
        return None

    for entry in entries:
        if not entry.isdigit():
            continue

        status_path = os.path.join(host_proc, entry, 'status')
        try:
            with open(status_path, encoding='utf-8') as f:
                nspid = _read_nspid(f)
        except (OSError, ValueError):
            # Process exited while scanning, or its status is unreadable/malformed.
            continue

        if not nspid or nspid[-1] != pid:
            continue
        # NOTE(review): without worker_key the first process whose last NSpid
        # value equals ``pid`` wins; unrelated processes may share that value.
        if worker_key and not _host_proc_cmdline_matches(host_proc, entry, worker_key):
            continue
        return nspid[0]

    return None
124
+
125
+
126
def _read_nspid(lines):
    """Return the integers listed on the first ``NSpid:`` line of ``lines``.

    ``lines`` is any iterable of text lines (for example an open status file).
    Returns ``None`` when no line starts with ``NSpid:``. A non-numeric value on
    that line raises ``ValueError``.
    """
    nspid_line = next((text for text in lines if text.startswith('NSpid:')), None)
    if nspid_line is None:
        return None
    # The first whitespace-separated token is the ``NSpid:`` label itself.
    return list(map(int, nspid_line.split()[1:]))
131
+
132
+
133
def _host_proc_cmdline_matches(host_proc, host_pid, worker_key):
    """Tell whether the process's command line contains the worker-key token.

    Reads ``<host_proc>/<host_pid>/cmdline`` as bytes, splits it on NUL bytes
    and compares each non-empty, decoded argument with
    ``worker_key_token(worker_key)``. Returns ``False`` when the file cannot be
    read.
    """
    cmdline_path = os.path.join(host_proc, host_pid, 'cmdline')
    try:
        with open(cmdline_path, 'rb') as handle:
            payload = handle.read()
    except OSError:
        return False

    wanted = worker_key_token(worker_key)
    # Undecodable bytes are dropped rather than failing the comparison.
    return any(chunk.decode(errors='ignore') == wanted for chunk in payload.split(b'\0') if chunk)
141
+
142
+
67
143
  def terminate_process(pid, timeout=5, popen_process=None, exclude_pids=None):
68
144
  """Gracefully terminates a process and its non-managed children.
69
145
 
@@ -107,8 +107,8 @@ class WorkerManager:
107
107
  else:
108
108
  self.storage = None
109
109
 
110
- def start_worker(self, worker_type, worker_key=None, parameters=None, env=None):
111
- return start_worker(self, worker_type, worker_key, parameters, env)
110
+ def start_worker(self, worker_type, worker_key=None, parameters=None, env=None, reset_backoff=True):
111
+ return start_worker(self, worker_type, worker_key, parameters, env, reset_backoff)
112
112
 
113
113
  def stop_worker(self, worker_key):
114
114
  return stop_worker(self, worker_key)
@@ -19,7 +19,7 @@ logger = logging.getLogger('crazy_workers')
19
19
  _SAFE_NAME = re.compile(r'[A-Za-z0-9_-]+')
20
20
 
21
21
 
22
- def start_worker(manager, worker_type, worker_key=None, parameters=None, env=None):
22
+ def start_worker(manager, worker_type, worker_key=None, parameters=None, env=None, reset_backoff=True):
23
23
  if not manager.storage:
24
24
  return False, 'System not initialized (database missing)'
25
25
 
@@ -43,7 +43,7 @@ def start_worker(manager, worker_type, worker_key=None, parameters=None, env=Non
43
43
  worker.status = WorkerStatus.STOPPED
44
44
  return False, f'Worker file {worker_type}.py not found'
45
45
 
46
- return _spawn_worker_process(manager, worker, worker_path, parameters, env, session)
46
+ return _spawn_worker_process(manager, worker, worker_path, parameters, env, session, reset_backoff)
47
47
 
48
48
 
49
49
  def _validate_inputs(worker_type, worker_key):
@@ -107,7 +107,7 @@ def _get_worker_script_path(manager, worker_type):
107
107
  return worker_path
108
108
 
109
109
 
110
- def _spawn_worker_process(manager, worker, worker_path, parameters, env, session):
110
+ def _spawn_worker_process(manager, worker, worker_path, parameters, env, session, reset_backoff=True):
111
111
  log_file_path = os.path.join(manager.logs_dir, f'{worker.worker_key}.log')
112
112
 
113
113
  # The manager's worker_env is the baseline for every worker (e.g. the host
@@ -132,8 +132,13 @@ def _spawn_worker_process(manager, worker, worker_path, parameters, env, session
132
132
  worker.pid = handle.pid
133
133
  worker.status = WorkerStatus.RUNNING
134
134
  worker.last_started_at = func.now()
135
- # A clean start clears the crash backoff so the next failure starts over.
136
- worker.restart_count = 0
135
+ # A clean start (CLI/client/manual restart) clears the crash backoff so the
136
+ # next failure starts over. An automatic reconciler restart passes
137
+ # reset_backoff=False: a fast crash-loop must keep accumulating restart_count
138
+ # across respawns, otherwise the exponential backoff can never escalate. The
139
+ # reconciler instead resets the counter once the worker is observed healthy.
140
+ if reset_backoff:
141
+ worker.restart_count = 0
137
142
  session.commit()
138
143
 
139
144
  manager._active_processes[worker.worker_key] = handle
@@ -0,0 +1,181 @@
1
+ import logging
2
+ import time
3
+ from datetime import datetime, timedelta, timezone
4
+
5
+ from ..database.schema import DesiredStatus, Worker, WorkerStatus
6
+
7
+
8
+ logger = logging.getLogger('crazy_workers')
9
+
10
+ _BACKOFF_BASE_SECONDS = 1
11
+ _BACKOFF_MAX_SECONDS = 60
12
+ # Cap the exponent so a long-crashed worker doesn't compute an astronomically
13
+ # large intermediate before min() clamps it.
14
+ _BACKOFF_MAX_EXPONENT = 16
15
+
16
+
17
class Reconciler:
    """Single-owner loop: drives actual worker state toward desired state.

    Owns every worker process for one workers_dir/DB. Clients never spawn; they
    only set desired_status in the shared DB and this loop makes it so.

    | desired  | alive | observed        | action                                     |
    |----------|-------|-----------------|--------------------------------------------|
    | RUNNING  | no    | RUNNING         | record crash (mark CRASHED, +restart_count)|
    | RUNNING  | no    | other           | start (skipped while in backoff)           |
    | RUNNING  | yes   | RUNNING         | noop (reset backoff once proven healthy)   |
    | RUNNING  | yes   | other           | heal observed status to RUNNING            |
    | STOPPED  | yes   | -               | stop                                       |
    | STOPPED  | no    | RUNNING/CRASHED | heal stale observed status to STOPPED      |
    | STOPPED  | no    | STOPPED         | noop                                       |

    Crash detection is split from the restart on purpose. A worker that dies the
    instant it starts comes back from ``start_worker`` marked RUNNING; if the same
    pass that noticed the dead PID respawned it, its status would flip to RUNNING
    again before the backoff branch ever saw CRASHED, and an instantly-crashing
    worker would respawn at full loop speed forever. Instead the first pass records
    the death (CRASHED + a bumped restart_count), and a later pass restarts it once
    the exponential backoff has elapsed.
    """

    def __init__(self, manager, interval=2.0):
        """Bind the reconciler to ``manager`` and set the pause between passes (seconds)."""
        self.manager = manager
        self.interval = interval
        self._stop = False

    def run_forever(self):
        """Run reconcile passes until :meth:`stop` is called.

        A failing pass is logged and the loop carries on with the next one.
        """
        logger.info('Reconciler started (interval=%ss)', self.interval)
        while not self._stop:
            try:
                self.reconcile_once()
            except Exception:
                logger.exception('Reconcile pass failed; continuing.')
            # Sleep in small slices so a SIGTERM-triggered stop is honoured promptly
            # instead of after a full interval.
            self._interruptible_sleep(self.interval)
        logger.info('Reconciler stopped.')

    def stop(self):
        """Ask the loop to exit; takes effect at the next stop-flag check."""
        self._stop = True

    def _interruptible_sleep(self, seconds):
        """Sleep up to ``seconds`` in slices of at most 0.2s, returning early on stop."""
        deadline = time.monotonic() + seconds
        while not self._stop:
            # Read the clock once per slice. Checking it in the loop condition
            # and again for the sleep length lets the deadline pass in between,
            # and time.sleep() raises ValueError on a negative length.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(0.2, remaining))

    def reconcile_once(self):
        """One pass over every worker. Returns the actions taken (for tests/observability)."""
        actions = []
        for row in self._load_snapshot():
            action = self._reconcile_worker(row)
            if action:
                actions.append((row['worker_key'], action))
        return actions

    def _load_snapshot(self):
        """Return every ``Worker`` row as a plain dict of the fields a pass needs."""
        # Read everything we need into plain dicts and release the session before
        # touching processes — start/stop open their own short-lived sessions.
        with self.manager.storage.session_scope() as session:
            return [
                {
                    'worker_key': w.worker_key,
                    'worker_type': w.worker_type,
                    'parameters': w.parameters,
                    'pid': w.pid,
                    'desired': w.desired_status,
                    'status': w.status,
                    'restart_count': w.restart_count,
                    'last_exit_at': w.last_exit_at,
                }
                for w in session.query(Worker).all()
            ]

    def _reconcile_worker(self, row):
        """Apply the decision table to one snapshot row.

        Returns the action name taken (``'crashed'``, ``'start'``, ``'stop'``,
        ``'mark_stopped'``, ``'mark_running'``, ``'reset_backoff'``) or ``None``
        when nothing had to be done.
        """
        alive = self.manager.backend.is_alive(pid=row['pid'], worker_key=row['worker_key'])

        if row['desired'] == DesiredStatus.RUNNING and not alive:
            if row['status'] == WorkerStatus.RUNNING:
                # We last saw it RUNNING but the process is gone: it crashed. Record the
                # death now and let a later pass restart it — see the class docstring for
                # why detection and restart are split across passes.
                logger.warning('Reconcile: worker %s died; recording crash', row['worker_key'])
                self._record_crash(row['worker_key'])
                return 'crashed'
            if self._in_backoff(row):
                return None
            logger.info('Reconcile: starting %s', row['worker_key'])
            # reset_backoff=False: an automatic restart must not wipe the crash history
            # a fast crash-loop is accumulating, or the backoff could never escalate.
            self.manager.start_worker(row['worker_type'], row['worker_key'], row['parameters'], reset_backoff=False)
            return 'start'
        if row['desired'] == DesiredStatus.STOPPED:
            if alive:
                logger.info('Reconcile: stopping %s', row['worker_key'])
                self.manager.stop_worker(row['worker_key'])
                return 'stop'
            if row['status'] not in (WorkerStatus.STOPPED, WorkerStatus.NEVER_STARTED):
                # Desired down and the process is already gone, but the observed status is
                # stale — RUNNING left by the last spawn, or CRASHED. Nothing kills the
                # process (it is dead) and stop_worker only handles a RUNNING worker, so
                # converge the observed status here, or the table shows a phantom
                # RUNNING/CRASHED with a dead PID forever.
                self._mark_stopped(row['worker_key'])
                return 'mark_stopped'
            return None
        if row['desired'] == DesiredStatus.RUNNING and alive:
            if row['status'] != WorkerStatus.RUNNING:
                # Process is up but the observed status drifted (e.g. left STARTING). Heal it.
                self._mark_running(row['worker_key'])
                return 'mark_running'
            if row['restart_count']:
                # Alive a full pass after its last spawn — proven healthy. Clear the crash
                # backoff so a future crash starts counting (and backing off) from zero.
                self._reset_backoff(row['worker_key'])
                return 'reset_backoff'
        return None

    def _in_backoff(self, row):
        """Return True while a CRASHED worker must still wait before being restarted.

        The delay is ``_BACKOFF_BASE_SECONDS * 2**restart_count`` capped at
        ``_BACKOFF_MAX_SECONDS``, measured from ``last_exit_at``.
        """
        if not row['last_exit_at'] or row['status'] != WorkerStatus.CRASHED:
            return False
        # restart_count may be NULL in the DB (_record_crash guards the same way);
        # treat it as zero instead of letting min() raise TypeError on None.
        exponent = min(row['restart_count'] or 0, _BACKOFF_MAX_EXPONENT)
        delay = min(_BACKOFF_BASE_SECONDS * (2**exponent), _BACKOFF_MAX_SECONDS)
        last_exit = row['last_exit_at']
        # last_exit_at is stored as UTC wall-clock; coerce naive values read back
        # from the DB to aware UTC so the comparison never mixes naive and aware.
        if last_exit.tzinfo is None:
            last_exit = last_exit.replace(tzinfo=timezone.utc)
        return datetime.now(timezone.utc) < last_exit + timedelta(seconds=delay)

    def _mark_running(self, worker_key):
        """Set the observed status of ``worker_key`` to RUNNING, if the row exists."""
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.status = WorkerStatus.RUNNING

    def _record_crash(self, worker_key):
        """Persist a worker death: CRASHED, exit time, bumped restart_count, pid cleared."""
        # Persist the death so the next pass's backoff gate can see it: mark CRASHED,
        # bump the restart counter and stamp the exit time in UTC (Python-side, so the
        # backoff math does not depend on the DB dialect's now()/timezone semantics).
        # pid is cleared so we stop probing a dead (and potentially reused) PID.
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.status = WorkerStatus.CRASHED
                worker.last_exit_at = datetime.now(timezone.utc)
                worker.restart_count = (worker.restart_count or 0) + 1
                worker.pid = None

    def _reset_backoff(self, worker_key):
        """Zero the restart counter of ``worker_key``, if the row exists."""
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.restart_count = 0

    def _mark_stopped(self, worker_key):
        """Converge a dead worker's row to STOPPED: clear pid and stamp last_stopped_at (UTC)."""
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.status = WorkerStatus.STOPPED
                worker.pid = None
                worker.last_stopped_at = datetime.now(timezone.utc)
@@ -0,0 +1,45 @@
1
+ """Worker-side parsing of the JSON parameters the daemon passes on argv.
2
+
3
+ The manager spawns every worker as ``python -m crazy_workers._bootstrap
4
+ <worker_path> <json_params>`` and the bootstrap restores ``sys.argv`` to
5
+ ``[worker_path, json_params]``. This module is the worker-side counterpart of
6
+ that contract: call :func:`parse_params` at the top of a worker's ``main()``
7
+ instead of re-implementing the argv/JSON boilerplate in every worker script.
8
+ """
9
+
10
+ import json
11
+ import sys
12
+
13
+
14
def parse_params(argv=None, required=()):
    """Return the parameters dict the daemon passed to this worker.

    A worker launched with no parameters gets ``{}`` unless ``required`` names
    at least one key. Malformed JSON or a JSON value that is not an object
    always aborts the worker; a missing argv or a missing/empty required key
    aborts it when ``required`` is given. Aborting means ``SystemExit`` with the
    message as its code (printed on stderr, exit code 1) before any real work.
    A key counts as missing when it is absent, ``None``, or an empty string; a
    falsy-but-present value like ``0`` or ``False`` is kept.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv``. Pass a list
            explicitly in tests.
        required: Names of the parameters that must be present. A single name
            may be given as a plain string.

    Returns:
        The decoded parameters as a ``dict``.

    Raises:
        SystemExit: On invalid JSON, a non-object JSON value, or missing
            required parameters.
    """
    argv = sys.argv if argv is None else argv
    # A bare string is one parameter name, not an iterable of one-character
    # names (``required=('device_id')`` is an easy typo for a 1-tuple).
    required = (required,) if isinstance(required, str) else tuple(required)

    if len(argv) < 2:
        if required:
            raise SystemExit(f'Missing required parameters: {", ".join(required)}')
        return {}

    try:
        params = json.loads(argv[1])
    except json.JSONDecodeError as error:
        raise SystemExit(f'Invalid JSON parameters: {error}') from error

    if not isinstance(params, dict):
        raise SystemExit('Invalid JSON parameters: expected a JSON object')

    missing = [key for key in required if key not in params or params[key] in (None, '')]
    if missing:
        raise SystemExit(f'Missing required parameters: {", ".join(missing)}')

    return params
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crazy-workers
3
- Version: 1.4.2
3
+ Version: 1.5.1
4
4
  Summary: A Python library for managing background worker processes with persistent state, automatic recovery, and a CLI.
5
5
  Author: GioVanni Colasanto
6
6
  License: MIT
@@ -76,9 +76,11 @@ pip install .
76
76
 
77
77
  ```python
78
78
  # workers/my_worker.py
79
- import json, sys, time
79
+ import time
80
80
 
81
- params = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
81
+ from crazy_workers import parse_params
82
+
83
+ params = parse_params()
82
84
  duration = params.get('duration', 60)
83
85
 
84
86
  for _ in range(duration):
@@ -176,15 +178,28 @@ Closes the database connection and clears internal process references. Does **no
176
178
 
177
179
  ## Worker Script Contract
178
180
 
179
- A worker receives its parameters as a JSON string in `sys.argv[1]`:
181
+ A worker receives its parameters as a JSON string in `sys.argv[1]`. Use
182
+ `parse_params` instead of decoding it by hand:
180
183
 
181
184
  ```python
182
- import json, sys
185
+ from crazy_workers import parse_params
183
186
 
184
- params = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
187
+ params = parse_params()
185
188
  # ... do work ...
186
189
  ```
187
190
 
191
+ Without arguments it is lenient: a worker launched with no parameters gets
192
+ `{}`. Pass `required=` to abort before any real work when a parameter the
193
+ worker cannot run without is missing or empty — the worker exits with code 1
194
+ and a message on stderr:
195
+
196
+ ```python
197
+ params = parse_params(required=('device_id', 'output_dir'))
198
+ ```
199
+
200
+ `parse_params(argv=...)` accepts an explicit argv list, which makes the
201
+ parsing trivially testable without patching `sys.argv`.
202
+
188
203
  A worker is a separate process, so it cannot be handed a live object (e.g. a DB
189
204
  connection). Pass **configuration** instead: the manager's `worker_env` (and any
190
205
  per-call `env`) is injected as environment variables, so a worker reads, say,
@@ -4,6 +4,7 @@ pyproject.toml
4
4
  crazy_workers/__init__.py
5
5
  crazy_workers/_bootstrap.py
6
6
  crazy_workers/client.py
7
+ crazy_workers/params.py
7
8
  crazy_workers.egg-info/PKG-INFO
8
9
  crazy_workers.egg-info/SOURCES.txt
9
10
  crazy_workers.egg-info/dependency_links.txt
@@ -45,4 +46,5 @@ crazy_workers/database/schema.py
45
46
  crazy_workers/database/storage.py
46
47
  crazy_workers/testing/__init__.py
47
48
  crazy_workers/testing/backends.py
48
- crazy_workers/testing/polling.py
49
+ crazy_workers/testing/polling.py
50
+ tests/test_params.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "crazy-workers"
3
- version = "1.4.2"
3
+ version = "1.5.1"
4
4
  description = "A Python library for managing background worker processes with persistent state, automatic recovery, and a CLI."
5
5
  readme = "README.md"
6
6
  authors = [{ name = "GioVanni Colasanto" }]
@@ -63,7 +63,7 @@ omit = ["*/venv/*", "*/build/*", "tests/*"]
63
63
  [tool.coverage.report]
64
64
  show_missing = true
65
65
  skip_covered = false
66
- fail_under = 95
66
+ fail_under = 90
67
67
 
68
68
  [tool.ruff]
69
69
  line-length = 120
@@ -0,0 +1,51 @@
1
+ import unittest
2
+ from unittest.mock import patch
3
+
4
+ from crazy_workers import parse_params
5
+
6
+
7
class TestParseParams(unittest.TestCase):
    """Behavioural tests for ``parse_params``: lenient mode, required keys and failures."""

    def _exit_code(self, argv, **kwargs):
        """Call ``parse_params`` expecting ``SystemExit`` and return its ``code``."""
        with self.assertRaises(SystemExit) as ctx:
            parse_params(argv, **kwargs)
        return ctx.exception.code

    def test_no_argv_returns_empty_dict(self):
        result = parse_params(['worker.py'])
        self.assertEqual(result, {})

    def test_no_argv_with_required_exits(self):
        code = self._exit_code(['worker.py'], required=('device_id', 'output_dir'))
        self.assertEqual(code, 'Missing required parameters: device_id, output_dir')

    def test_invalid_json_exits(self):
        code = self._exit_code(['worker.py', 'not json'])
        self.assertIn('Invalid JSON parameters', str(code))

    def test_non_object_json_exits(self):
        code = self._exit_code(['worker.py', '[1, 2, 3]'])
        self.assertEqual(code, 'Invalid JSON parameters: expected a JSON object')

    def test_returns_parsed_params(self):
        result = parse_params(['worker.py', '{"device_id": "7"}'])
        self.assertEqual(result, {'device_id': '7'})

    def test_required_present_returns_params(self):
        result = parse_params(
            ['worker.py', '{"device_id": "7", "output_dir": "/out"}'],
            required=('device_id', 'output_dir'),
        )
        self.assertEqual(result, {'device_id': '7', 'output_dir': '/out'})

    def test_required_missing_exits_naming_keys(self):
        code = self._exit_code(['worker.py', '{"device_id": "7"}'], required=('device_id', 'output_dir'))
        self.assertEqual(code, 'Missing required parameters: output_dir')

    def test_required_empty_value_counts_as_missing(self):
        code = self._exit_code(['worker.py', '{"device_id": ""}'], required=('device_id',))
        self.assertEqual(code, 'Missing required parameters: device_id')

    def test_required_falsy_but_present_value_is_kept(self):
        result = parse_params(
            ['worker.py', '{"retries": 0, "verbose": false}'],
            required=('retries', 'verbose'),
        )
        self.assertEqual(result, {'retries': 0, 'verbose': False})

    def test_defaults_to_sys_argv(self):
        # No argv argument: the function must fall back to the patched sys.argv.
        with patch('sys.argv', ['worker.py', '{"interval": 5}']):
            result = parse_params()
        self.assertEqual(result, {'interval': 5})
@@ -1,6 +0,0 @@
1
- from .client import WorkerClient
2
- from .core.manager import WorkerManager
3
- from .database.schema import DesiredStatus, WorkerStatus
4
-
5
-
6
- __all__ = ['WorkerManager', 'WorkerClient', 'WorkerStatus', 'DesiredStatus']
@@ -1,118 +0,0 @@
1
- import logging
2
- import time
3
- from datetime import datetime, timedelta, timezone
4
-
5
- from ..database.schema import DesiredStatus, Worker, WorkerStatus
6
-
7
-
8
logger = logging.getLogger('crazy_workers')

_BACKOFF_BASE_SECONDS = 1
_BACKOFF_MAX_SECONDS = 60
# Cap the exponent so a long-crashed worker doesn't compute an astronomically
# large intermediate before min() clamps it.
_BACKOFF_MAX_EXPONENT = 16
# Longest single sleep slice; bounds how long stop() can go unnoticed.
_SLEEP_SLICE_SECONDS = 0.2


class Reconciler:
    """Single-owner loop: drives actual worker state toward desired state.

    Owns every worker process for one workers_dir/DB. Clients never spawn; they
    only set desired_status in the shared DB and this loop makes it so.

    | desired  | alive | action                              |
    |----------|-------|-------------------------------------|
    | RUNNING  | no    | start (skipped while in backoff)    |
    | RUNNING  | yes   | noop (reconcile observed status)    |
    | STOPPED  | yes   | stop                                |
    | STOPPED  | no    | noop                                |
    """

    def __init__(self, manager, interval=2.0):
        """Store the manager to drive and the pause (seconds) between passes.

        ``manager`` is used through its ``storage.session_scope()``,
        ``backend.is_alive()``, ``start_worker()`` and ``stop_worker()`` members.
        """
        self.manager = manager
        self.interval = interval
        self._stop = False

    def run_forever(self):
        """Run reconcile passes until stop() is called.

        A failing pass is logged and the loop carries on with the next one.
        """
        logger.info('Reconciler started (interval=%ss)', self.interval)
        while not self._stop:
            try:
                self.reconcile_once()
            except Exception:
                logger.exception('Reconcile pass failed; continuing.')
            # Sleep in small slices so a SIGTERM-triggered stop is honoured promptly
            # instead of after a full interval.
            self._interruptible_sleep(self.interval)
        logger.info('Reconciler stopped.')

    def stop(self):
        """Ask run_forever() to exit; observed at the next loop or sleep-slice check."""
        self._stop = True

    def _interruptible_sleep(self, seconds):
        """Sleep up to ``seconds``, returning early once stop() has been called.

        Zero or negative durations return immediately.
        """
        deadline = time.monotonic() + seconds
        while not self._stop:
            # Read the clock once per iteration: comparing against the deadline
            # and computing the slice from two separate reads can yield a
            # negative slice, and time.sleep() raises ValueError on negatives.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(_SLEEP_SLICE_SECONDS, remaining))

    def reconcile_once(self):
        """One pass over every worker. Returns the actions taken (for tests/observability).

        The result is a list of ``(worker_key, action)`` tuples for workers on
        which an action was performed. A worker whose reconciliation raises is
        logged and skipped so the remaining workers are still processed in the
        same pass; errors loading the snapshot itself still propagate.
        """
        actions = []
        for row in self._load_snapshot():
            try:
                action = self._reconcile_worker(row)
            except Exception:
                # Isolate the failure: otherwise one worker that always fails to
                # start/stop would starve every worker after it in the snapshot.
                logger.exception(
                    'Reconcile failed for %s; continuing with remaining workers.',
                    row['worker_key'],
                )
                continue
            if action:
                actions.append((row['worker_key'], action))
        return actions

    def _load_snapshot(self):
        """Return one plain dict per Worker row, read inside a single session."""
        # Read everything we need into plain dicts and release the session before
        # touching processes — start/stop open their own short-lived sessions.
        with self.manager.storage.session_scope() as session:
            return [
                {
                    'worker_key': w.worker_key,
                    'worker_type': w.worker_type,
                    'parameters': w.parameters,
                    'pid': w.pid,
                    'desired': w.desired_status,
                    'status': w.status,
                    'restart_count': w.restart_count,
                    'last_exit_at': w.last_exit_at,
                }
                for w in session.query(Worker).all()
            ]

    def _reconcile_worker(self, row):
        """Apply the decision table to one snapshot row.

        Returns 'start', 'stop' or 'mark_running' when an action was taken,
        otherwise None.
        """
        alive = self.manager.backend.is_alive(pid=row['pid'], worker_key=row['worker_key'])

        if row['desired'] == DesiredStatus.RUNNING and not alive:
            if self._in_backoff(row):
                return None
            logger.info('Reconcile: starting %s', row['worker_key'])
            self.manager.start_worker(row['worker_type'], row['worker_key'], row['parameters'])
            return 'start'
        if row['desired'] == DesiredStatus.STOPPED and alive:
            logger.info('Reconcile: stopping %s', row['worker_key'])
            self.manager.stop_worker(row['worker_key'])
            return 'stop'
        if row['desired'] == DesiredStatus.RUNNING and alive and row['status'] != WorkerStatus.RUNNING:
            # Process is up but the observed status drifted (e.g. left STARTING). Heal it.
            self._mark_running(row['worker_key'])
            return 'mark_running'
        return None

    def _in_backoff(self, row):
        """Return True while a crashed worker is still inside its restart delay.

        The delay doubles with each restart (base 1s) and is capped at 60s.
        Rows without a recorded exit time, or whose status is not CRASHED, are
        never in backoff.
        """
        if not row['last_exit_at'] or row['status'] != WorkerStatus.CRASHED:
            return False
        # Treat a missing restart_count as zero instead of failing in min().
        exponent = min(row['restart_count'] or 0, _BACKOFF_MAX_EXPONENT)
        delay = min(_BACKOFF_BASE_SECONDS * (2**exponent), _BACKOFF_MAX_SECONDS)
        last_exit = row['last_exit_at']
        # last_exit_at is stored as UTC wall-clock; coerce naive values read back
        # from the DB to aware UTC so the comparison never mixes naive and aware.
        if last_exit.tzinfo is None:
            last_exit = last_exit.replace(tzinfo=timezone.utc)
        return datetime.now(timezone.utc) < last_exit + timedelta(seconds=delay)

    def _mark_running(self, worker_key):
        """Set the stored status of ``worker_key`` to RUNNING if the row exists."""
        with self.manager.storage.session_scope() as session:
            worker = session.query(Worker).filter_by(worker_key=worker_key).first()
            if worker:
                worker.status = WorkerStatus.RUNNING
File without changes
File without changes