ox-proc 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ox_proc/__init__.py +45 -0
- ox_proc/core.py +448 -0
- ox_proc-0.1.0.dist-info/METADATA +129 -0
- ox_proc-0.1.0.dist-info/RECORD +5 -0
- ox_proc-0.1.0.dist-info/WHEEL +4 -0
ox_proc/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Launch, monitor, and clean up detached external processes.
|
|
2
|
+
|
|
3
|
+
See `ox_proc.core` for full documentation. The public API is
|
|
4
|
+
re-exported here so callers can simply ``import ox_proc``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ox_proc.core import (
|
|
8
|
+
DEFAULT_MAX_AGE_SECONDS,
|
|
9
|
+
DEFAULT_MAX_RUNTIME_SECONDS,
|
|
10
|
+
DEFAULT_TTL_SECONDS,
|
|
11
|
+
RUN_DIR_ENV_VAR,
|
|
12
|
+
OxProcError,
|
|
13
|
+
TooManyLiveError,
|
|
14
|
+
cleanup,
|
|
15
|
+
count_live,
|
|
16
|
+
default_base_dir,
|
|
17
|
+
generate_run_id,
|
|
18
|
+
get_info,
|
|
19
|
+
kill_run,
|
|
20
|
+
launch,
|
|
21
|
+
post_message,
|
|
22
|
+
run_dir_for,
|
|
23
|
+
write_result,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__version__ = '0.1.0'
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
'DEFAULT_MAX_AGE_SECONDS',
|
|
30
|
+
'DEFAULT_MAX_RUNTIME_SECONDS',
|
|
31
|
+
'DEFAULT_TTL_SECONDS',
|
|
32
|
+
'RUN_DIR_ENV_VAR',
|
|
33
|
+
'OxProcError',
|
|
34
|
+
'TooManyLiveError',
|
|
35
|
+
'cleanup',
|
|
36
|
+
'count_live',
|
|
37
|
+
'default_base_dir',
|
|
38
|
+
'generate_run_id',
|
|
39
|
+
'get_info',
|
|
40
|
+
'kill_run',
|
|
41
|
+
'launch',
|
|
42
|
+
'post_message',
|
|
43
|
+
'run_dir_for',
|
|
44
|
+
'write_result',
|
|
45
|
+
]
|
ox_proc/core.py
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
"""Launch, monitor, and clean up detached external processes.
|
|
2
|
+
|
|
3
|
+
The `ox_proc` package lets a short-lived caller (e.g., a web request
|
|
4
|
+
handler) launch a long-running command as a *detached* subprocess and
|
|
5
|
+
later inspect its progress from any other process, with no broker or
|
|
6
|
+
persistent worker required. All state lives on disk in a per-run
|
|
7
|
+
directory:
|
|
8
|
+
|
|
9
|
+
<base_dir>/<slug>/<run_id>/
|
|
10
|
+
status.json written by launcher; finalized by observers
|
|
11
|
+
messages.jsonl appended by the child via `post_message()`
|
|
12
|
+
result.json written by the child via `write_result()`
|
|
13
|
+
stdout.log child's stdout
|
|
14
|
+
stderr.log child's stderr
|
|
15
|
+
|
|
16
|
+
Conventions and caveats:
|
|
17
|
+
|
|
18
|
+
* The launched command should call `write_result()` on success. A
|
|
19
|
+
dead process with no ``result.json`` is reported as state "died".
|
|
20
|
+
* Because the child is detached, its true exit code and end time are
|
|
21
|
+
lost; observers record an *observed* end time when they first notice
|
|
22
|
+
the process is gone. TTL-based cleanup counts from that time.
|
|
23
|
+
* The default base directory lives under `tempfile.gettempdir()`, so
|
|
24
|
+
the operating system may purge it (e.g., on reboot or via tmpfiles
|
|
25
|
+
cleaning); run history is therefore not durable.
|
|
26
|
+
* Liveness checks verify both PID existence and the process creation
|
|
27
|
+
time recorded at launch, guarding against PID reuse.
|
|
28
|
+
|
|
29
|
+
This module is organized with the public API first and private
|
|
30
|
+
helpers (named with a leading underscore) gathered at the end.
|
|
31
|
+
|
|
32
|
+
Requires the `psutil` package.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import datetime
|
|
38
|
+
import getpass
|
|
39
|
+
import json
|
|
40
|
+
import os
|
|
41
|
+
import secrets
|
|
42
|
+
import subprocess
|
|
43
|
+
import tempfile
|
|
44
|
+
from typing import Any, Iterator, Optional, Sequence
|
|
45
|
+
|
|
46
|
+
import psutil
|
|
47
|
+
|
|
48
|
+
# Retention and runtime limits; every value is a number of seconds.
DEFAULT_TTL_SECONDS = 24 * 3600  # keep finished runs this long
DEFAULT_MAX_RUNTIME_SECONDS = 8 * 3600  # kill live runs after this
DEFAULT_MAX_AGE_SECONDS = 24 * 3600  # deletion backstop from launch
# Name of the environment variable through which launch() tells the
# child process where its run directory is.
RUN_DIR_ENV_VAR = 'OX_PROC_RUN_DIR'

# The only characters a slug may contain (enforced by _sanitize_slug).
_SLUG_CHARS = set('abcdefghijklmnopqrstuvwxyz0123456789_-')
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class OxProcError(Exception):
    """Base class for errors raised by this package.

    Subclassed by `TooManyLiveError`; also raised directly when the
    run-directory environment variable is not set in a process that
    calls `post_message()` or `write_result()`.
    """
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class TooManyLiveError(OxProcError):
    """Raised when a launch would exceed the live-run limit for a slug.

    `launch()` raises this when `max_live` is given and the slug
    already has at least that many runs in state "running".
    """
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ----------------------------------------------------------------------
|
|
65
|
+
# Public API
|
|
66
|
+
# ----------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
def default_base_dir() -> str:
    """Return the per-user default root under which runs are tracked.

    The directory sits inside the system temporary directory and
    embeds the current username, so several users sharing that
    temporary directory do not collide.
    """
    dir_name = f'ox_proc-{getpass.getuser()}'
    return os.path.join(tempfile.gettempdir(), dir_name)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def generate_run_id(slug: str) -> str:
    """Build a fresh run ID shaped like ``<slug>-<timestamp>-<rand>``.

    The timestamp is local time with one-second resolution and the
    random part is four hex digits. Raises ValueError if `slug` is
    not a valid slug.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
    checked_slug = _sanitize_slug(slug)
    suffix = secrets.token_hex(2)
    return '-'.join((checked_slug, timestamp, suffix))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def run_dir_for(run_id: str, base_dir: Optional[str] = None) -> str:
    """Return the path of the directory that holds the files of `run_id`.

    The layout is ``<base_dir>/<slug>/<run_id>``, where the slug is
    taken from the run ID itself. A missing or empty `base_dir`
    falls back to `default_base_dir()`. The path is only computed;
    nothing is created or checked on disk.
    """
    if not base_dir:
        base_dir = default_base_dir()
    slug = _slug_from_run_id(run_id)
    return os.path.join(base_dir, slug, run_id)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def launch(cmd: Sequence[str], slug: str,
           description: Optional[str] = None,
           env_updates: Optional[dict[str, str]] = None,
           base_dir: Optional[str] = None,
           ttl_seconds: float = DEFAULT_TTL_SECONDS,
           max_runtime_seconds: Optional[float] =
           DEFAULT_MAX_RUNTIME_SECONDS,
           max_live: Optional[int] = None) -> str:
    """Launch `cmd` (a list of strings) as a detached subprocess.

    The child is placed in its own session so it survives the caller's
    exit; its stdout/stderr go to log files in the run directory, and
    the environment variable named by `RUN_DIR_ENV_VAR` points the
    child at that directory so it can use `post_message()` and
    `write_result()`.

    `env_updates`, if given, is merged into a copy of the caller's
    environment. If `max_live` is given and at least that many runs
    with this slug are still alive, `TooManyLiveError` is raised (a
    small launch race is accepted; the limit is best-effort).

    `description`, `ttl_seconds` and `max_runtime_seconds` are not
    acted on here; they are stored in ``status.json`` for later
    readers. Only the *keys* of `env_updates` are stored, not the
    values.

    Raises ValueError for an invalid `slug`. Errors from creating the
    run directory or starting the process propagate unchanged.

    Returns the new run ID.
    """
    # Validate first so a bad slug fails before anything touches disk.
    slug = _sanitize_slug(slug)
    if max_live is not None and \
            count_live(base_dir).get(slug, 0) >= max_live:
        raise TooManyLiveError(
            f'Slug {slug!r} already has {max_live} or more live runs.')
    run_id = generate_run_id(slug)
    run_dir = run_dir_for(run_id, base_dir)
    # No exist_ok here: an already existing directory for this run ID
    # raises instead of being silently reused.
    os.makedirs(run_dir)
    env = os.environ.copy()
    env.update(env_updates or {})
    # Assigned after the merge, so env_updates cannot override it.
    env[RUN_DIR_ENV_VAR] = run_dir
    with open(os.path.join(run_dir, 'stdout.log'), 'wb') as out, \
            open(os.path.join(run_dir, 'stderr.log'), 'wb') as err:
        proc = subprocess.Popen(
            cmd, stdin=subprocess.DEVNULL, stdout=out, stderr=err,
            env=env, start_new_session=True)
    # NOTE(review): status.json is written only after the child has
    # started, so there is a short window in which the run directory
    # exists without a status file. The Popen object is never waited on
    # in this function.
    status = {
        'run_id': run_id,
        'slug': slug,
        'state': 'running',
        'pid': proc.pid,
        # Stored with the PID so later liveness checks can compare it.
        'pid_create_time': psutil.Process(proc.pid).create_time(),
        'cmd': list(cmd),
        'description': description,
        'launch_time': _now_iso(),
        'observed_end_time': None,
        'ttl_seconds': ttl_seconds,
        'max_runtime_seconds': max_runtime_seconds,
        # Keys only: the values of env_updates are not persisted.
        'env_update_keys': sorted(env_updates or {}),
    }
    _atomic_write_json(os.path.join(run_dir, 'status.json'), status)
    return run_id
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def post_message(text: str, **extra_fields: Any) -> None:
    """Record a progress message from inside a launched process.

    One JSON object per line is appended to ``messages.jsonl`` in the
    run directory named by the environment. The object holds a
    timestamp under ``time``, the message under ``text``, and every
    keyword in `extra_fields` (all of which must be JSON-serializable;
    an extra field named ``time`` replaces the timestamp). The file is
    opened in append mode and no locking is used.

    Raises `OxProcError` when the process was not started through
    `launch()` (the run-directory variable is unset).
    """
    entry: dict[str, Any] = {
        'time': _now_iso(),
        'text': text,
        **extra_fields,
    }
    target = os.path.join(_current_run_dir(), 'messages.jsonl')
    with open(target, 'a', encoding='utf-8') as stream:
        stream.write(json.dumps(entry) + '\n')
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def write_result(result: Any) -> None:
    """Store the final result from inside a launched process.

    `result` can be any JSON-serializable value. It is written,
    together with a timestamp, to ``result.json`` in the run
    directory. Launched commands are expected to call this whenever
    they succeed, even with a trivial value: a process that has
    ended without a result file is reported as state "died".

    Raises `OxProcError` when the run-directory variable is unset.
    """
    target = os.path.join(_current_run_dir(), 'result.json')
    payload = {'time': _now_iso(), 'result': result}
    _atomic_write_json(target, payload)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def get_info(run_id: str, base_dir: Optional[str] = None,
             tail: int = 10) -> Optional[dict[str, Any]]:
    """Summarize the run `run_id`, or return None if it has no status.

    The returned dict contains every field of ``status.json`` plus:

    * ``elapsed_seconds``: launch to observed end (or to now while no
      end is recorded); None if the launch time cannot be parsed.
    * ``messages``: the last `tail` parseable message records.
    * ``stdout_tail`` / ``stderr_tail``: the last `tail` log lines.
    * ``result``: the content of ``result.json``, or None.

    Loading the status also reconciles it with reality: a run still
    marked as running whose process is gone is finalized on disk.
    """
    run_dir = run_dir_for(run_id, base_dir)
    status = _refresh_status(run_dir)
    if status is None:
        return None

    def in_run_dir(name: str) -> str:
        return os.path.join(run_dir, name)

    started = _parse_iso(status['launch_time'])
    ended = _parse_iso(status['observed_end_time'])
    if not ended:
        ended = datetime.datetime.now()
    if started:
        elapsed = (ended - started).total_seconds()
    else:
        elapsed = None

    # Lines that are not valid JSON are dropped rather than reported.
    parsed = (_read_json_line(raw)
              for raw in _tail_lines(in_run_dir('messages.jsonl'), tail))
    messages = [msg for msg in parsed if msg is not None]
    result = _read_json(in_run_dir('result.json'))

    summary = dict(status)
    summary['elapsed_seconds'] = elapsed
    summary['messages'] = messages
    summary['stdout_tail'] = _tail_lines(in_run_dir('stdout.log'), tail)
    summary['stderr_tail'] = _tail_lines(in_run_dir('stderr.log'), tail)
    summary['result'] = result
    return summary
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def count_live(base_dir: Optional[str] = None) -> dict[str, int]:
    """Count the runs that are still running, grouped by slug.

    Only slugs with at least one live run appear in the result, so a
    single slug is best queried with ``count_live().get(slug, 0)``.
    Each run's status is refreshed while counting, which finalizes
    runs whose process has ended.
    """
    tally: dict[str, int] = {}
    for run_dir in _iter_run_dirs(base_dir):
        status = _refresh_status(run_dir)
        if status is None or status['state'] != 'running':
            continue
        slug = status['slug']
        tally[slug] = tally.get(slug, 0) + 1
    return tally
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def kill_run(run_id: str, base_dir: Optional[str] = None) -> bool:
    """Send SIGTERM to the process group of `run_id` if it is alive.

    Returns True when the signal was sent; the run is then recorded
    as "killed". Returns False when the run is unknown, is no longer
    running, or the signal could not be delivered (no such process
    group, or no permission). The recorded PID is used as the process
    group ID, relying on the child having been started in its own
    session, so that grandchildren are signaled too.
    """
    run_dir = run_dir_for(run_id, base_dir)
    status = _refresh_status(run_dir)
    is_running = status is not None and status['state'] == 'running'
    if not is_running:
        return False
    try:
        os.killpg(status['pid'], 15)  # SIGTERM to the whole group
    except (ProcessLookupError, PermissionError):
        return False
    else:
        _finalize_status(run_dir, status, 'killed')
        return True
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def cleanup(base_dir: Optional[str] = None,
            ttl_override: Optional[float] = None) -> dict[str, list[str]]:
    """Perform periodic maintenance over all runs under `base_dir`.

    Live runs that have exceeded their per-run `max_runtime_seconds`
    are killed. Ended runs are deleted once their TTL (counted from
    observed end time, or `ttl_override` if given) has passed. Entries
    without a readable status, or without a recorded end, are deleted
    via a backstop instead.

    A run that is still alive is never deleted, whatever its age: it
    is either killed (over its runtime limit) or left untouched.
    Removing the directory of a live process would leave that process
    untracked and unable to report.

    Returns a dict with the run IDs that were ``'killed'`` and
    ``'deleted'`` during this pass.
    """
    now = datetime.datetime.now()
    killed: list[str] = []
    deleted: list[str] = []
    # Take the full listing up front: directories are removed while
    # looping.
    for run_dir in list(_iter_run_dirs(base_dir)):
        status = _refresh_status(run_dir)
        run_id = os.path.basename(run_dir)
        if status is not None and status['state'] == 'running':
            launch_time = _parse_iso(status['launch_time'])
            # A stored limit of None means "no runtime limit".
            limit = status.get('max_runtime_seconds',
                               DEFAULT_MAX_RUNTIME_SECONDS)
            over_limit = (
                launch_time is not None and limit is not None
                and (now - launch_time).total_seconds() > limit)
            if over_limit and kill_run(run_id, base_dir):
                killed.append(run_id)
            # Live (or just-killed) runs are not deletion candidates in
            # this pass; a killed run ages out through its TTL later.
            continue
        if _should_delete(status, run_dir, now, ttl_override):
            _remove_run_dir(run_dir)
            deleted.append(run_id)
    return {'killed': killed, 'deleted': deleted}
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# ----------------------------------------------------------------------
|
|
275
|
+
# Private helpers
|
|
276
|
+
# ----------------------------------------------------------------------
|
|
277
|
+
|
|
278
|
+
def _sanitize_slug(slug: str) -> str:
    """Return `slug` unchanged if it is acceptable, else raise ValueError.

    A slug is acceptable when it is non-empty and made up solely of
    the characters in `_SLUG_CHARS`.
    """
    if slug and _SLUG_CHARS.issuperset(slug):
        return slug
    raise ValueError(
        f'Invalid slug {slug!r}: use only lowercase letters, digits, '
        'underscore, and hyphen.')
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _slug_from_run_id(run_id: str) -> str:
|
|
288
|
+
"""Extract the slug portion from a run ID."""
|
|
289
|
+
parts = run_id.rsplit('-', 2)
|
|
290
|
+
if len(parts) != 3:
|
|
291
|
+
raise ValueError(f'Malformed run ID: {run_id!r}')
|
|
292
|
+
return parts[0]
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _atomic_write_json(path: str, data: Any) -> None:
|
|
296
|
+
"""Write `data` as JSON to `path` atomically (temp file + rename)."""
|
|
297
|
+
tmp_path = f'{path}.tmp.{os.getpid()}'
|
|
298
|
+
with open(tmp_path, 'w', encoding='utf-8') as handle:
|
|
299
|
+
json.dump(data, handle, indent=2)
|
|
300
|
+
os.replace(tmp_path, path)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _read_json(path: str) -> Any:
|
|
304
|
+
"""Return parsed JSON from `path`, or None if absent/unreadable."""
|
|
305
|
+
try:
|
|
306
|
+
with open(path, encoding='utf-8') as handle:
|
|
307
|
+
return json.load(handle)
|
|
308
|
+
except (OSError, ValueError):
|
|
309
|
+
return None
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _read_json_line(line: str) -> Any:
|
|
313
|
+
"""Parse one JSON line, returning None on failure."""
|
|
314
|
+
try:
|
|
315
|
+
return json.loads(line)
|
|
316
|
+
except ValueError:
|
|
317
|
+
return None
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _now_iso() -> str:
|
|
321
|
+
"""Return the current local time as an ISO-8601 string."""
|
|
322
|
+
return datetime.datetime.now().isoformat(timespec='seconds')
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _parse_iso(text: Optional[str]) -> Optional[datetime.datetime]:
|
|
326
|
+
"""Parse an ISO-8601 string into a datetime, or return None."""
|
|
327
|
+
try:
|
|
328
|
+
return datetime.datetime.fromisoformat(text) # type: ignore[arg-type]
|
|
329
|
+
except (TypeError, ValueError):
|
|
330
|
+
return None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _current_run_dir() -> str:
    """Look up this process's run directory in the environment.

    Raises `OxProcError` when the variable named by `RUN_DIR_ENV_VAR`
    is unset or empty.
    """
    run_dir = os.environ.get(RUN_DIR_ENV_VAR)
    if run_dir:
        return run_dir
    raise OxProcError(
        f'{RUN_DIR_ENV_VAR} is not set; was this process launched '
        'via ox_proc.launch()?')
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _is_alive(status: dict[str, Any]) -> bool:
    """Tell whether the process recorded in `status` is still running.

    The process counts as alive only if a process with the recorded
    PID exists, its creation time equals the recorded one (so a
    recycled PID is not taken for our process), and it is not a
    zombie.
    """
    try:
        candidate = psutil.Process(status['pid'])
        same_process = (
            candidate.create_time() == status['pid_create_time'])
        if not same_process:
            return False
        # A child that exited without being waited on by its launcher
        # stays around as a zombie; that counts as dead.
        is_zombie = candidate.status() == psutil.STATUS_ZOMBIE
    except psutil.NoSuchProcess:
        return False
    return not is_zombie
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _finalize_status(run_dir: str, status: dict[str, Any],
                     state: str) -> None:
    """Record that the run has ended in `state` and persist it.

    `status` is updated in place (new state, observed end time set
    to now) and then written to ``status.json`` in `run_dir`.
    Several observers may do this at about the same time; each
    writes almost the same content atomically, so whichever write
    lands last is fine.
    """
    status.update(state=state, observed_end_time=_now_iso())
    status_path = os.path.join(run_dir, 'status.json')
    _atomic_write_json(status_path, status)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _refresh_status(run_dir: str) -> Optional[dict[str, Any]]:
    """Read status.json and bring it in line with actual liveness.

    A run recorded as running whose process is gone is finalized on
    disk: as "finished" when ``result.json`` exists, otherwise as
    "died". Returns the status dict (updated if it was finalized),
    or None when `run_dir` has no readable status file.
    """
    status = _read_json(os.path.join(run_dir, 'status.json'))
    if status is None:
        return None
    if status['state'] != 'running' or _is_alive(status):
        return status  # nothing to reconcile
    result_path = os.path.join(run_dir, 'result.json')
    final_state = 'finished' if os.path.exists(result_path) else 'died'
    _finalize_status(run_dir, status, final_state)
    return status
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _tail_lines(path: str, num_lines: int,
|
|
389
|
+
max_bytes: int = 65536) -> list[str]:
|
|
390
|
+
"""Return up to the last `num_lines` lines of the file at `path`."""
|
|
391
|
+
try:
|
|
392
|
+
with open(path, 'rb') as handle:
|
|
393
|
+
handle.seek(0, os.SEEK_END)
|
|
394
|
+
size = handle.tell()
|
|
395
|
+
handle.seek(max(0, size - max_bytes))
|
|
396
|
+
data = handle.read()
|
|
397
|
+
except OSError:
|
|
398
|
+
return []
|
|
399
|
+
text = data.decode('utf-8', errors='replace')
|
|
400
|
+
return text.splitlines()[-num_lines:]
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def _iter_run_dirs(base_dir: Optional[str] = None) -> Iterator[str]:
|
|
404
|
+
"""Yield all run directory paths under `base_dir`."""
|
|
405
|
+
base_dir = base_dir or default_base_dir()
|
|
406
|
+
try:
|
|
407
|
+
slug_names = sorted(os.listdir(base_dir))
|
|
408
|
+
except OSError:
|
|
409
|
+
return
|
|
410
|
+
for slug_name in slug_names:
|
|
411
|
+
slug_dir = os.path.join(base_dir, slug_name)
|
|
412
|
+
try:
|
|
413
|
+
names = sorted(os.listdir(slug_dir))
|
|
414
|
+
except OSError:
|
|
415
|
+
continue
|
|
416
|
+
for name in names:
|
|
417
|
+
yield os.path.join(slug_dir, name)
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _remove_run_dir(run_dir: str) -> None:
|
|
421
|
+
"""Best-effort removal of a run directory and its contents."""
|
|
422
|
+
try:
|
|
423
|
+
for name in os.listdir(run_dir):
|
|
424
|
+
os.unlink(os.path.join(run_dir, name))
|
|
425
|
+
os.rmdir(run_dir)
|
|
426
|
+
except OSError:
|
|
427
|
+
pass
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def _should_delete(status: Optional[dict[str, Any]], run_dir: str,
|
|
431
|
+
now: datetime.datetime,
|
|
432
|
+
ttl_override: Optional[float]) -> bool:
|
|
433
|
+
"""Return True if a finished/orphaned run is past its retention."""
|
|
434
|
+
if status is None: # unreadable/orphan dir: use backstop via mtime
|
|
435
|
+
try:
|
|
436
|
+
age = now.timestamp() - os.path.getmtime(run_dir)
|
|
437
|
+
except OSError:
|
|
438
|
+
return False
|
|
439
|
+
return age > DEFAULT_MAX_AGE_SECONDS
|
|
440
|
+
ttl = ttl_override if ttl_override is not None \
|
|
441
|
+
else status.get('ttl_seconds', DEFAULT_TTL_SECONDS)
|
|
442
|
+
end_time = _parse_iso(status.get('observed_end_time'))
|
|
443
|
+
if end_time is not None:
|
|
444
|
+
return (now - end_time).total_seconds() > ttl
|
|
445
|
+
launch_time = _parse_iso(status.get('launch_time'))
|
|
446
|
+
if launch_time is not None: # end never observed: backstop from start
|
|
447
|
+
return (now - launch_time).total_seconds() > DEFAULT_MAX_AGE_SECONDS
|
|
448
|
+
return False
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ox_proc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Launch, monitor, and clean up detached external processes with on-disk status tracking (no broker or worker required).
|
|
5
|
+
Project-URL: Homepage, https://github.com/aocks/ox_proc
|
|
6
|
+
Author-email: Emin Martinian <emin.martinian@gmail.com>
|
|
7
|
+
License-Expression: BSD-2-Clause
|
|
8
|
+
Keywords: background,detached,job,status,subprocess
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Operating System :: POSIX
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: System :: Monitoring
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Requires-Dist: psutil>=5.9
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pycodestyle; extra == 'dev'
|
|
18
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# ox_proc
|
|
22
|
+
|
|
23
|
+
Launch, monitor, and clean up **detached** external processes with
|
|
24
|
+
simple on-disk status tracking — no message broker, no persistent
|
|
25
|
+
worker process.
|
|
26
|
+
|
|
27
|
+
`ox_proc` is aimed at the common case where a short-lived caller
|
|
28
|
+
(e.g., a Flask/gunicorn request handler) needs to kick off a
|
|
29
|
+
long-running command, return immediately, and let *any* later process
|
|
30
|
+
check progress, read recent output, fetch the final result, or kill
|
|
31
|
+
the run.
|
|
32
|
+
|
|
33
|
+
## How it works
|
|
34
|
+
|
|
35
|
+
Each launch gets a run ID like `myslug-20260611T143022-a1b2` and a
|
|
36
|
+
run directory:
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
<base_dir>/<slug>/<run_id>/
|
|
40
|
+
status.json written by the launcher; finalized by observers
|
|
41
|
+
messages.jsonl appended by the child via post_message()
|
|
42
|
+
result.json written by the child via write_result()
|
|
43
|
+
stdout.log child's stdout
|
|
44
|
+
stderr.log child's stderr
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
The child is launched in its own session (`setsid`), so it survives
|
|
48
|
+
the launcher's exit. Liveness checks verify both the PID and the
|
|
49
|
+
process creation time recorded at launch, so a reused PID is never
|
|
50
|
+
mistaken for the original process. No file locking is needed:
|
|
51
|
+
`status.json` and `result.json` use atomic write-and-rename, and
|
|
52
|
+
`messages.jsonl` is append-only.
|
|
53
|
+
|
|
54
|
+
## Quick start
|
|
55
|
+
|
|
56
|
+
Launcher side (e.g., in a web request handler):
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import ox_proc
|
|
60
|
+
|
|
61
|
+
run_id = ox_proc.launch(
|
|
62
|
+
["python", "-m", "myproject.analysis", "--full"],
|
|
63
|
+
slug="analysis",
|
|
64
|
+
description="Nightly full analysis",
|
|
65
|
+
env_updates={"ANALYSIS_MODE": "full"},
|
|
66
|
+
max_live=1, # raise TooManyLiveError if one is running
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Child side (inside the launched command):
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import ox_proc
|
|
74
|
+
|
|
75
|
+
ox_proc.post_message("loading data", progress=0.1)
|
|
76
|
+
...
|
|
77
|
+
ox_proc.write_result({"rows": 12345, "ok": True}) # always on success
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Status side (any process, any time):
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
info = ox_proc.get_info(run_id)
|
|
84
|
+
info["state"] # "running", "finished", "died", or "killed"
|
|
85
|
+
info["messages"] # recent post_message() records
|
|
86
|
+
info["stdout_tail"] # last lines of stdout
|
|
87
|
+
info["result"] # contents of result.json, or None
|
|
88
|
+
|
|
89
|
+
ox_proc.count_live() # {"analysis": 1, ...}
|
|
90
|
+
ox_proc.kill_run(run_id)
|
|
91
|
+
ox_proc.cleanup() # call periodically: kills over-runtime runs,
|
|
92
|
+
# deletes expired finished runs
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Conventions and caveats
|
|
96
|
+
|
|
97
|
+
* **Always call `write_result()` on success.** Because the child is
|
|
98
|
+
detached, its true exit code is lost; a dead process with no
|
|
99
|
+
`result.json` is reported as state `"died"`.
|
|
100
|
+
* End times are *observed*: recorded when an observer first notices
|
|
101
|
+
the process is gone. TTL-based deletion counts from that time, with
|
|
102
|
+
a backstop (default 24 h from launch) for runs whose end was never
|
|
103
|
+
observed.
|
|
104
|
+
* Live runs are killed by `cleanup()` once they exceed their per-run
|
|
105
|
+
`max_runtime_seconds` (default 8 h; pass `None` for unlimited).
|
|
106
|
+
* `kill_run()` sends SIGTERM to the whole process group; there is no
|
|
107
|
+
SIGKILL escalation.
|
|
108
|
+
* The default base directory is
|
|
109
|
+
`tempfile.gettempdir()/ox_proc-<username>`, which the OS may purge
|
|
110
|
+
(reboots, tmpfiles cleaning) — run history is not durable. Pass
|
|
111
|
+
`base_dir=` everywhere for a durable location.
|
|
112
|
+
* POSIX only (relies on sessions/process groups and atomic appends).
|
|
113
|
+
* The `max_live` limit is best-effort: two simultaneous launches can
|
|
114
|
+
race past it.
|
|
115
|
+
|
|
116
|
+
## Installation
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
pip install ox_proc
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Requires Python 3.9+ and `psutil`.
|
|
123
|
+
|
|
124
|
+
## Development
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
pip install -e ".[dev]"
|
|
128
|
+
pytest
|
|
129
|
+
```
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
ox_proc/__init__.py,sha256=AyEB73wmYMQgffYGaV3D8wvmPlMihXYfP8-b8g3hIyw,912
|
|
2
|
+
ox_proc/core.py,sha256=u6iu7A9gCbSxQhCSZw6sf2zUuN5BWYYLbeV0DS-noSE,16932
|
|
3
|
+
ox_proc-0.1.0.dist-info/METADATA,sha256=gJHfJMPc9cDnJ50sFydVC4uU5ZaCIQ1IEMbV8GogwLI,4200
|
|
4
|
+
ox_proc-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
5
|
+
ox_proc-0.1.0.dist-info/RECORD,,
|