ox-proc 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ox_proc-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,129 @@
1
+ Metadata-Version: 2.4
2
+ Name: ox_proc
3
+ Version: 0.1.0
4
+ Summary: Launch, monitor, and clean up detached external processes with on-disk status tracking (no broker or worker required).
5
+ Project-URL: Homepage, https://github.com/aocks/ox_proc
6
+ Author-email: Emin Martinian <emin.martinian@gmail.com>
7
+ License-Expression: BSD-2-Clause
8
+ Keywords: background,detached,job,status,subprocess
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Operating System :: POSIX
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: System :: Monitoring
14
+ Requires-Python: >=3.9
15
+ Requires-Dist: psutil>=5.9
16
+ Provides-Extra: dev
17
+ Requires-Dist: pycodestyle; extra == 'dev'
18
+ Requires-Dist: pytest; extra == 'dev'
19
+ Description-Content-Type: text/markdown
20
+
21
+ # ox_proc
22
+
23
+ Launch, monitor, and clean up **detached** external processes with
24
+ simple on-disk status tracking — no message broker, no persistent
25
+ worker process.
26
+
27
+ `ox_proc` is aimed at the common case where a short-lived caller
28
+ (e.g., a Flask/gunicorn request handler) needs to kick off a
29
+ long-running command, return immediately, and let *any* later process
30
+ check progress, read recent output, fetch the final result, or kill
31
+ the run.
32
+
33
+ ## How it works
34
+
35
+ Each launch gets a run ID like `myslug-20260611T143022-a1b2` and a
36
+ run directory:
37
+
38
+ ```
39
+ <base_dir>/<slug>/<run_id>/
40
+ status.json written by the launcher; finalized by observers
41
+ messages.jsonl appended by the child via post_message()
42
+ result.json written by the child via write_result()
43
+ stdout.log child's stdout
44
+ stderr.log child's stderr
45
+ ```
46
+
47
+ The child is launched in its own session (`setsid`), so it survives
48
+ the launcher's exit. Liveness checks verify both the PID and the
49
+ process creation time recorded at launch, so a reused PID is never
50
+ mistaken for the original process. No file locking is needed:
51
+ `status.json` and `result.json` use atomic write-and-rename, and
52
+ `messages.jsonl` is append-only.
53
+
54
+ ## Quick start
55
+
56
+ Launcher side (e.g., in a web request handler):
57
+
58
+ ```python
59
+ import ox_proc
60
+
61
+ run_id = ox_proc.launch(
62
+ ["python", "-m", "myproject.analysis", "--full"],
63
+ slug="analysis",
64
+ description="Nightly full analysis",
65
+ env_updates={"ANALYSIS_MODE": "full"},
66
+ max_live=1, # raise TooManyLiveError if one is running
67
+ )
68
+ ```
69
+
70
+ Child side (inside the launched command):
71
+
72
+ ```python
73
+ import ox_proc
74
+
75
+ ox_proc.post_message("loading data", progress=0.1)
76
+ ...
77
+ ox_proc.write_result({"rows": 12345, "ok": True}) # always on success
78
+ ```
79
+
80
+ Status side (any process, any time):
81
+
82
+ ```python
83
+ info = ox_proc.get_info(run_id)
84
+ info["state"] # "running", "finished", "died", or "killed"
85
+ info["messages"] # recent post_message() records
86
+ info["stdout_tail"] # last lines of stdout
87
+ info["result"] # {"time": ..., "result": <value>} from result.json, or None
88
+
89
+ ox_proc.count_live() # {"analysis": 1, ...}
90
+ ox_proc.kill_run(run_id)
91
+ ox_proc.cleanup() # call periodically: kills over-runtime runs,
92
+ # deletes expired finished runs
93
+ ```
94
+
95
+ ## Conventions and caveats
96
+
97
+ * **Always call `write_result()` on success.** Because the child is
98
+ detached, its true exit code is lost; a dead process with no
99
+ `result.json` is reported as state `"died"`.
100
+ * End times are *observed*: recorded when an observer first notices
101
+ the process is gone. TTL-based deletion counts from that time, with
102
+ a backstop (default 24 h from launch) for runs whose end was never
103
+ observed.
104
+ * Live runs are killed by `cleanup()` once they exceed their per-run
105
+ `max_runtime_seconds` (default 8 h; pass `None` for unlimited).
106
+ * `kill_run()` sends SIGTERM to the whole process group; there is no
107
+ SIGKILL escalation.
108
+ * The default base directory is
109
+ `tempfile.gettempdir()/ox_proc-<username>`, which the OS may purge
110
+ (reboots, tmpfiles cleaning) — run history is not durable. Pass
111
+ `base_dir=` everywhere for a durable location.
112
+ * POSIX only (relies on sessions/process groups and atomic appends).
113
+ * The `max_live` limit is best-effort: two simultaneous launches can
114
+ race past it.
115
+
116
+ ## Installation
117
+
118
+ ```
119
+ pip install ox_proc
120
+ ```
121
+
122
+ Requires Python 3.9+ and `psutil`.
123
+
124
+ ## Development
125
+
126
+ ```
127
+ pip install -e ".[dev]"
128
+ pytest
129
+ ```
@@ -0,0 +1,109 @@
1
+ # ox_proc
2
+
3
+ Launch, monitor, and clean up **detached** external processes with
4
+ simple on-disk status tracking — no message broker, no persistent
5
+ worker process.
6
+
7
+ `ox_proc` is aimed at the common case where a short-lived caller
8
+ (e.g., a Flask/gunicorn request handler) needs to kick off a
9
+ long-running command, return immediately, and let *any* later process
10
+ check progress, read recent output, fetch the final result, or kill
11
+ the run.
12
+
13
+ ## How it works
14
+
15
+ Each launch gets a run ID like `myslug-20260611T143022-a1b2` and a
16
+ run directory:
17
+
18
+ ```
19
+ <base_dir>/<slug>/<run_id>/
20
+ status.json written by the launcher; finalized by observers
21
+ messages.jsonl appended by the child via post_message()
22
+ result.json written by the child via write_result()
23
+ stdout.log child's stdout
24
+ stderr.log child's stderr
25
+ ```
26
+
27
+ The child is launched in its own session (`setsid`), so it survives
28
+ the launcher's exit. Liveness checks verify both the PID and the
29
+ process creation time recorded at launch, so a reused PID is never
30
+ mistaken for the original process. No file locking is needed:
31
+ `status.json` and `result.json` use atomic write-and-rename, and
32
+ `messages.jsonl` is append-only.
33
+
34
+ ## Quick start
35
+
36
+ Launcher side (e.g., in a web request handler):
37
+
38
+ ```python
39
+ import ox_proc
40
+
41
+ run_id = ox_proc.launch(
42
+ ["python", "-m", "myproject.analysis", "--full"],
43
+ slug="analysis",
44
+ description="Nightly full analysis",
45
+ env_updates={"ANALYSIS_MODE": "full"},
46
+ max_live=1, # raise TooManyLiveError if one is running
47
+ )
48
+ ```
49
+
50
+ Child side (inside the launched command):
51
+
52
+ ```python
53
+ import ox_proc
54
+
55
+ ox_proc.post_message("loading data", progress=0.1)
56
+ ...
57
+ ox_proc.write_result({"rows": 12345, "ok": True}) # always on success
58
+ ```
59
+
60
+ Status side (any process, any time):
61
+
62
+ ```python
63
+ info = ox_proc.get_info(run_id)
64
+ info["state"] # "running", "finished", "died", or "killed"
65
+ info["messages"] # recent post_message() records
66
+ info["stdout_tail"] # last lines of stdout
67
+ info["result"] # {"time": ..., "result": <value>} from result.json, or None
68
+
69
+ ox_proc.count_live() # {"analysis": 1, ...}
70
+ ox_proc.kill_run(run_id)
71
+ ox_proc.cleanup() # call periodically: kills over-runtime runs,
72
+ # deletes expired finished runs
73
+ ```
74
+
75
+ ## Conventions and caveats
76
+
77
+ * **Always call `write_result()` on success.** Because the child is
78
+ detached, its true exit code is lost; a dead process with no
79
+ `result.json` is reported as state `"died"`.
80
+ * End times are *observed*: recorded when an observer first notices
81
+ the process is gone. TTL-based deletion counts from that time, with
82
+ a backstop (default 24 h from launch) for runs whose end was never
83
+ observed.
84
+ * Live runs are killed by `cleanup()` once they exceed their per-run
85
+ `max_runtime_seconds` (default 8 h; pass `None` for unlimited).
86
+ * `kill_run()` sends SIGTERM to the whole process group; there is no
87
+ SIGKILL escalation.
88
+ * The default base directory is
89
+ `tempfile.gettempdir()/ox_proc-<username>`, which the OS may purge
90
+ (reboots, tmpfiles cleaning) — run history is not durable. Pass
91
+ `base_dir=` everywhere for a durable location.
92
+ * POSIX only (relies on sessions/process groups and atomic appends).
93
+ * The `max_live` limit is best-effort: two simultaneous launches can
94
+ race past it.
95
+
96
+ ## Installation
97
+
98
+ ```
99
+ pip install ox_proc
100
+ ```
101
+
102
+ Requires Python 3.9+ and `psutil`.
103
+
104
+ ## Development
105
+
106
+ ```
107
+ pip install -e ".[dev]"
108
+ pytest
109
+ ```
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "ox_proc"
7
+ version = "0.1.0"
8
+ description = "Launch, monitor, and clean up detached external processes with on-disk status tracking (no broker or worker required)."
9
+ readme = "README.md"
10
+ license = "BSD-2-Clause"
11
+ license-files = ["LICENSE"]
12
+ requires-python = ">=3.9"
13
+ authors = [
14
+ {name = "Emin Martinian", email = "emin.martinian@gmail.com"},
15
+ ]
16
+ keywords = ["subprocess", "background", "detached", "job", "status"]
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "Intended Audience :: Developers",
20
+ "Operating System :: POSIX",
21
+ "Programming Language :: Python :: 3",
22
+ "Topic :: System :: Monitoring",
23
+ ]
24
+ dependencies = [
25
+ "psutil>=5.9",
26
+ ]
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/aocks/ox_proc"
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest",
34
+ "pycodestyle",
35
+ ]
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["src/ox_proc"]
39
+
40
+ [tool.hatch.build.targets.sdist]
41
+ only-include = ["src", "tests", "README.md", "LICENSE", "pyproject.toml"]
@@ -0,0 +1,45 @@
1
+ """Launch, monitor, and clean up detached external processes.
2
+
3
+ See `ox_proc.core` for full documentation. The public API is
4
+ re-exported here so callers can simply ``import ox_proc``.
5
+ """
6
+
7
+ from ox_proc.core import (
8
+ DEFAULT_MAX_AGE_SECONDS,
9
+ DEFAULT_MAX_RUNTIME_SECONDS,
10
+ DEFAULT_TTL_SECONDS,
11
+ RUN_DIR_ENV_VAR,
12
+ OxProcError,
13
+ TooManyLiveError,
14
+ cleanup,
15
+ count_live,
16
+ default_base_dir,
17
+ generate_run_id,
18
+ get_info,
19
+ kill_run,
20
+ launch,
21
+ post_message,
22
+ run_dir_for,
23
+ write_result,
24
+ )
25
+
26
+ __version__ = '0.1.0'
27
+
28
+ __all__ = [
29
+ 'DEFAULT_MAX_AGE_SECONDS',
30
+ 'DEFAULT_MAX_RUNTIME_SECONDS',
31
+ 'DEFAULT_TTL_SECONDS',
32
+ 'RUN_DIR_ENV_VAR',
33
+ 'OxProcError',
34
+ 'TooManyLiveError',
35
+ 'cleanup',
36
+ 'count_live',
37
+ 'default_base_dir',
38
+ 'generate_run_id',
39
+ 'get_info',
40
+ 'kill_run',
41
+ 'launch',
42
+ 'post_message',
43
+ 'run_dir_for',
44
+ 'write_result',
45
+ ]
@@ -0,0 +1,448 @@
1
+ """Launch, monitor, and clean up detached external processes.
2
+
3
+ The `ox_proc` package lets a short-lived caller (e.g., a web request
4
+ handler) launch a long-running command as a *detached* subprocess and
5
+ later inspect its progress from any other process, with no broker or
6
+ persistent worker required. All state lives on disk in a per-run
7
+ directory:
8
+
9
+ <base_dir>/<slug>/<run_id>/
10
+ status.json written by launcher; finalized by observers
11
+ messages.jsonl appended by the child via `post_message()`
12
+ result.json written by the child via `write_result()`
13
+ stdout.log child's stdout
14
+ stderr.log child's stderr
15
+
16
+ Conventions and caveats:
17
+
18
+ * The launched command should call `write_result()` on success. A
19
+ dead process with no ``result.json`` is reported as state "died".
20
+ * Because the child is detached, its true exit code and end time are
21
+ lost; observers record an *observed* end time when they first notice
22
+ the process is gone. TTL-based cleanup counts from that time.
23
+ * The default base directory lives under `tempfile.gettempdir()`, so
24
+ the operating system may purge it (e.g., on reboot or via tmpfiles
25
+ cleaning); run history is therefore not durable.
26
+ * Liveness checks verify both PID existence and the process creation
27
+ time recorded at launch, guarding against PID reuse.
28
+
29
+ This module is organized with the public API first and private
30
+ helpers (named with a leading underscore) gathered at the end.
31
+
32
+ Requires the `psutil` package.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import datetime
38
+ import getpass
39
+ import json
40
+ import os
41
+ import secrets
42
+ import subprocess
43
+ import tempfile
44
+ from typing import Any, Iterator, Optional, Sequence
45
+
46
+ import psutil
47
+
48
+ DEFAULT_TTL_SECONDS = 24 * 3600 # keep finished runs this long
49
+ DEFAULT_MAX_RUNTIME_SECONDS = 8 * 3600 # kill live runs after this
50
+ DEFAULT_MAX_AGE_SECONDS = 24 * 3600 # deletion backstop from launch
51
+ RUN_DIR_ENV_VAR = 'OX_PROC_RUN_DIR'
52
+
53
+ _SLUG_CHARS = set('abcdefghijklmnopqrstuvwxyz0123456789_-')
54
+
55
+
56
class OxProcError(Exception):
    """Common ancestor of the exceptions defined by this package."""
58
+
59
+
60
class TooManyLiveError(OxProcError):
    """Signals that launching would exceed a slug's live-run limit."""
62
+
63
+
64
+ # ----------------------------------------------------------------------
65
+ # Public API
66
+ # ----------------------------------------------------------------------
67
+
68
def default_base_dir() -> str:
    """Return the default base directory for run tracking.

    The path is ``<tempfile.gettempdir()>/ox_proc-<user>``. The user
    name is included to avoid collisions between users sharing the
    system temporary directory.

    `getpass.getuser()` fails when none of the usual environment
    variables are set and the current UID has no password-database
    entry (common in containers run with an arbitrary UID). In that
    case the numeric UID is used instead so callers still get a
    stable, per-user directory rather than an exception.
    """
    try:
        user = getpass.getuser()
    except (KeyError, OSError):
        # KeyError: Python < 3.13; OSError: Python >= 3.13.
        user = f'uid{os.getuid()}'
    return os.path.join(tempfile.gettempdir(), f'ox_proc-{user}')
76
+
77
+
78
def generate_run_id(slug: str) -> str:
    """Build a fresh run ID shaped like ``<slug>-<timestamp>-<rand>``.

    The timestamp is the current local time formatted as
    ``YYYYmmddTHHMMSS`` and the random part is four hex digits. The
    slug is validated first, so an invalid slug raises `ValueError`.
    """
    moment = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
    checked_slug = _sanitize_slug(slug)
    random_part = secrets.token_hex(2)
    return '-'.join((checked_slug, moment, random_part))
82
+
83
+
84
def run_dir_for(run_id: str, base_dir: Optional[str] = None) -> str:
    """Return the directory that holds the files of run `run_id`.

    The layout is ``<base_dir>/<slug>/<run_id>``, where the slug is
    taken from the run ID itself. A falsy `base_dir` selects the
    default base directory.
    """
    root = base_dir if base_dir else default_base_dir()
    slug = _slug_from_run_id(run_id)
    return os.path.join(root, slug, run_id)
88
+
89
+
90
def launch(cmd: Sequence[str], slug: str,
           description: Optional[str] = None,
           env_updates: Optional[dict[str, str]] = None,
           base_dir: Optional[str] = None,
           ttl_seconds: float = DEFAULT_TTL_SECONDS,
           max_runtime_seconds: Optional[float] =
           DEFAULT_MAX_RUNTIME_SECONDS,
           max_live: Optional[int] = None) -> str:
    """Launch `cmd` (a list of strings) as a detached subprocess.

    The child is placed in its own session so it survives the caller's
    exit; its stdout/stderr go to log files in the run directory, and
    the environment variable named by `RUN_DIR_ENV_VAR` points the
    child at that directory so it can use `post_message()` and
    `write_result()`.

    `env_updates`, if given, is merged into a copy of the caller's
    environment. If `max_live` is given and at least that many runs
    with this slug are still alive, `TooManyLiveError` is raised (a
    small launch race is accepted; the limit is best-effort).

    If the command cannot be started (for example the executable does
    not exist), the freshly created run directory is removed again
    before the original exception propagates, so failed launches do
    not leave status-less directories behind.

    Returns the new run ID. Raises `ValueError` for an invalid slug.
    """
    slug = _sanitize_slug(slug)
    if max_live is not None and \
            count_live(base_dir).get(slug, 0) >= max_live:
        raise TooManyLiveError(
            f'Slug {slug!r} already has {max_live} or more live runs.')
    run_id = generate_run_id(slug)
    run_dir = run_dir_for(run_id, base_dir)
    os.makedirs(run_dir)
    env = os.environ.copy()
    env.update(env_updates or {})
    env[RUN_DIR_ENV_VAR] = run_dir
    try:
        with open(os.path.join(run_dir, 'stdout.log'), 'wb') as out, \
                open(os.path.join(run_dir, 'stderr.log'), 'wb') as err:
            proc = subprocess.Popen(
                cmd, stdin=subprocess.DEVNULL, stdout=out, stderr=err,
                env=env, start_new_session=True)
    except Exception:
        # Nothing was started: do not leave an orphan directory that
        # only the age-based backstop would eventually clean up.
        _remove_run_dir(run_dir)
        raise
    status = {
        'run_id': run_id,
        'slug': slug,
        'state': 'running',
        'pid': proc.pid,
        # Stored next to the PID so liveness checks can detect PID reuse.
        'pid_create_time': psutil.Process(proc.pid).create_time(),
        'cmd': list(cmd),
        'description': description,
        'launch_time': _now_iso(),
        'observed_end_time': None,
        'ttl_seconds': ttl_seconds,
        'max_runtime_seconds': max_runtime_seconds,
        # Only the names are recorded, never the values.
        'env_update_keys': sorted(env_updates or {}),
    }
    _atomic_write_json(os.path.join(run_dir, 'status.json'), status)
    return run_id
145
+
146
+
147
def post_message(text: str, **extra_fields: Any) -> None:
    """Record a progress message from inside a launched process.

    One JSON object per line is appended to ``messages.jsonl`` in the
    run directory named by the environment. The object holds the
    current time, `text`, and every entry of `extra_fields` (which
    must be JSON-serializable).
    """
    entry: dict[str, Any] = {
        'time': _now_iso(),
        'text': text,
        **extra_fields,
    }
    target = os.path.join(_current_run_dir(), 'messages.jsonl')
    with open(target, 'a', encoding='utf-8') as sink:
        sink.write(json.dumps(entry) + '\n')
160
+
161
+
162
def write_result(result: Any) -> None:
    """Store the final result from inside a launched process.

    `result` (any JSON-serializable value) is wrapped together with
    the current time as ``{"time": ..., "result": ...}`` and written
    atomically to ``result.json`` in the run directory. Launched
    commands are expected to call this on success: a dead process
    without a result file is reported as state "died".
    """
    target = os.path.join(_current_run_dir(), 'result.json')
    payload = {'time': _now_iso(), 'result': result}
    _atomic_write_json(target, payload)
172
+
173
+
174
def get_info(run_id: str, base_dir: Optional[str] = None,
             tail: int = 10) -> Optional[dict[str, Any]]:
    """Summarize run `run_id`; return None when it has no status file.

    The returned dict contains every status field plus:

    * ``elapsed_seconds`` - time from launch to the observed end (or
      to now when no end is recorded); None if the launch time cannot
      be parsed,
    * ``messages`` - the parseable entries among the last `tail`
      lines of ``messages.jsonl``,
    * ``stdout_tail`` / ``stderr_tail`` - the last `tail` log lines,
    * ``result`` - the parsed ``result.json`` or None.

    Reading the status reconciles it with reality, so a run whose
    process has gone is marked as ended on disk as a side effect.
    """
    run_dir = run_dir_for(run_id, base_dir)
    status = _refresh_status(run_dir)
    if status is None:
        return None

    started = _parse_iso(status['launch_time'])
    ended = _parse_iso(status['observed_end_time'])
    if ended is None:
        ended = datetime.datetime.now()
    if started is None:
        elapsed = None
    else:
        elapsed = (ended - started).total_seconds()

    raw_messages = _tail_lines(os.path.join(run_dir, 'messages.jsonl'), tail)
    parsed_messages = [_read_json_line(raw) for raw in raw_messages]
    result = _read_json(os.path.join(run_dir, 'result.json'))

    summary: dict[str, Any] = dict(status)
    summary['elapsed_seconds'] = elapsed
    summary['messages'] = [
        entry for entry in parsed_messages if entry is not None]
    summary['stdout_tail'] = _tail_lines(
        os.path.join(run_dir, 'stdout.log'), tail)
    summary['stderr_tail'] = _tail_lines(
        os.path.join(run_dir, 'stderr.log'), tail)
    summary['result'] = result
    return summary
205
+
206
+
207
def count_live(base_dir: Optional[str] = None) -> dict[str, int]:
    """Count the runs that are still running, grouped by slug.

    Only slugs with at least one live run appear in the result; use
    ``count_live().get(slug, 0)`` to query a single slug. Each run's
    status is refreshed while counting.
    """
    tally: dict[str, int] = {}
    for run_dir in _iter_run_dirs(base_dir):
        status = _refresh_status(run_dir)
        if status is None or status['state'] != 'running':
            continue
        key = status['slug']
        tally[key] = 1 + tally.get(key, 0)
    return tally
220
+
221
+
222
def kill_run(run_id: str, base_dir: Optional[str] = None) -> bool:
    """Kill the process group of `run_id` if it is still alive.

    Returns True if SIGTERM was sent, False if the run was already
    dead or unknown, or if the signal could not be delivered (no such
    process group, or no permission). The whole process group is
    signaled (the child was launched in its own session), so
    grandchildren are included.

    On success the status file is marked "killed" right away; the
    function does not wait for the processes to exit.
    """
    import signal  # local import keeps the module import block untouched

    run_dir = run_dir_for(run_id, base_dir)
    status = _refresh_status(run_dir)
    if status is None or status['state'] != 'running':
        return False
    try:
        # The recorded PID is passed as the process-group ID.
        os.killpg(status['pid'], signal.SIGTERM)
    except (ProcessLookupError, PermissionError):
        return False
    _finalize_status(run_dir, status, 'killed')
    return True
240
+
241
+
242
def cleanup(base_dir: Optional[str] = None,
            ttl_override: Optional[float] = None) -> dict[str, list[str]]:
    """Perform periodic maintenance over all runs under `base_dir`.

    Live runs that have exceeded their per-run `max_runtime_seconds`
    are killed. Ended runs are deleted once their TTL (counted from
    observed end time, or `ttl_override` if given) has passed.
    Directories without a readable status file are deleted via an
    age backstop.

    A run that is still alive is never deleted here, whatever its
    age: removing the directory of a running process would not stop
    the process, only make it untrackable. This matters for runs
    launched with ``max_runtime_seconds=None`` or with a limit longer
    than the deletion backstop.

    Returns a dict with the lists of ``killed`` and ``deleted`` run
    IDs.
    """
    now = datetime.datetime.now()
    killed: list[str] = []
    deleted: list[str] = []
    # Snapshot the directory list first because entries are removed
    # while looping.
    for run_dir in list(_iter_run_dirs(base_dir)):
        status = _refresh_status(run_dir)
        run_id = os.path.basename(run_dir)
        if status is not None and status['state'] == 'running':
            launch_time = _parse_iso(status['launch_time'])
            limit = status.get('max_runtime_seconds',
                               DEFAULT_MAX_RUNTIME_SECONDS)
            over_limit = (
                launch_time is not None and limit is not None
                and (now - launch_time).total_seconds() > limit)
            if over_limit and kill_run(run_id, base_dir):
                killed.append(run_id)
            # Live (or just-killed) runs are left for a later pass.
            continue
        if _should_delete(status, run_dir, now, ttl_override):
            _remove_run_dir(run_dir)
            deleted.append(run_id)
    return {'killed': killed, 'deleted': deleted}
272
+
273
+
274
+ # ----------------------------------------------------------------------
275
+ # Private helpers
276
+ # ----------------------------------------------------------------------
277
+
278
def _sanitize_slug(slug: str) -> str:
    """Return `slug` as-is when valid; raise ValueError otherwise.

    A valid slug is non-empty and made up only of the characters in
    `_SLUG_CHARS`.
    """
    if slug and set(slug).issubset(_SLUG_CHARS):
        return slug
    raise ValueError(
        f'Invalid slug {slug!r}: use only lowercase letters, digits, '
        'underscore, and hyphen.')
285
+
286
+
287
def _slug_from_run_id(run_id: str) -> str:
    """Extract the slug portion from a run ID.

    A run ID has the shape ``<slug>-<timestamp>-<rand>``; the slug
    itself may contain hyphens, so the ID is split from the right.

    Both the slug and the run ID are later joined into a filesystem
    path, and run IDs often come from outside the process (e.g. a web
    request). IDs with an empty part, a path separator, a NUL byte,
    or a ``.``/``..`` slug are therefore rejected so they cannot point
    outside the base directory.

    Raises ValueError for any malformed run ID.
    """
    parts = run_id.rsplit('-', 2)
    malformed = (
        len(parts) != 3
        or not all(parts)
        or any(ch in run_id for ch in ('/', '\\', '\0'))
        or parts[0] in ('.', '..')
    )
    if malformed:
        raise ValueError(f'Malformed run ID: {run_id!r}')
    return parts[0]
293
+
294
+
295
def _atomic_write_json(path: str, data: Any) -> None:
    """Write `data` as JSON to `path` atomically (temp file + rename).

    The data is first written to a sibling temporary file and then
    moved over `path` with `os.replace`, so readers see either the
    old or the new content. The temporary name includes the process
    ID and the thread ID so concurrent writers in one process do not
    share a temp file.

    If serialization or the rename fails, the temporary file is
    removed and the exception is re-raised; `path` is left untouched.
    """
    import threading  # local import keeps the module import block untouched

    tmp_path = f'{path}.tmp.{os.getpid()}.{threading.get_ident()}'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as handle:
            json.dump(data, handle, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a partial temp file behind in the run directory.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
301
+
302
+
303
def _read_json(path: str) -> Any:
    """Load the JSON document stored at `path`.

    Returns None when the file cannot be opened or read, or when its
    content is not valid JSON.
    """
    try:
        handle = open(path, encoding='utf-8')
    except OSError:
        return None
    with handle:
        try:
            document = json.load(handle)
        except (OSError, ValueError):
            return None
    return document
310
+
311
+
312
def _read_json_line(line: str) -> Any:
    """Decode a single line of JSON; give back None if it is invalid."""
    try:
        decoded = json.loads(line)
    except ValueError:
        decoded = None
    return decoded
318
+
319
+
320
def _now_iso() -> str:
    """Format the current local time as ISO-8601 with whole seconds."""
    current = datetime.datetime.now()
    return current.replace(microsecond=0).isoformat()
323
+
324
+
325
def _parse_iso(text: Optional[str]) -> Optional[datetime.datetime]:
    """Turn an ISO-8601 string into a datetime.

    Returns None when `text` is not a string (for example None) or
    cannot be parsed.
    """
    try:
        moment = datetime.datetime.fromisoformat(
            text)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return None
    return moment
331
+
332
+
333
def _current_run_dir() -> str:
    """Look up the run directory of the current process.

    The directory is read from the environment variable named by
    `RUN_DIR_ENV_VAR`. Raises `OxProcError` if that variable is
    missing or empty.
    """
    run_dir = os.environ.get(RUN_DIR_ENV_VAR, '')
    if run_dir:
        return run_dir
    raise OxProcError(
        f'{RUN_DIR_ENV_VAR} is not set; was this process launched '
        'via ox_proc.launch()?')
341
+
342
+
343
def _is_alive(status: dict[str, Any]) -> bool:
    """Tell whether the process recorded in `status` is still running.

    The process must exist under the recorded PID, have the creation
    time recorded in `status` (so a recycled PID does not count), and
    must not be a zombie.
    """
    try:
        candidate = psutil.Process(status['pid'])
        same_process = (
            candidate.create_time() == status['pid_create_time'])
        if not same_process:
            return False
        # A child that exited without being waited on stays around
        # as a zombie; that counts as dead.
        is_zombie = candidate.status() == psutil.STATUS_ZOMBIE
    except psutil.NoSuchProcess:
        return False
    return not is_zombie
358
+
359
+
360
def _finalize_status(run_dir: str, status: dict[str, Any],
                     state: str) -> None:
    """Record in `status` that the run ended and persist it.

    `status` is modified in place: its state becomes `state` and its
    observed end time becomes the current time. The result is written
    atomically to ``status.json`` in `run_dir`, so when several
    observers do this concurrently the last writer wins.
    """
    status['state'] = state
    status['observed_end_time'] = _now_iso()
    target = os.path.join(run_dir, 'status.json')
    _atomic_write_json(target, status)
370
+
371
+
372
def _refresh_status(run_dir: str) -> Optional[dict[str, Any]]:
    """Read ``status.json`` and bring it in line with reality.

    A run recorded as "running" whose process is no longer alive is
    finalized on disk: as "finished" when ``result.json`` exists,
    otherwise as "died". Returns the (possibly updated) status dict,
    or None when no status file can be read.
    """
    status = _read_json(os.path.join(run_dir, 'status.json'))
    if status is None:
        return None
    if status['state'] != 'running' or _is_alive(status):
        return status
    result_path = os.path.join(run_dir, 'result.json')
    outcome = 'finished' if os.path.exists(result_path) else 'died'
    _finalize_status(run_dir, status, outcome)
    return status
386
+
387
+
388
+ def _tail_lines(path: str, num_lines: int,
389
+ max_bytes: int = 65536) -> list[str]:
390
+ """Return up to the last `num_lines` lines of the file at `path`."""
391
+ try:
392
+ with open(path, 'rb') as handle:
393
+ handle.seek(0, os.SEEK_END)
394
+ size = handle.tell()
395
+ handle.seek(max(0, size - max_bytes))
396
+ data = handle.read()
397
+ except OSError:
398
+ return []
399
+ text = data.decode('utf-8', errors='replace')
400
+ return text.splitlines()[-num_lines:]
401
+
402
+
403
def _iter_run_dirs(base_dir: Optional[str] = None) -> Iterator[str]:
    """Generate the path of every entry two levels below `base_dir`.

    The first level holds one directory per slug and the second level
    one entry per run; both levels are visited in sorted order. A
    base directory or slug entry that cannot be listed is skipped
    silently. A falsy `base_dir` selects the default base directory.
    """
    root = base_dir or default_base_dir()
    try:
        slug_names = os.listdir(root)
    except OSError:
        return
    slug_names.sort()
    for slug_name in slug_names:
        slug_dir = os.path.join(root, slug_name)
        try:
            entries = os.listdir(slug_dir)
        except OSError:
            continue
        yield from (os.path.join(slug_dir, entry)
                    for entry in sorted(entries))
418
+
419
+
420
def _remove_run_dir(run_dir: str) -> None:
    """Best-effort removal of a run directory and its contents.

    The whole tree is removed, including any subdirectories the
    launched command may have created inside its run directory.
    Errors (missing directory, permission problems, ...) are ignored
    so one bad run cannot abort a maintenance pass.
    """
    import shutil  # local import keeps the module import block untouched

    shutil.rmtree(run_dir, ignore_errors=True)
428
+
429
+
430
def _should_delete(status: Optional[dict[str, Any]], run_dir: str,
                   now: datetime.datetime,
                   ttl_override: Optional[float]) -> bool:
    """Return True if a finished/orphaned run is past its retention.

    * No readable status (`status` is None): the directory's
      modification time is compared with `DEFAULT_MAX_AGE_SECONDS`.
    * State still "running": never deleted. The age backstop below is
      meant for runs without a recorded end, not for live processes.
    * Observed end time present: deleted once the TTL has elapsed
      since then. The TTL is `ttl_override` if given, else the run's
      own ``ttl_seconds``, else `DEFAULT_TTL_SECONDS`.
    * No usable end time: deleted once `DEFAULT_MAX_AGE_SECONDS` have
      passed since launch.
    """
    if status is None:  # unreadable/orphan dir: use backstop via mtime
        try:
            age = now.timestamp() - os.path.getmtime(run_dir)
        except OSError:
            return False
        return age > DEFAULT_MAX_AGE_SECONDS
    if status.get('state') == 'running':
        # Deleting the files of a live run would orphan its process.
        return False
    ttl = ttl_override if ttl_override is not None \
        else status.get('ttl_seconds', DEFAULT_TTL_SECONDS)
    end_time = _parse_iso(status.get('observed_end_time'))
    if end_time is not None:
        return (now - end_time).total_seconds() > ttl
    launch_time = _parse_iso(status.get('launch_time'))
    if launch_time is not None:  # end never observed: backstop from start
        return (now - launch_time).total_seconds() > DEFAULT_MAX_AGE_SECONDS
    return False
@@ -0,0 +1,98 @@
1
+ """Smoke tests for ox_proc using short-lived child scripts."""
2
+
3
+ import sys
4
+ import textwrap
5
+ import time
6
+
7
+ import pytest
8
+
9
+ import ox_proc
10
+
11
+ CHILD_OK = textwrap.dedent("""
12
+ import ox_proc
13
+ ox_proc.post_message('step 1', progress=0.5)
14
+ print('hello stdout')
15
+ ox_proc.write_result({'answer': 42})
16
+ """)
17
+
18
+ CHILD_CRASH = textwrap.dedent("""
19
+ import ox_proc
20
+ ox_proc.post_message('about to crash')
21
+ raise SystemExit(3)
22
+ """)
23
+
24
+
25
def _launch_snippet(snippet, slug, base_dir, **kwargs):
    """Run `snippet` with ``<python> -c`` through ox_proc; return the run ID."""
    command = [sys.executable, '-c', snippet]
    return ox_proc.launch(command, slug, base_dir=base_dir, **kwargs)
28
+
29
+
30
def _wait_for_end(run_id, base_dir, timeout=10.0):
    """Poll `run_id` until it is no longer "running"; return its info.

    Raises AssertionError if the run is unknown or still running
    after `timeout` seconds. The deadline uses the monotonic clock so
    wall-clock adjustments cannot shorten or stretch the wait.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        info = ox_proc.get_info(run_id, base_dir)
        # get_info() returns None for an unknown run; fail clearly
        # instead of with a TypeError on the subscript below.
        assert info is not None, f'Run {run_id} is unknown'
        if info['state'] != 'running':
            return info
        time.sleep(0.1)
    raise AssertionError(f'Run {run_id} still running after {timeout}s')
38
+
39
+
40
def test_successful_run(tmp_path):
    """A child that writes a result ends as "finished" with its output kept."""
    base_dir = str(tmp_path)
    run_id = _launch_snippet(CHILD_OK, 'ok', base_dir, description='demo')
    summary = _wait_for_end(run_id, base_dir)
    assert summary['state'] == 'finished'
    stored = summary['result']
    assert stored['result'] == {'answer': 42}
    last_message = summary['messages'][-1]
    assert last_message['text'] == 'step 1'
    assert 'hello stdout' in summary['stdout_tail']
    assert summary['description'] == 'demo'
    assert summary['elapsed_seconds'] >= 0
50
+
51
+
52
def test_crashed_run_reports_died(tmp_path):
    """A child that exits without writing a result is reported as "died"."""
    base_dir = str(tmp_path)
    summary = _wait_for_end(
        _launch_snippet(CHILD_CRASH, 'crash', base_dir), base_dir)
    assert summary['state'] == 'died'
    assert summary['result'] is None
58
+
59
+
60
def test_kill_and_count_live(tmp_path):
    """A live run is counted, can be killed, and then stops counting."""
    base_dir = str(tmp_path)
    sleeper = ox_proc.launch(['sleep', '60'], 'sleepy', base_dir=base_dir)
    live_before = ox_proc.count_live(base_dir)
    assert live_before == {'sleepy': 1}
    assert ox_proc.kill_run(sleeper, base_dir)
    summary = _wait_for_end(sleeper, base_dir)
    assert summary['state'] == 'killed'
    live_after = ox_proc.count_live(base_dir)
    assert live_after == {}
68
+
69
+
70
def test_max_live_limit(tmp_path):
    """A second launch is refused while `max_live` runs are alive.

    Every run started here is killed in a ``finally`` block, so a
    failing assertion does not leave ``sleep 60`` processes behind.
    """
    base = str(tmp_path)
    started = [ox_proc.launch(['sleep', '60'], 'busy', base_dir=base)]
    try:
        with pytest.raises(ox_proc.TooManyLiveError):
            # Only reached past the call if the limit was NOT enforced;
            # remember the run so it is cleaned up as well.
            started.append(ox_proc.launch(
                ['sleep', '60'], 'busy', base_dir=base, max_live=1))
    finally:
        for run_id in started:
            ox_proc.kill_run(run_id, base)
76
+
77
+
78
def test_cleanup_deletes_finished(tmp_path):
    """With a zero TTL, cleanup removes a run that has already ended."""
    base_dir = str(tmp_path)
    run_id = _launch_snippet(CHILD_OK, 'gone', base_dir)
    _wait_for_end(run_id, base_dir)
    report = ox_proc.cleanup(base_dir, ttl_override=0)
    assert run_id in report['deleted']
    assert ox_proc.get_info(run_id, base_dir) is None
85
+
86
+
87
def test_cleanup_kills_over_runtime(tmp_path):
    """Cleanup kills a live run that has exceeded its runtime limit."""
    base_dir = str(tmp_path)
    run_id = ox_proc.launch(
        ['sleep', '60'], 'long', base_dir=base_dir, max_runtime_seconds=0)
    # Let a little time pass so the elapsed time is above the limit.
    time.sleep(0.2)
    report = ox_proc.cleanup(base_dir)
    assert run_id in report['killed']
94
+
95
+
96
def test_invalid_slug_rejected(tmp_path):
    """Launching with a slug containing invalid characters raises ValueError."""
    base_dir = str(tmp_path)
    with pytest.raises(ValueError):
        ox_proc.launch(['true'], 'Bad Slug!', base_dir=base_dir)