cc-plugin-codex 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,561 @@
1
+ """Detached background jobs for long Claude reviews.
2
+
3
+ This server drives a one-shot ``claude -p --output-format json`` call, so a job's
4
+ terminal output is a single JSON envelope written to ``result.json`` — completion
5
+ is "the process exited and the envelope is present", with NO interactive-log or
6
+ TUI scraping. That makes background mode far simpler and more robust here than in
7
+ a harness that tails an interactive CLI.
8
+
9
+ State lives on disk (keyed by workspace), so status/result/cancel keep working
10
+ across MCP server restarts. There is no daemon: single-job lifecycle calls refresh
11
+ and TTL-clean the requested job, list calls clean the workspace, and the count cap
12
+ is enforced when jobs start. ``--max-budget-usd`` still applies its best-effort
13
+ spend stop threshold (not a hard cap) even for a job nobody polls.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import contextlib
19
+ import hashlib
20
+ import json
21
+ import os
22
+ import signal
23
+ import subprocess
24
+ import threading
25
+ import time
26
+ from dataclasses import dataclass
27
+ from datetime import UTC, datetime
28
+ from pathlib import Path
29
+ from typing import cast
30
+ from uuid import uuid4
31
+
32
+ from cc_plugin_codex.claude import contract_changed_error
33
+ from cc_plugin_codex.cli_contract import is_contract_drift
34
+ from cc_plugin_codex.normalize import apply_cost_usage, normalize_envelope
35
+ from cc_plugin_codex.schemas import (
36
+ FINGERPRINT,
37
+ ContextSummary,
38
+ ErrorCode,
39
+ ErrorInfo,
40
+ ErrorResult,
41
+ Meta,
42
+ workspace_warning_for,
43
+ )
44
+
45
+ STATE_ENV = "CC_PLUGIN_CODEX_STATE_DIR"
46
+ TTL_ENV = "CC_PLUGIN_CODEX_JOB_TTL"
47
+ MAX_SECONDS_ENV = "CC_PLUGIN_CODEX_JOB_MAX_SECONDS"
48
+ MAX_COUNT_ENV = "CC_PLUGIN_CODEX_JOB_MAX_COUNT"
49
+
50
+ DEFAULT_TTL = 86_400 # delete terminal job records after 24h
51
+ DEFAULT_MAX_SECONDS = 1_800 # wall-clock cap; a poll past this reaps the job
52
+ DEFAULT_MAX_COUNT = 50 # retained jobs per workspace; evict oldest terminal
53
+
54
+ _TERMINAL = {"done", "failed", "cancelled", "timeout"}
55
+ _JOBS_LOCK = threading.RLock()
56
+
57
+
58
+ def _int_env(name: str, default: int) -> int:
59
+ try:
60
+ return int(os.environ.get(name, ""))
61
+ except (TypeError, ValueError):
62
+ return default
63
+
64
+
65
def max_seconds() -> int:
    """Wall-clock cap for a job in seconds: the env override, else the default."""
    limit = _int_env(MAX_SECONDS_ENV, DEFAULT_MAX_SECONDS)
    return limit
67
+
68
+
69
def ttl_seconds() -> int:
    """Retention window for terminal records in seconds: env override, else default."""
    retention = _int_env(TTL_ENV, DEFAULT_TTL)
    return retention
71
+
72
+
73
def poll_after_ms() -> int:
    """Delay reported to callers between status polls, in milliseconds (fixed)."""
    one_second_ms = 1000
    return one_second_ms
75
+
76
+
77
def _state_root() -> Path:
    """Directory under which every workspace's job records are stored.

    A non-empty ``STATE_ENV`` value wins; otherwise the records live in
    ``~/.cache/cc-plugin-codex/jobs``.
    """
    override = os.environ.get(STATE_ENV)
    if not override:
        return Path.home().joinpath(".cache", "cc-plugin-codex", "jobs")
    return Path(override)
82
+
83
+
84
def _ws_dir(cwd: str) -> Path:
    """State directory for the workspace at ``cwd``.

    The directory name is the sanitised basename of the resolved path (at most
    40 characters) followed by the first 12 hex digits of the path's SHA-256.
    """
    real = os.path.realpath(cwd)
    fingerprint = hashlib.sha256(real.encode()).hexdigest()[:12]
    # os.path.basename on the realpath string keeps the dir-name derivation stable
    # (and matches the digest input); Path.name differs on trailing-slash handling.
    leaf = os.path.basename(real.rstrip("/")) or "workspace"  # noqa: PTH119
    cleaned = []
    for ch in leaf:
        # Anything other than alphanumerics and "._-" is replaced by a dash.
        cleaned.append(ch if ch.isalnum() or ch in "._-" else "-")
    label = "".join(cleaned)[:40] or "ws"
    return _state_root() / f"{label}-{fingerprint}"
92
+
93
+
94
def _job_dir(cwd: str, job_id: str) -> Path:
    """Directory holding one job's files, nested under its workspace directory."""
    workspace = _ws_dir(cwd)
    return workspace.joinpath(job_id)
96
+
97
+
98
+ def _pid_alive(pid: int | None) -> bool:
99
+ if not pid:
100
+ return False
101
+ try:
102
+ os.kill(pid, 0)
103
+ except ProcessLookupError:
104
+ return False
105
+ except PermissionError:
106
+ return True
107
+ return True
108
+
109
+
110
+ def _is_running(pid: int | None) -> bool:
111
+ """Whether the job process is still running.
112
+
113
+ The job is launched detached but is still our child until it exits, so we
114
+ must reap it with waitpid — otherwise it lingers as a zombie that kill(0)
115
+ reports as 'alive' forever. waitpid(WNOHANG) returns (pid, _) once it exits
116
+ (reaping it), (0, 0) while it runs, and raises ChildProcessError if it is not
117
+ our child (e.g. after a server restart), where we fall back to a kill(0)
118
+ liveness probe."""
119
+ if not pid:
120
+ return False
121
+ try:
122
+ reaped, _ = os.waitpid(pid, os.WNOHANG)
123
+ if reaped == pid:
124
+ return False
125
+ if reaped == 0:
126
+ return True
127
+ except ChildProcessError:
128
+ pass # not our child — use the liveness probe below
129
+ except OSError:
130
+ return False
131
+ return _pid_alive(pid)
132
+
133
+
134
+ def _kill_pid_tree(pid: int | None) -> None:
135
+ """Kill the detached job's process group (it is its own session leader), then
136
+ reap it if it was our child so it does not linger as a zombie."""
137
+ if not pid:
138
+ return
139
+ try:
140
+ if hasattr(os, "killpg"):
141
+ os.killpg(os.getpgid(pid), signal.SIGKILL)
142
+ else: # pragma: no cover - non-POSIX fallback
143
+ os.kill(pid, signal.SIGKILL)
144
+ except (ProcessLookupError, PermissionError, OSError):
145
+ pass
146
+ with contextlib.suppress(ChildProcessError, OSError):
147
+ os.waitpid(pid, 0)
148
+
149
+
150
+ def _read_meta(jd: Path) -> dict | None:
151
+ try:
152
+ return json.loads((jd / "meta.json").read_text())
153
+ except (OSError, json.JSONDecodeError):
154
+ return None
155
+
156
+
157
+ def _write_meta(jd: Path, meta: dict) -> None:
158
+ (jd / "meta.json").write_text(json.dumps(meta))
159
+
160
+
161
+ def _read_envelope(jd: Path) -> dict | None:
162
+ """Parse the claude JSON envelope from result.json, or None if absent/partial."""
163
+ try:
164
+ text = (jd / "result.json").read_text()
165
+ except OSError:
166
+ return None
167
+ text = text.strip()
168
+ if not text:
169
+ return None
170
+ try:
171
+ env = json.loads(text)
172
+ except json.JSONDecodeError:
173
+ return None
174
+ return env if isinstance(env, dict) else None
175
+
176
+
177
@dataclass
class JobConfig:
    """Request settings recorded alongside a background job.

    ``start_job`` copies these values into the job's ``meta.json``.
    """

    # Stored as the record's top-level "kind".
    kind: str
    # The next seven values are stored under the record's "config" mapping.
    config_mode: str
    access: str
    scope: str | None
    base: str | None
    detail: str
    timeout_seconds: int
    workspace_source: str | None
    # Dumped with model_dump() into the record's "context_summary" when present.
    context_summary: ContextSummary | None
    # Optional settings; a missing redacted_paths is stored as an empty list.
    requested_max_budget_usd: float | None = None
    redacted_paths: list[str] | None = None
190
+
191
+
192
def start_job(cmd: list[str], cwd: str, cfg: JobConfig) -> tuple[str, str]:
    """Spawn the claude command detached and persist its record.

    The process runs in ``cwd`` as its own session leader with stdout sent to
    ``result.json`` and stderr to ``stderr.log`` inside a fresh job directory.
    Its stdin is ``/dev/null`` so the job never reads from the stream this
    server inherited. After the record is written the workspace's count cap
    is enforced under the jobs lock, like the other lifecycle calls.

    If the process cannot be spawned the job directory is removed again. If
    the record cannot be written the process is killed and the directory
    removed. In both cases the original exception propagates, so no untracked
    job or orphan directory is left behind.

    Returns (job_id, started_at_iso).
    """
    job_id = uuid4().hex
    jd = _job_dir(cwd, job_id)
    jd.mkdir(parents=True, exist_ok=True)
    # Best-effort: results contain the diff; keep the workspace tree user-only.
    with contextlib.suppress(OSError):
        _ws_dir(cwd).chmod(0o700)
    started = time.time()
    result_path = jd / "result.json"
    stderr_path = jd / "stderr.log"
    try:
        with result_path.open("w") as rf, stderr_path.open("w") as ef:
            proc = subprocess.Popen(
                cmd,
                cwd=cwd,
                stdin=subprocess.DEVNULL,
                stdout=rf,
                stderr=ef,
                text=True,
                start_new_session=True,
            )
    except Exception:
        # Nothing was started: do not leave a record-less directory behind.
        _rmtree(jd)
        raise
    summary = cfg.context_summary.model_dump() if cfg.context_summary else None
    meta = {
        "job_id": job_id,
        "kind": cfg.kind,
        "pid": proc.pid,
        "started_epoch": started,
        # Derived from started_epoch so both timestamps name the same instant.
        "started_at": datetime.fromtimestamp(started, UTC).isoformat(),
        "deadline_epoch": started + max_seconds(),
        "completed_epoch": None,
        "terminal_status": None,  # set by cancel/deadline reap
        "config": {
            "config_mode": cfg.config_mode,
            "access": cfg.access,
            "scope": cfg.scope,
            "base": cfg.base,
            "detail": cfg.detail,
            "timeout_seconds": cfg.timeout_seconds,
            "workspace_source": cfg.workspace_source,
            "cwd": cwd,
            "requested_max_budget_usd": cfg.requested_max_budget_usd,
            "redacted_paths": cfg.redacted_paths or [],
        },
        "context_summary": summary,
    }
    try:
        _write_meta(jd, meta)
    except Exception:
        # Without a record nobody could poll or cancel the process.
        _kill_pid_tree(proc.pid)
        _rmtree(jd)
        raise
    with _JOBS_LOCK:
        _enforce_count_cap(cwd)
    return job_id, meta["started_at"]
236
+
237
+
238
+ def _status_of(jd: Path, meta: dict) -> str:
239
+ """Compute the live status, killing + marking jobs that overran their deadline."""
240
+ terminal = meta.get("terminal_status")
241
+ if terminal:
242
+ return terminal
243
+ if _is_running(meta.get("pid")):
244
+ if time.time() > meta.get("deadline_epoch", float("inf")):
245
+ _kill_pid_tree(meta.get("pid"))
246
+ meta["terminal_status"] = "timeout"
247
+ meta["completed_epoch"] = time.time()
248
+ _write_meta(jd, meta)
249
+ return "timeout"
250
+ return "running"
251
+ # Process gone: done if it left a parseable envelope, else it crashed.
252
+ if meta.get("completed_epoch") is None:
253
+ meta["completed_epoch"] = time.time()
254
+ _write_meta(jd, meta)
255
+ return "done" if _read_envelope(jd) is not None else "failed"
256
+
257
+
258
+ def _elapsed_ms(meta: dict) -> int:
259
+ end = meta.get("completed_epoch") or time.time()
260
+ return max(0, int((end - meta.get("started_epoch", end)) * 1000))
261
+
262
+
263
+ def _deadline_seconds(meta: dict) -> int:
264
+ """The wall-clock window the job was STARTED with (deadline minus start), not
265
+ the current env value — so status stays consistent if the env later changes."""
266
+ started = meta.get("started_epoch")
267
+ deadline = meta.get("deadline_epoch")
268
+ if started is not None and deadline is not None:
269
+ return max(0, round(deadline - started))
270
+ return max_seconds()
271
+
272
+
273
+ def _expires_at(meta: dict) -> str | None:
274
+ completed = meta.get("completed_epoch")
275
+ if completed is None:
276
+ return None
277
+ return datetime.fromtimestamp(completed + ttl_seconds(), UTC).isoformat()
278
+
279
+
280
def _reap_workspace(cwd: str) -> None:
    """Lazy maintenance: refresh statuses and delete expired terminal records.

    Every job directory of the workspace that has a readable record gets its
    status recomputed. Terminal jobs older than the TTL are deleted; age is
    measured from completion, falling back to the start time.
    """
    workspace = _ws_dir(cwd)
    if not workspace.is_dir():
        return
    max_age = ttl_seconds()
    checked_at = time.time()
    for entry in workspace.iterdir():
        if not entry.is_dir():
            continue
        record = _read_meta(entry)
        if record is None:
            continue
        if _status_of(entry, record) not in _TERMINAL:
            continue
        finished = record.get("completed_epoch") or record.get("started_epoch") or checked_at
        if checked_at - finished > max_age:
            _rmtree(entry)
298
+
299
+
300
+ def _expired(meta: dict) -> bool:
301
+ completed = meta.get("completed_epoch")
302
+ if completed is None:
303
+ return False
304
+ return time.time() - completed > ttl_seconds()
305
+
306
+
307
+ def _read_live_job(cwd: str, job_id: str) -> tuple[Path, dict, str] | None:
308
+ """Read and refresh a single job record.
309
+
310
+ Status/result/cancel are commonly called in tight polling loops. Refreshing
311
+ only the requested record avoids unrelated jobs causing latency or waitpid
312
+ races while still preserving the TTL contract for that record.
313
+ """
314
+ jd = _job_dir(cwd, job_id)
315
+ meta = _read_meta(jd)
316
+ if meta is None:
317
+ return None
318
+ state = _status_of(jd, meta)
319
+ if state in _TERMINAL and _expired(meta):
320
+ _rmtree(jd)
321
+ return None
322
+ return jd, meta, state
323
+
324
+
325
def _enforce_count_cap(cwd: str) -> None:
    """Trim the workspace back toward its retained-job cap.

    When the workspace holds more job directories than the cap, the overflow
    is taken from the front of a "terminal first, then oldest start" ordering.
    Only terminal entries in that slice are deleted, so a running job is
    never killed to make room.
    """
    workspace = _ws_dir(cwd)
    limit = _int_env(MAX_COUNT_ENV, DEFAULT_MAX_COUNT)
    job_dirs = []
    if workspace.is_dir():
        job_dirs = [entry for entry in workspace.iterdir() if entry.is_dir()]
    overflow = len(job_dirs) - limit
    if overflow <= 0:
        return
    ranked = []
    for entry in job_dirs:
        record = _read_meta(entry) or {}
        finished = _status_of(entry, record) in _TERMINAL
        ranked.append((finished, record.get("started_epoch", 0.0), entry))
    ranked.sort(key=lambda item: (not item[0], item[1]))  # terminal first, then oldest
    for finished, _started, entry in ranked[:overflow]:
        if finished:
            _rmtree(entry)
341
+
342
+
343
+ def _rmtree(jd: Path) -> None:
344
+ try:
345
+ for child in jd.iterdir():
346
+ child.unlink(missing_ok=True)
347
+ jd.rmdir()
348
+ except OSError:
349
+ pass
350
+
351
+
352
def _build_meta(meta: dict) -> Meta:
    """Build the response ``Meta`` from a stored job record.

    Request settings come from the record's ``config`` mapping, with defaults
    for missing keys; elapsed time and job id come from the record itself.
    """
    config = meta.get("config", {})
    workspace = config.get("cwd", "")
    origin = config.get("workspace_source")
    fields = {
        "cwd": workspace,
        "workspace_source": origin,
        "workspace_warning": workspace_warning_for(origin, workspace),
        "config_mode": config.get("config_mode", "inherit"),
        "access": config.get("access", "toolless"),
        "scope": config.get("scope"),
        "base": config.get("base"),
        "timeout_seconds": config.get("timeout_seconds", max_seconds()),
        "requested_max_budget_usd": config.get("requested_max_budget_usd"),
        "redacted_paths": config.get("redacted_paths") or [],
        "elapsed_ms": _elapsed_ms(meta),
        "job_id": meta.get("job_id"),
    }
    return Meta(**fields)
370
+
371
+
372
def _terminal_cost(jd: Path, state: str) -> float | None:
    """Spend recorded by a terminal job, or None.

    Cost is surfaced for ANY terminal state, not just done: a cancelled or
    timed-out job can still leave a parseable (possibly partial) envelope
    that recorded cost. This matches the result path (_job_error) and the
    JobStatus.cost_usd contract ('terminal jobs that spent').
    """
    if state in _TERMINAL:
        envelope = _read_envelope(jd) or {}
        spent = envelope.get("total_cost_usd")
        if isinstance(spent, (int, float)):
            return float(spent)
    return None
384
+
385
+
386
def status(cwd: str, job_id: str) -> dict | None:
    """Return a JobStatus dict, or None if the job does not exist."""
    with _JOBS_LOCK:
        found = _read_live_job(cwd, job_id)
        # found is (job_dir, meta, state), the positional order _status_dict takes.
        return None if found is None else _status_dict(*found)
394
+
395
+
396
def _status_dict(jd: Path, meta: dict, state: str) -> dict:
    """Assemble the JobStatus payload for a job whose ``state`` is already known.

    ``detail`` carries the stderr tail for failed jobs only, and
    ``result_available`` is true only for ``done``.
    """
    spent = _terminal_cost(jd, state)
    failure_tail = _stderr_tail(jd) if state == "failed" else None
    snapshot = {
        "ok": True,
        "job_id": meta.get("job_id", jd.name),
        "kind": meta.get("kind", ""),
        "status": state,
        "started_at": meta.get("started_at", ""),
        "elapsed_ms": _elapsed_ms(meta),
        "deadline_seconds": _deadline_seconds(meta),
        "poll_after_ms": poll_after_ms(),
        "ttl_seconds": ttl_seconds(),
        "expires_at": _expires_at(meta),
        "result_available": state == "done",
        "cost_usd": spent,
        "detail": failure_tail,
        "fingerprint": FINGERPRINT,
    }
    return snapshot
417
+
418
+
419
def list_jobs(cwd: str) -> dict:
    """Return a JobListResult dict of the workspace's known jobs, newest first.

    Listing is not strictly read-only: like the other lifecycle calls it
    reaps first, so it can refresh statuses and delete expired records.
    """
    with _JOBS_LOCK:
        _reap_workspace(cwd)
        workspace = _ws_dir(cwd)
        entries = list(workspace.iterdir()) if workspace.is_dir() else []
        dated = []  # (started_epoch, summary) pairs; the epoch is only a sort key
        for entry in entries:
            if not entry.is_dir():
                continue
            record = _read_meta(entry)
            if record is None:
                continue
            state = _status_of(entry, record)
            dated.append(
                (
                    record.get("started_epoch", 0.0),
                    {
                        "job_id": record.get("job_id", entry.name),
                        "kind": record.get("kind", ""),
                        "status": state,
                        "started_at": record.get("started_at", ""),
                        "elapsed_ms": _elapsed_ms(record),
                        "result_available": state == "done",
                        "expires_at": _expires_at(record),
                        "cost_usd": _terminal_cost(entry, state),
                    },
                )
            )
        dated.sort(key=lambda pair: pair[0], reverse=True)  # newest first
        jobs = [summary for _epoch, summary in dated]
        return {"ok": True, "jobs": jobs, "fingerprint": FINGERPRINT}
453
+
454
+
455
+ def _stderr_tail(jd: Path, limit: int = 200) -> str | None:
456
+ try:
457
+ text = (jd / "stderr.log").read_text().strip()
458
+ except OSError:
459
+ return None
460
+ return text[-limit:] or None
461
+
462
+
463
def result(cwd: str, job_id: str, consume: bool = False):
    """Return ``(payload, found)`` for a job.

    ``found`` is False (with a None payload) when no such job exists. For a
    ``done`` job, payload is the normalized result of its envelope, and the
    record is deleted afterwards when ``consume`` is true. Any other state
    yields an error payload, so the contract stays ok-discriminated.
    """
    with _JOBS_LOCK:
        found = _read_live_job(cwd, job_id)
        if found is None:
            return None, False
        jd, meta, state = found
        if state != "done":
            return _job_error(meta, state, jd), True
        raw_envelope = (jd / "result.json").read_text()
        stored_summary = meta.get("context_summary")
        if stored_summary:
            ctx_summary = ContextSummary(**stored_summary)
        else:
            ctx_summary = None
        payload = normalize_envelope(
            meta.get("kind", "claude_review_changes"),
            raw_envelope,
            _build_meta(meta),
            detail=meta.get("config", {}).get("detail", "summary"),
            context_summary=ctx_summary,
        )
        if consume:
            _rmtree(jd)
        return payload, True
488
+
489
+
490
+ _STATE_TO_ERROR = {
491
+ "running": (
492
+ "job_running",
493
+ "The job is still running.",
494
+ "Poll claude_job_status; call claude_job_result once status=done.",
495
+ ),
496
+ "cancelled": (
497
+ "job_cancelled",
498
+ "The job was cancelled.",
499
+ "Start a new job; a cancelled run cannot be resumed.",
500
+ ),
501
+ "timeout": (
502
+ "job_timeout",
503
+ "The job exceeded its wall-clock deadline and was stopped.",
504
+ "Narrow the scope or raise CC_PLUGIN_CODEX_JOB_MAX_SECONDS, then start a new job.",
505
+ ),
506
+ }
507
+
508
+
509
def _job_error(meta: dict, state: str, jd: Path) -> dict:
    """Build the ErrorResult payload for a job that has no usable result.

    A failed job whose stderr tail carries a contract-drift signature gets
    the contract-changed error; any other failure is a retryable
    ``job_failed`` with the stderr tail appended to the message. Remaining
    states are looked up in ``_STATE_TO_ERROR`` and only ``running`` is
    retryable. Spend recorded by a (possibly partial) envelope is attached to
    the meta.
    """
    if state != "failed":
        code, message, repair = _STATE_TO_ERROR.get(
            state, ("job_failed", "The job did not complete.", "Start a new job.")
        )
        retryable = state == "running"
    else:
        tail = _stderr_tail(jd)
        # A failed job whose stderr carries a drift signature is the async twin of
        # the sync cli_contract_changed path — classify it the same way so async
        # callers get the same actionable error instead of a generic job_failed.
        if is_contract_drift(tail):
            info = contract_changed_error()
            code = info.code
            message = info.message
            repair = info.repair
            retryable = info.retryable
        else:
            code = "job_failed"
            message = f"The job failed without producing a result. {tail or ''}".strip()
            repair = "Run claude_status to check the CLI is installed and authenticated, then retry."
            retryable = True
    bmeta = _build_meta(meta)
    # Surface any spend the (possibly partial) envelope recorded.
    envelope = _read_envelope(jd)
    if envelope:
        apply_cost_usage(bmeta, envelope)
    error = ErrorInfo(
        code=cast("ErrorCode", code), message=message, repair=repair, retryable=retryable
    )
    return ErrorResult(error=error, meta=bmeta).model_dump(mode="json", exclude_none=True)
546
+
547
+
548
def cancel(cwd: str, job_id: str) -> dict | None:
    """Kill a running job and mark it cancelled. Returns a JobStatus dict or None.

    None means the job does not exist. A job that is already terminal is
    left untouched and reported with its existing status.
    """
    with _JOBS_LOCK:
        found = _read_live_job(cwd, job_id)
        if found is None:
            return None
        jd, meta, state = found
        if state in _TERMINAL:
            return _status_dict(jd, meta, state)
        _kill_pid_tree(meta.get("pid"))
        meta["terminal_status"] = "cancelled"
        meta["completed_epoch"] = time.time()
        _write_meta(jd, meta)
        return _status_dict(jd, meta, "cancelled")