flint-slating 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,62 @@
1
+ """Entrypoint: `python -m flint_slating` -> uvicorn (default) or stdio.
2
+
3
+ Mirrors deco-assaying's CLI shape:
4
+
5
+ flint-slating # HTTP (Streamable-HTTP MCP on PORT)
6
+ flint-slating --transport http # same
7
+ flint-slating --transport stdio # stdio MCP (for mcp.json integrations)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import logging
14
+
15
+ import anyio
16
+ import uvicorn
17
+
18
+ from flint_slating import jobs, pdf_reader
19
+ from flint_slating.config import HOST, PORT
20
+
21
+
22
+ def _parse_args() -> argparse.Namespace:
23
+ parser = argparse.ArgumentParser(prog="flint-slating", description="PDF-reading MCP server.")
24
+ parser.add_argument(
25
+ "--transport",
26
+ choices=["http", "stdio"],
27
+ default="http",
28
+ help="MCP transport to use (default: http)",
29
+ )
30
+ return parser.parse_args()
31
+
32
+
33
async def _run_stdio() -> None:
    """Run the MCP server over the stdio transport.

    Switches the jobs module to stdio mode, attempts a Docling warm-up
    (a failure is logged as a warning and otherwise ignored), then runs the
    shared `mcp` server on the streams provided by `stdio_server`.
    """
    # Both imports happen at call time rather than at module import time.
    from mcp.server.stdio import stdio_server

    from flint_slating.mcp_server import mcp

    jobs.set_transport_mode("stdio")
    try:
        pdf_reader.warm_docling()
    except Exception as exc:
        logging.getLogger(__name__).warning("docling warmup failed: %s", exc)

    async with stdio_server() as streams:
        reader, writer = streams
        await mcp.run(reader, writer, mcp.create_initialization_options())
50
+
51
+
52
def main() -> None:
    """Configure logging, parse the CLI, and start the chosen transport.

    ``--transport stdio`` runs `_run_stdio` under anyio; anything else
    starts uvicorn serving ``flint_slating.app:app`` on HOST/PORT.
    """
    logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
    transport = _parse_args().transport
    if transport != "stdio":
        uvicorn.run("flint_slating.app:app", host=HOST, port=PORT)
        return
    anyio.run(_run_stdio)
59
+
60
+
61
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
flint_slating/app.py ADDED
@@ -0,0 +1,41 @@
1
+ """FastAPI app construction for the HTTP transport.
2
+
3
+ Logging is configured by the entry point; importing this module does not
4
+ touch the root logger.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from fastapi import FastAPI
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from fastapi.middleware.gzip import GZipMiddleware
12
+ from starlette.routing import Route
13
+
14
+ from flint_slating.config import VERSION
15
+ from flint_slating.routes import lifespan, mcp_asgi, router
16
+
17
# The ASGI application served by the HTTP transport. `lifespan`, `router`
# and `mcp_asgi` are all provided by flint_slating.routes.
app = FastAPI(
    title="flint-slating",
    version=VERSION,
    description=(
        "PDF-reading MCP server. The /sse endpoint exposes the MCP "
        "Streamable-HTTP transport; /outputs/{job_id}/* serves finished "
        "job artifacts."
    ),
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
)
# CORS is fully open: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Response compression; 256 is handed to the middleware as `minimum_size`.
app.add_middleware(GZipMiddleware, minimum_size=256)
# Streamable HTTP MCP transport at /sse, mounted as raw ASGI3 so
# Starlette doesn't wrap SSE in request_response (which would break
# streaming).
app.router.routes.append(Route("/sse", endpoint=mcp_asgi, methods=["GET", "POST", "DELETE"]))
# Remaining HTTP endpoints come from the router defined in routes.py.
app.include_router(router)
@@ -0,0 +1,45 @@
1
+ """Static configuration. Pure leaf module — no internal imports."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from importlib.metadata import PackageNotFoundError, version
7
+ from pathlib import Path
8
+
9
def _env_int(name: str, default: int) -> int:
    """Read integer environment variable *name*, or return *default*.

    An unset, empty, or whitespace-only value counts as "not configured"
    and yields *default*. Any other value must parse as an integer.

    Raises:
        ValueError: if the value is present but not an integer; the message
            names the offending variable.
    """
    raw = (os.environ.get(name) or "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"environment variable {name} must be an integer, got {raw!r}"
        ) from None


try:
    VERSION: str = version("flint-slating")
except PackageNotFoundError:
    # Not installed as a distribution (e.g. running from a source checkout).
    VERSION = "0.0.0+local"

# Every setting below treats an empty environment value the same as an
# unset one, so a blank `VAR=` line falls back to the default instead of
# crashing the import or silently selecting the current directory.
PORT = _env_int("PORT", 35833)
HOST = os.environ.get("HOST") or "0.0.0.0"

PUBLIC_BASE_URL: str = (os.environ.get("PUBLIC_BASE_URL") or f"http://localhost:{PORT}").rstrip("/")

OUTPUT_ROOT: Path = Path(os.environ.get("OUTPUT_ROOT") or "./output").resolve()
CACHE_ROOT: Path = Path(os.environ.get("CACHE_ROOT") or "./cache").resolve()

OUTPUT_EXPIRY_DAYS: int = _env_int("OUTPUT_EXPIRY_DAYS", 7)
JOB_HISTORY_MAX: int = _env_int("JOB_HISTORY_MAX", 100)

# Inline-source caps. Anything bigger than these gets refused at the
# pdf_source boundary — we never put a 1 GB base64 blob through the MCP
# transport.
MAX_INLINE_PDF_BYTES: int = _env_int("MAX_INLINE_PDF_BYTES", 25 * 1024 * 1024)
MAX_URL_PDF_BYTES: int = _env_int("MAX_URL_PDF_BYTES", 200 * 1024 * 1024)

# Page count under which Docling-backed tools run inline (sync) rather
# than queuing a job. OCR runs always queue, regardless of page count.
SYNC_PAGE_THRESHOLD: int = _env_int("SYNC_PAGE_THRESHOLD", 20)

# Where Docling stores its layout model. We export this into the process
# environment too so docling itself picks it up.
DOCLING_ARTIFACTS_PATH: Path = Path(
    os.environ.get("DOCLING_ARTIFACTS_PATH") or (Path.home() / ".cache" / "docling")
).resolve()
os.environ.setdefault("DOCLING_ARTIFACTS_PATH", str(DOCLING_ARTIFACTS_PATH))

ENABLE_OCR: bool = (os.environ.get("ENABLE_OCR") or "false").strip().lower() in {
    "1",
    "true",
    "yes",
    "on",
}

# Image extraction caps to keep response payloads sane.
MAX_IMAGE_EXTRACT_BYTES: int = _env_int("MAX_IMAGE_EXTRACT_BYTES", 8 * 1024 * 1024)
@@ -0,0 +1,100 @@
1
+ """Image listing and extraction.
2
+
3
+ Backed by pypdf — Docling's image objects don't carry the raw stream
4
+ bytes in a stable way across versions, but pypdf gives us direct xref
5
+ access. Permissive license (BSD-3), already in the dep tree.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import base64
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from flint_slating import config
15
+ from flint_slating.pdf_reader import EncryptedPdfError, PdfError, _open_pypdf
16
+
17
+
18
def list_images(path: Path, password: str | None = None) -> dict[str, Any]:
    """Enumerate every embedded image in the PDF at *path*.

    Returns ``{"images": [...]}``. Each entry is
    ``{page, index, name, width, height, ext}`` where ``page`` is 1-based
    and ``index`` is the 0-based position on that page — the same pair
    `extract_image` accepts.

    Raises:
        EncryptedPdfError: if the opened reader reports itself as encrypted.
        PdfError: if a page's image list cannot be obtained.
    """
    pdf = _open_pypdf(path, password)
    # NOTE(review): pypdf documents `is_encrypted` as staying true even after
    # a successful `decrypt()`. Whether this guard also rejects correctly
    # unlocked files depends on what `_open_pypdf` returns — confirm.
    if pdf.is_encrypted:
        raise EncryptedPdfError("PDF is encrypted; provide a password")

    def _stub(page_number: int, position: int, image: Any) -> dict[str, Any]:
        label = getattr(image, "name", "") or ""
        return {
            "page": page_number,
            "index": position,
            "name": label,
            "width": _img_dim(image, "width"),
            "height": _img_dim(image, "height"),
            "ext": _ext_from_name(label),
        }

    stubs: list[dict[str, Any]] = []
    for page_number, pdf_page in enumerate(pdf.pages, start=1):
        try:
            page_images = pdf_page.images
        except Exception as exc:
            raise PdfError(f"image enumeration failed on page {page_number}: {exc}") from exc
        stubs.extend(
            _stub(page_number, position, image)
            for position, image in enumerate(page_images)
        )
    return {"images": stubs}
45
+
46
+
47
def extract_image(
    path: Path,
    *,
    page: int,
    index: int,
    password: str | None = None,
) -> dict[str, Any]:
    """Extract one image's raw bytes by (1-based page, 0-based index).

    Returns ``{page, index, ext, size, data_b64}`` where ``data_b64`` is the
    base64-encoded image data and ``size`` its length in bytes before
    encoding. Capped by `config.MAX_IMAGE_EXTRACT_BYTES`.

    Raises:
        EncryptedPdfError: if the opened reader reports itself as encrypted.
        PdfError: if the page or index is out of range, the page has no
            images, image enumeration fails, the image is empty, or it
            exceeds the size cap.
    """
    reader = _open_pypdf(path, password)
    if reader.is_encrypted:
        raise EncryptedPdfError("PDF is encrypted; provide a password")

    page_count = len(reader.pages)
    if page < 1 or page > page_count:
        raise PdfError(f"page {page} out of range (1..{page_count})")

    # Same failure mapping as `list_images`, so callers see one error type
    # for enumeration problems regardless of which entry point they used.
    try:
        images = reader.pages[page - 1].images
        image_count = len(images)
    except Exception as e:
        raise PdfError(f"image enumeration failed on page {page}: {e}") from e

    if image_count == 0:
        # Avoids the misleading "(0..-1)" range in the message below.
        raise PdfError(f"page {page} contains no images")
    if index < 0 or index >= image_count:
        raise PdfError(f"image index {index} out of range (0..{image_count - 1})")

    img = images[index]
    raw = bytes(img.data or b"")
    if not raw:
        raise PdfError("image has no data")
    if len(raw) > config.MAX_IMAGE_EXTRACT_BYTES:
        raise PdfError(
            f"image size {len(raw)} exceeds MAX_IMAGE_EXTRACT_BYTES "
            f"({config.MAX_IMAGE_EXTRACT_BYTES})"
        )
    return {
        "page": page,
        "index": index,
        "ext": _ext_from_name(getattr(img, "name", "") or ""),
        "size": len(raw),
        "data_b64": base64.b64encode(raw).decode("ascii"),
    }
83
+
84
+
85
+ def _img_dim(img: Any, attr: str) -> int | None:
86
+ try:
87
+ pil = getattr(img, "image", None)
88
+ if pil is not None:
89
+ return int(getattr(pil, attr.replace("width", "width").replace("height", "height")))
90
+ except Exception:
91
+ pass
92
+ return None
93
+
94
+
95
+ def _ext_from_name(name: str) -> str:
96
+ name = name.lower()
97
+ for ext in (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".gif"):
98
+ if name.endswith(ext):
99
+ return ext.lstrip(".")
100
+ return ""
flint_slating/jobs.py ADDED
@@ -0,0 +1,262 @@
1
+ """Background jobs for big PDF conversions.
2
+
3
+ A job runs the full Docling pipeline on a single PDF and writes the
4
+ result to `OUTPUT_ROOT/{job_id}/`. The hybrid sync/async split lives in
5
+ `tools.py` — small PDFs are converted inline; large ones (`page_count >
6
+ SYNC_PAGE_THRESHOLD`) call `start_read_markdown` or `start_read_chunks`
7
+ and get a `job_id` back immediately.
8
+
9
+ Workers run on a single background thread per job, daemon-flagged so the
10
+ process can exit cleanly. There's no process pool here — Docling already
11
+ holds a per-process layout model, and parallel runs would step on its
12
+ cache.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ import threading
20
+ import time
21
+ import traceback
22
+ import uuid
23
+ from collections import OrderedDict
24
+ from pathlib import Path
25
+ from typing import Any, Literal
26
+
27
+ from flint_slating import config, outputs, pdf_reader, pdf_source
28
+
29
+ log = logging.getLogger(__name__)
30
+
31
# Held around every access to `_jobs` in this module.
_lock = threading.Lock()
# In-memory job registry: job_id -> job record, kept in insertion order.
_jobs: OrderedDict[str, dict[str, Any]] = OrderedDict()

JobKind = Literal["read_markdown", "read_chunks"]
# Statuses after which a job is treated as finished.
_TERMINAL_STATES = frozenset({"done", "failed", "cancelled"})

# Whether jobs should emit HTTP-style result URLs (set by the entry
# point). Stdio mode flips this off — there's no HTTP server backing
# `/outputs/{id}/...`.
_mode: Literal["stdio", "http"] = "http"
41
+
42
+
43
def set_transport_mode(mode: Literal["stdio", "http"]) -> None:
    """Record which transport this process is serving.

    The value is stored in the module-level `_mode`.

    Raises:
        ValueError: if *mode* is neither ``"stdio"`` nor ``"http"``. The
            `Literal` annotation is not enforced at runtime, and an
            unrecognised value would otherwise silently disable result URLs.
    """
    global _mode
    if mode not in ("stdio", "http"):
        raise ValueError(f"unknown transport mode: {mode!r}")
    _mode = mode
46
+
47
+
48
def transport_mode() -> Literal["stdio", "http"]:
    """Return the currently recorded transport mode."""
    current = _mode
    return current
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Public entry points (called from tools.py)
54
+
55
+
56
def start_read_markdown(
    *,
    resolved: pdf_source.Resolved,
    pages: list[int] | None,
    password: str | None,
) -> tuple[str, Path]:
    """Start a background ``read_markdown`` job for *resolved*.

    Returns ``(job_id, job_dir)`` as produced by `_start_job`.
    """
    job_args = {"resolved": resolved, "pages": pages, "password": password}
    return _start_job(kind="read_markdown", **job_args)
68
+
69
+
70
def start_read_chunks(
    *,
    resolved: pdf_source.Resolved,
    pages: list[int] | None,
    password: str | None,
) -> tuple[str, Path]:
    """Start a background ``read_chunks`` job for *resolved*.

    Returns ``(job_id, job_dir)`` as produced by `_start_job`.
    """
    job_args = {"resolved": resolved, "pages": pages, "password": password}
    return _start_job(kind="read_chunks", **job_args)
82
+
83
+
84
def get_status(job_id: str) -> dict[str, Any] | None:
    """Return the public snapshot of *job_id*, or ``None`` if it is unknown."""
    with _lock:
        record = _jobs.get(job_id)
        return None if record is None else _public_view(record)
90
+
91
+
92
def cancel(job_id: str) -> bool:
    """Flag *job_id* for cancellation.

    Returns ``False`` only when the job is not registered. A job already in
    a terminal state is left untouched; otherwise its ``_cancel`` flag is
    set. Both of those cases return ``True``.
    """
    with _lock:
        record = _jobs.get(job_id)
        if record is None:
            return False
        if record["status"] not in _TERMINAL_STATES:
            record["_cancel"] = True
        return True
101
+
102
+
103
def is_active(job_id: str) -> bool:
    """Return True if *job_id* is registered and not yet in a terminal state."""
    with _lock:
        record = _jobs.get(job_id)
        if not record:
            return False
        return record["status"] not in _TERMINAL_STATES
107
+
108
+
109
def drop(job_id: str) -> bool:
    """Remove *job_id* from the registry; return whether it was present."""
    with _lock:
        try:
            del _jobs[job_id]
        except KeyError:
            return False
        return True
115
+
116
+
117
def list_jobs(*, limit: int = 100, status: str | None = None) -> list[dict[str, Any]]:
    """Return public snapshots of registered jobs, newest first.

    *limit* is clamped to the range ``1..max(1, config.JOB_HISTORY_MAX)``.
    When *status* is given, only snapshots whose ``state`` equals it are
    returned. The filter is applied before the limit.
    """
    ceiling = max(1, config.JOB_HISTORY_MAX)
    effective_limit = max(1, min(limit, ceiling))
    with _lock:
        newest_first = [_public_view(record) for record in reversed(_jobs.values())]
    if status:
        newest_first = [snap for snap in newest_first if snap["state"] == status]
    return newest_first[:effective_limit]
124
+
125
+
126
def wait_for_result(job_id: str, timeout_seconds: float = 1800.0) -> dict[str, Any]:
    """Block until `job_id` reaches a terminal state or the timeout elapses.

    Used by the stdio transport to keep the request inline (no separate
    `get_job_*` calls).

    Returns the job's public snapshot once it is terminal,
    ``{"error": "unknown_job_id"}`` if the job is not registered, or
    ``{"error": "timeout", "snapshot": ...}`` carrying the last snapshot
    seen when *timeout_seconds* runs out. The job is polled every 0.25 s.
    """
    # Monotonic clock: a wall-clock adjustment (NTP step, manual change)
    # must neither cut the wait short nor extend it.
    deadline = time.monotonic() + timeout_seconds
    while True:
        snap = get_status(job_id)
        if snap is None:
            return {"error": "unknown_job_id"}
        if snap["state"] in _TERMINAL_STATES:
            return snap
        if time.monotonic() > deadline:
            return {"error": "timeout", "snapshot": snap}
        time.sleep(0.25)
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Internals
143
+
144
+
145
def _start_job(
    *,
    kind: JobKind,
    resolved: pdf_source.Resolved,
    pages: list[int] | None,
    password: str | None,
) -> tuple[str, Path]:
    """Register a new job and launch its worker thread.

    Allocates a 16-hex-character job id and its output directory, stores the
    job record in the registry (evicting old finished jobs if the registry
    is over capacity), then starts a daemon thread running `_run`.

    Returns ``(job_id, job_dir)``.
    """
    job_id = uuid.uuid4().hex[:16]
    job_dir = outputs.prepare_job_dir(job_id)
    created_at = time.time()

    record: dict[str, Any] = dict(
        job_id=job_id,
        kind=kind,
        source_path=str(resolved.path),
        sha256=resolved.sha256,
        size=resolved.size,
        pages=pages or [],
        # The password lives only in this in-memory record; it is not
        # serialized to disk.
        password=password,
        job_dir=str(job_dir),
        status="pending",
        started_at=created_at,
        finished_at=None,
        error=None,
        _cancel=False,
    )
    with _lock:
        _jobs[job_id] = record
        _evict_if_full(now_inserting_id=job_id)

    worker = threading.Thread(
        target=_run,
        args=(job_id,),
        name=f"pdf-job-{job_id}",
        daemon=True,
    )
    worker.start()
    return job_id, job_dir
176
+
177
+
178
def _evict_if_full(*, now_inserting_id: str) -> None:
    """Trim the registry down to ``max(1, config.JOB_HISTORY_MAX)`` entries.

    Removes the oldest jobs that are in a terminal state, never the job
    identified by *now_inserting_id*. Stops early when no such job is left,
    so the registry may stay over capacity while jobs are still active.

    Does not take `_lock` itself; `_start_job` calls it with the lock held.
    """
    capacity = max(1, config.JOB_HISTORY_MAX)
    while len(_jobs) > capacity:
        victim = next(
            (
                jid
                for jid, record in _jobs.items()
                if jid != now_inserting_id and record["status"] in _TERMINAL_STATES
            ),
            None,
        )
        if victim is None:
            return
        del _jobs[victim]
187
+
188
+
189
def _set_status(job_id: str, status: str, *, error: str | None = None) -> None:
    """Update the status of *job_id*; a no-op if the job is not registered.

    Entering a terminal state also stamps ``finished_at``. ``error`` is
    stored only when it is not ``None``.
    """
    with _lock:
        record = _jobs.get(job_id)
        if record is not None:
            record["status"] = status
            if status in _TERMINAL_STATES:
                record["finished_at"] = time.time()
            if error is not None:
                record["error"] = error
199
+
200
+
201
def _run(job_id: str) -> None:
    """Worker-thread body: run one conversion job and record its outcome.

    Reads the job's parameters from the registry, runs the matching
    `pdf_reader` conversion, writes the result files into the job directory
    and sets the final status. Events are appended to ``log.jsonl`` in the
    job directory along the way.

    A cancellation requested through `cancel` is honoured at two points:
    before the conversion starts and after it returns, before any result is
    written. The conversion call itself is not interrupted. A cancelled job
    ends in the ``"cancelled"`` state and writes no result files.

    Any exception from the conversion or the writes ends the job in the
    ``"failed"`` state; nothing is raised to the thread.
    """
    with _lock:
        job = _jobs.get(job_id)
        if job is None:
            # Dropped from the registry before this thread got to run.
            return
        kind: JobKind = job["kind"]
        path = Path(job["source_path"])
        pages = job["pages"] or None
        password = job["password"]
        job_dir = Path(job["job_dir"])

    log_path = job_dir / "log.jsonl"
    if _cancel_requested(job_id):
        _emit(log_path, {"event": "cancelled"})
        _set_status(job_id, "cancelled")
        return

    _emit(log_path, {"event": "started", "kind": kind})
    _set_status(job_id, "running")
    try:
        if kind == "read_markdown":
            result = pdf_reader.read_markdown(path, pages=pages, password=password)
        elif kind == "read_chunks":
            result = pdf_reader.read_chunks(path, pages=pages, password=password)
        else:
            raise ValueError(f"unknown job kind: {kind}")

        # Re-check after the long-running call so a cancel issued while the
        # conversion was in flight does not end up reported as "done".
        if _cancel_requested(job_id):
            _emit(log_path, {"event": "cancelled"})
            _set_status(job_id, "cancelled")
            return

        if kind == "read_markdown":
            outputs.write_result_markdown(job_dir, result["markdown"])
            outputs.write_result_json(job_dir, json.dumps({"page_count": result["page_count"]}))
        else:
            outputs.write_result_json(job_dir, json.dumps(result))
        _emit(log_path, {"event": "done"})
        _set_status(job_id, "done")
    except pdf_reader.EncryptedPdfError as e:
        _emit(log_path, {"event": "failed", "error": str(e)})
        _set_status(job_id, "failed", error=f"encrypted: {e}")
    except Exception as e:
        log.exception("job %s failed", job_id)
        _emit(log_path, {"event": "failed", "error": str(e)})
        _set_status(
            job_id,
            "failed",
            error=f"{type(e).__name__}: {e}\n{traceback.format_exc()}",
        )


def _cancel_requested(job_id: str) -> bool:
    """Return True when the registered job *job_id* has its cancel flag set."""
    with _lock:
        job = _jobs.get(job_id)
        return bool(job and job.get("_cancel"))
236
+
237
+
238
+ def _public_view(job: dict[str, Any]) -> dict[str, Any]:
239
+ job_id = job["job_id"]
240
+ state = job["status"]
241
+ output_url: str | None = None
242
+ if state == "done" and _mode == "http":
243
+ suffix = "result.md" if job["kind"] == "read_markdown" else "result.json"
244
+ output_url = f"{config.PUBLIC_BASE_URL}/outputs/{job_id}/{suffix}"
245
+ return {
246
+ "job_id": job_id,
247
+ "kind": job["kind"],
248
+ "state": state,
249
+ "output_url": output_url,
250
+ "started_at": job["started_at"],
251
+ "finished_at": job["finished_at"],
252
+ "error": job["error"],
253
+ }
254
+
255
+
256
+ def _emit(log_path: Path, event: dict[str, Any]) -> None:
257
+ line = {"ts": time.time(), **event}
258
+ try:
259
+ with open(log_path, "a", encoding="utf-8") as f:
260
+ f.write(json.dumps(line) + "\n")
261
+ except OSError:
262
+ pass
@@ -0,0 +1,26 @@
1
+ """Transport-agnostic MCP `Server` instance.
2
+
3
+ Both the stdio entry (`stdio_entry.py`) and the HTTP routes (`routes.py`)
4
+ import the `mcp` object from here. Tool registration happens at import
5
+ time so either transport sees the same tool catalog.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from mcp import types
11
+ from mcp.server import Server
12
+
13
+ from flint_slating import tools
14
+ from flint_slating.config import VERSION
15
+
16
# The single Server instance; the handlers defined in this module register
# themselves on it through its decorators.
mcp: Server = Server("flint-slating", version=VERSION)
17
+
18
+
19
@mcp.list_tools()
async def _list_tools() -> list[types.Tool]:
    """Return the tool catalog held in `tools.TOOLS`."""
    return tools.TOOLS
22
+
23
+
24
@mcp.call_tool()
async def _call_tool(name: str, arguments: dict) -> list[types.TextContent]:
    """Forward a tool invocation to `tools.dispatch` and return its result."""
    return await tools.dispatch(name, arguments)
@@ -0,0 +1,139 @@
1
+ """Per-job output directory layout + path-safe artifact reads.
2
+
3
+ Each job gets `config.OUTPUT_ROOT/{job_id}/` with:
4
+ result.md — full Markdown (when produced)
5
+ result.json — chunked / structured output (when produced)
6
+ log.jsonl — append-only event log
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import shutil
12
+ import time
13
+ from pathlib import Path
14
+ from typing import Any, TypedDict
15
+
16
+ from flint_slating import config
17
+
18
+
19
class OutputError(Exception):
    """Raised when a /outputs/... request can't be served safely.

    Used in this module for paths that would fall outside the output root
    or outside a job's own directory.
    """
21
+
22
+
23
class ArtifactMissing(OutputError):
    """A specific artifact (e.g. result.md) hasn't been written yet.

    Raised by the `read_result_*` helpers when the file is not present in
    the job directory.
    """
25
+
26
+
27
class JobDirRow(TypedDict):
    """One row of `list_outputs_root`: a job directory found on disk."""

    # Name of the directory under the output root.
    job_id: str
    # Sum of the sizes of all regular files below the directory.
    size: int
    # `st_mtime` of the directory itself.
    mtime: float
31
+
32
+
33
def prepare_job_dir(job_id: str) -> Path:
    """Allocate `OUTPUT_ROOT/{job_id}/` for a fresh job.

    Creates the output root if needed, then the job directory, and returns
    the resolved job directory path.

    Raises:
        OutputError: if *job_id* does not resolve to a direct child of the
            output root. That covers traversal (``../x``), nested ids
            (``a/b``) and ids such as ``""`` or ``"."`` that resolve to the
            root itself.
    """
    root = config.OUTPUT_ROOT
    root.mkdir(parents=True, exist_ok=True)
    resolved_root = root.resolve()
    job_dir = (root / job_id).resolve(strict=False)
    # A plain "is under root" test also accepts the root itself, which would
    # make every job's files look like they belong to this job.
    if job_dir == resolved_root or job_dir.parent != resolved_root:
        raise OutputError(f"refusing to create job dir outside OUTPUT_ROOT: {job_id!r}")
    job_dir.mkdir(parents=True, exist_ok=True)
    return job_dir
42
+
43
+
44
def resolve_job_dir(job_id: str) -> Path | None:
    """Find a job's output dir on disk under `config.OUTPUT_ROOT`.

    Returns the resolved directory when *job_id* names an existing direct
    child directory of the output root, otherwise ``None``. Ids that resolve
    to the root itself (``""``, ``"."``) or to anything outside or nested
    deeper are rejected, so one id can never expose another job's files.
    """
    root = config.OUTPUT_ROOT.resolve()
    candidate = (config.OUTPUT_ROOT / job_id).resolve(strict=False)
    if candidate == root or candidate.parent != root:
        return None
    return candidate if candidate.is_dir() else None
51
+
52
+
53
def safe_subpath(job_dir: Path, rel: str) -> Path:
    """Resolve *rel* beneath *job_dir*, refusing anything that escapes it.

    Leading ``/`` characters and then leading ``\\`` characters are stripped
    from *rel* first; if nothing remains, *job_dir* itself is returned.

    Raises:
        OutputError: if the resolved path is not inside *job_dir*.
    """
    trimmed = rel.lstrip("/").lstrip("\\")
    if trimmed == "":
        return job_dir
    base = job_dir.resolve()
    candidate = (job_dir / trimmed).resolve(strict=False)
    if _is_under(candidate, base):
        return candidate
    raise OutputError(f"path escapes job dir: {rel!r}")
62
+
63
+
64
def read_result_markdown(job_dir: Path) -> str:
    """Return the UTF-8 text of ``result.md`` in *job_dir*.

    Raises:
        ArtifactMissing: if ``result.md`` is not an existing file.
    """
    artifact = job_dir.joinpath("result.md")
    if artifact.is_file():
        return artifact.read_text(encoding="utf-8")
    raise ArtifactMissing("result.md not present")
69
+
70
+
71
def read_result_json(job_dir: Path) -> str:
    """Return the UTF-8 text of ``result.json`` in *job_dir*, unparsed.

    Raises:
        ArtifactMissing: if ``result.json`` is not an existing file.
    """
    artifact = job_dir.joinpath("result.json")
    if artifact.is_file():
        return artifact.read_text(encoding="utf-8")
    raise ArtifactMissing("result.json not present")
76
+
77
+
78
def write_result_markdown(job_dir: Path, markdown: str) -> Path:
    """Write *markdown* to ``result.md`` in *job_dir* as UTF-8.

    Returns the path of the written file.
    """
    destination = job_dir.joinpath("result.md")
    destination.write_text(markdown, encoding="utf-8")
    return destination
82
+
83
+
84
def write_result_json(job_dir: Path, payload: str) -> Path:
    """Write the already-serialized *payload* to ``result.json`` as UTF-8.

    Returns the path of the written file.
    """
    destination = job_dir.joinpath("result.json")
    destination.write_text(payload, encoding="utf-8")
    return destination
88
+
89
+
90
def list_outputs_root() -> list[JobDirRow]:
    """List the job directories under the output root, newest first.

    Returns an empty list when the output root does not exist. Each row
    carries the directory name, the total size of the files below it, and
    the directory's modification time, which is also the sort key.
    """
    root = config.OUTPUT_ROOT
    if not root.is_dir():
        return []

    def _row(job_dir: Path) -> JobDirRow:
        total = 0
        for entry in job_dir.rglob("*"):
            if entry.is_file():
                total += entry.stat().st_size
        return JobDirRow(job_id=job_dir.name, size=total, mtime=job_dir.stat().st_mtime)

    rows: list[JobDirRow] = [_row(child) for child in root.iterdir() if child.is_dir()]
    return sorted(rows, key=lambda row: row["mtime"], reverse=True)
102
+
103
+
104
def remove_job_dir(job_dir: Path) -> None:
    """Recursively delete *job_dir*; removal errors propagate to the caller."""
    shutil.rmtree(job_dir)
106
+
107
+
108
def sweep_expired(now: float | None = None) -> int:
    """Remove job dirs older than `OUTPUT_EXPIRY_DAYS`. Returns count removed.

    A directory is expired when its modification time is earlier than
    ``now - OUTPUT_EXPIRY_DAYS`` days; *now* defaults to the current time.
    Nothing is removed when the expiry is zero or negative, or when the
    output root does not exist.

    Removal is best-effort: errors are ignored, and a directory that could
    not be fully removed is not included in the returned count.
    """
    if config.OUTPUT_EXPIRY_DAYS <= 0:
        return 0
    root = config.OUTPUT_ROOT
    if not root.is_dir():
        return 0
    reference = time.time() if now is None else now
    cutoff = reference - (config.OUTPUT_EXPIRY_DAYS * 86400)
    removed = 0
    for child in root.iterdir():
        if not child.is_dir():
            continue
        try:
            if child.stat().st_mtime >= cutoff:
                continue
            shutil.rmtree(child, ignore_errors=True)
            # `ignore_errors=True` hides failures, so confirm the directory
            # is really gone before counting it.
            if not child.exists():
                removed += 1
        except OSError:
            continue
    return removed
127
+
128
+
129
+ def _is_under(child: Path, parent: Path) -> bool:
130
+ try:
131
+ child.relative_to(parent)
132
+ except ValueError:
133
+ return False
134
+ return True
135
+
136
+
137
+ def _ensure_dict(_o: Any) -> dict[str, Any]:
138
+ """Stub for future schema validation hook."""
139
+ return _o