PyPI - flint-slating - Versions diffs - 0.1.0__py3-none-any.whl - Mend

flint-slating 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

flint_slating/__init__.py +0 -0
flint_slating/__main__.py +62 -0
flint_slating/app.py +41 -0
flint_slating/config.py +45 -0
flint_slating/images.py +100 -0
flint_slating/jobs.py +262 -0
flint_slating/mcp_server.py +26 -0
flint_slating/outputs.py +139 -0
flint_slating/pdf_reader.py +420 -0
flint_slating/pdf_source.py +184 -0
flint_slating/routes.py +175 -0
flint_slating/schema.py +40 -0
flint_slating/tables.py +91 -0
flint_slating/tools.py +364 -0
flint_slating-0.1.0.dist-info/METADATA +188 -0
flint_slating-0.1.0.dist-info/RECORD +19 -0
flint_slating-0.1.0.dist-info/WHEEL +4 -0
flint_slating-0.1.0.dist-info/entry_points.txt +2 -0
flint_slating-0.1.0.dist-info/licenses/LICENSE +21 -0

flint_slating/__init__.py ADDED Viewed

File without changes

flint_slating/__main__.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""Entrypoint: `python -m flint_slating` -> uvicorn (default) or stdio.
+Mirrors deco-assaying's CLI shape:
+    flint-slating                      # HTTP (Streamable-HTTP MCP on PORT)
+    flint-slating --transport http     # same
+    flint-slating --transport stdio    # stdio MCP (for mcp.json integrations)
+"""
+from __future__ import annotations
+import argparse
+import logging
+import anyio
+import uvicorn
+from flint_slating import jobs, pdf_reader
+from flint_slating.config import HOST, PORT
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(prog="flint-slating", description="PDF-reading MCP server.")
+    parser.add_argument(
+        "--transport",
+        choices=["http", "stdio"],
+        default="http",
+        help="MCP transport to use (default: http)",
+    )
+    return parser.parse_args()
+async def _run_stdio() -> None:
+    from mcp.server.stdio import stdio_server
+    from flint_slating.mcp_server import mcp
+    jobs.set_transport_mode("stdio")
+    try:
+        pdf_reader.warm_docling()
+    except Exception as e:
+        logging.getLogger(__name__).warning("docling warmup failed: %s", e)
+    async with stdio_server() as (read_stream, write_stream):
+        await mcp.run(
+            read_stream,
+            write_stream,
+            mcp.create_initialization_options(),
+        )
+def main() -> None:
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+    args = _parse_args()
+    if args.transport == "stdio":
+        anyio.run(_run_stdio)
+    else:
+        uvicorn.run("flint_slating.app:app", host=HOST, port=PORT)
+if __name__ == "__main__":
+    main()

flint_slating/app.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""FastAPI app construction for the HTTP transport.
+Logging is configured by the entry point; importing this module does not
+touch the root logger.
+"""
+from __future__ import annotations
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.middleware.gzip import GZipMiddleware
+from starlette.routing import Route
+from flint_slating.config import VERSION
+from flint_slating.routes import lifespan, mcp_asgi, router
+# The ASGI application object. `flint_slating.__main__` hands uvicorn the
+# import string "flint_slating.app:app" for the HTTP transport.
+app = FastAPI(
+    title="flint-slating",
+    version=VERSION,
+    description=(
+        "PDF-reading MCP server. The /sse endpoint exposes the MCP "
+        "Streamable-HTTP transport; /outputs/{job_id}/* serves finished "
+        "job artifacts."
+    ),
+    lifespan=lifespan,
+    docs_url="/docs",
+    redoc_url="/redoc",
+    openapi_url="/openapi.json",
+)
+# NOTE(review): CORS accepts any origin, method and header — confirm this is
+# intended for deployments reachable beyond localhost.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Responses smaller than 256 bytes are sent uncompressed.
+app.add_middleware(GZipMiddleware, minimum_size=256)
+# Streamable HTTP MCP transport at /sse, mounted as raw ASGI3 so
+# Starlette doesn't wrap SSE in request_response (which would break
+# streaming).
+app.router.routes.append(Route("/sse", endpoint=mcp_asgi, methods=["GET", "POST", "DELETE"]))
+# Remaining HTTP routes come from `flint_slating.routes.router`.
+app.include_router(router)

flint_slating/config.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Static configuration. Pure leaf module — no internal imports."""
+from __future__ import annotations
+import os
+from importlib.metadata import PackageNotFoundError, version
+from pathlib import Path
+# Installed distribution version; a placeholder is used when the package
+# metadata cannot be found.
+try:
+    VERSION: str = version("flint-slating")
+except PackageNotFoundError:
+    VERSION = "0.0.0+local"
+# Listen address for the HTTP transport.
+# NOTE(review): the default host binds every interface — confirm that is
+# the intended default.
+PORT = int(os.environ.get("PORT", "35833"))
+HOST = os.environ.get("HOST", "0.0.0.0")
+# Base URL used to build links to job outputs; any trailing "/" is removed.
+PUBLIC_BASE_URL: str = (os.environ.get("PUBLIC_BASE_URL") or f"http://localhost:{PORT}").rstrip("/")
+# Output and cache locations, resolved to absolute paths at import time.
+OUTPUT_ROOT: Path = Path(os.environ.get("OUTPUT_ROOT", "./output")).resolve()
+CACHE_ROOT: Path = Path(os.environ.get("CACHE_ROOT", "./cache")).resolve()
+# Age in days after which `outputs.sweep_expired` removes a job directory;
+# a value <= 0 disables the sweep.
+OUTPUT_EXPIRY_DAYS: int = int(os.environ.get("OUTPUT_EXPIRY_DAYS", "7"))
+# Cap on the in-memory job registry and on `jobs.list_jobs` results.
+JOB_HISTORY_MAX: int = int(os.environ.get("JOB_HISTORY_MAX", "100"))
+# Inline-source caps. Anything bigger than these gets refused at the
+# pdf_source boundary — we never put a 1 GB base64 blob through the MCP
+# transport.
+MAX_INLINE_PDF_BYTES: int = int(os.environ.get("MAX_INLINE_PDF_BYTES", str(25 * 1024 * 1024)))
+MAX_URL_PDF_BYTES: int = int(os.environ.get("MAX_URL_PDF_BYTES", str(200 * 1024 * 1024)))
+# Page count under which Docling-backed tools run inline (sync) rather
+# than queuing a job. OCR runs always queue, regardless of page count.
+SYNC_PAGE_THRESHOLD: int = int(os.environ.get("SYNC_PAGE_THRESHOLD", "20"))
+# Where Docling stores its layout model. We export this into the process
+# environment too so docling itself picks it up.
+DOCLING_ARTIFACTS_PATH: Path = Path(
+    os.environ.get("DOCLING_ARTIFACTS_PATH") or (Path.home() / ".cache" / "docling")
+).resolve()
+os.environ.setdefault("DOCLING_ARTIFACTS_PATH", str(DOCLING_ARTIFACTS_PATH))
+# Enabled by any of 1/true/yes/on (case-insensitive); off by default.
+ENABLE_OCR: bool = os.environ.get("ENABLE_OCR", "false").lower() in {"1", "true", "yes", "on"}
+# Image extraction caps to keep response payloads sane.
+MAX_IMAGE_EXTRACT_BYTES: int = int(os.environ.get("MAX_IMAGE_EXTRACT_BYTES", str(8 * 1024 * 1024)))

flint_slating/images.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""Image listing and extraction.
+Backed by pypdf — Docling's image objects don't carry the raw stream
+bytes in a stable way across versions, but pypdf gives us direct xref
+access. Permissive license (BSD-3), already in the dep tree.
+"""
+from __future__ import annotations
+import base64
+from pathlib import Path
+from typing import Any
+from flint_slating import config
+from flint_slating.pdf_reader import EncryptedPdfError, PdfError, _open_pypdf
+def list_images(path: Path, password: str | None = None) -> dict[str, Any]:
+    """Return a flat list of image stubs across the document.
+    Each entry: `{page, index, name, width, height, ext}`. The (page,
+    index) pair is enough to extract the bytes via `extract_image`.
+    """
+    reader = _open_pypdf(path, password)
+    if reader.is_encrypted:
+        raise EncryptedPdfError("PDF is encrypted; provide a password")
+    out: list[dict[str, Any]] = []
+    for pno, page in enumerate(reader.pages, start=1):
+        try:
+            images = page.images  # pypdf returns a list of ImageFile
+        except Exception as e:
+            raise PdfError(f"image enumeration failed on page {pno}: {e}") from e
+        for idx, img in enumerate(images):
+            out.append(
+                {
+                    "page": pno,
+                    "index": idx,
+                    "name": getattr(img, "name", "") or "",
+                    "width": _img_dim(img, "width"),
+                    "height": _img_dim(img, "height"),
+                    "ext": _ext_from_name(getattr(img, "name", "") or ""),
+                }
+            )
+    return {"images": out}
+def extract_image(
+    path: Path,
+    *,
+    page: int,
+    index: int,
+    password: str | None = None,
+) -> dict[str, Any]:
+    """Extract one image's raw bytes by (1-based page, 0-based index).
+    Returns base64-encoded data and the original extension. Capped by
+    `config.MAX_IMAGE_EXTRACT_BYTES`.
+    """
+    reader = _open_pypdf(path, password)
+    if reader.is_encrypted:
+        raise EncryptedPdfError("PDF is encrypted; provide a password")
+    if page < 1 or page > len(reader.pages):
+        raise PdfError(f"page {page} out of range (1..{len(reader.pages)})")
+    images = reader.pages[page - 1].images
+    if index < 0 or index >= len(images):
+        raise PdfError(f"image index {index} out of range (0..{len(images) - 1})")
+    img = images[index]
+    raw = bytes(img.data or b"")
+    if len(raw) == 0:
+        raise PdfError("image has no data")
+    if len(raw) > config.MAX_IMAGE_EXTRACT_BYTES:
+        raise PdfError(
+            f"image size {len(raw)} exceeds MAX_IMAGE_EXTRACT_BYTES "
+            f"({config.MAX_IMAGE_EXTRACT_BYTES})"
+        )
+    return {
+        "page": page,
+        "index": index,
+        "ext": _ext_from_name(getattr(img, "name", "") or ""),
+        "size": len(raw),
+        "data_b64": base64.b64encode(raw).decode("ascii"),
+    }
+def _img_dim(img: Any, attr: str) -> int | None:
+    try:
+        pil = getattr(img, "image", None)
+        if pil is not None:
+            return int(getattr(pil, attr.replace("width", "width").replace("height", "height")))
+    except Exception:
+        pass
+    return None
+def _ext_from_name(name: str) -> str:
+    name = name.lower()
+    for ext in (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".gif"):
+        if name.endswith(ext):
+            return ext.lstrip(".")
+    return ""

flint_slating/jobs.py ADDED Viewed

@@ -0,0 +1,262 @@
+"""Background jobs for big PDF conversions.
+A job runs the full Docling pipeline on a single PDF and writes the
+result to `OUTPUT_ROOT/{job_id}/`. The hybrid sync/async split lives in
+`tools.py` — small PDFs are converted inline; large ones (`page_count >
+SYNC_PAGE_THRESHOLD`) call `start_read_markdown` or `start_read_chunks`
+and get a `job_id` back immediately.
+Workers run on a single background thread per job, daemon-flagged so the
+process can exit cleanly. There's no process pool here — Docling already
+holds a per-process layout model, and parallel runs would step on its
+cache.
+"""
+from __future__ import annotations
+import json
+import logging
+import threading
+import time
+import traceback
+import uuid
+from collections import OrderedDict
+from pathlib import Path
+from typing import Any, Literal
+from flint_slating import config, outputs, pdf_reader, pdf_source
+log = logging.getLogger(__name__)
+# Taken by the functions in this module around every access to `_jobs`.
+_lock = threading.Lock()
+# In-memory job registry keyed by job_id, kept in insertion order.
+_jobs: OrderedDict[str, dict[str, Any]] = OrderedDict()
+# The job kinds `_run` knows how to execute.
+JobKind = Literal["read_markdown", "read_chunks"]
+# Statuses treated as final by the status checks in this module.
+_TERMINAL_STATES = frozenset({"done", "failed", "cancelled"})
+# Whether jobs should emit HTTP-style result URLs (set by the entry
+# point). Stdio mode flips this off — there's no HTTP server backing
+# `/outputs/{id}/...`.
+_mode: Literal["stdio", "http"] = "http"
+def set_transport_mode(mode: Literal["stdio", "http"]) -> None:
+    """Record the active transport in the module-level `_mode`."""
+    global _mode
+    _mode = mode
+def transport_mode() -> Literal["stdio", "http"]:
+    """Return the transport recorded by `set_transport_mode` ("http" by default)."""
+    return _mode
+# ---------------------------------------------------------------------------
+# Public entry points (called from tools.py)
+def start_read_markdown(
+    *,
+    resolved: pdf_source.Resolved,
+    pages: list[int] | None,
+    password: str | None,
+) -> tuple[str, Path]:
+    return _start_job(
+        kind="read_markdown",
+        resolved=resolved,
+        pages=pages,
+        password=password,
+    )
+def start_read_chunks(
+    *,
+    resolved: pdf_source.Resolved,
+    pages: list[int] | None,
+    password: str | None,
+) -> tuple[str, Path]:
+    return _start_job(
+        kind="read_chunks",
+        resolved=resolved,
+        pages=pages,
+        password=password,
+    )
+def get_status(job_id: str) -> dict[str, Any] | None:
+    with _lock:
+        job = _jobs.get(job_id)
+        if job is None:
+            return None
+        return _public_view(job)
+def cancel(job_id: str) -> bool:
+    with _lock:
+        job = _jobs.get(job_id)
+        if job is None:
+            return False
+        if job["status"] in _TERMINAL_STATES:
+            return True
+        job["_cancel"] = True
+        return True
+def is_active(job_id: str) -> bool:
+    with _lock:
+        job = _jobs.get(job_id)
+        return bool(job and job["status"] not in _TERMINAL_STATES)
+def drop(job_id: str) -> bool:
+    with _lock:
+        if job_id in _jobs:
+            del _jobs[job_id]
+            return True
+        return False
+def list_jobs(*, limit: int = 100, status: str | None = None) -> list[dict[str, Any]]:
+    limit = max(1, min(limit, max(1, config.JOB_HISTORY_MAX)))
+    with _lock:
+        snapshots = [_public_view(j) for j in reversed(list(_jobs.values()))]
+    if status:
+        snapshots = [s for s in snapshots if s["state"] == status]
+    return snapshots[:limit]
+def wait_for_result(job_id: str, timeout_seconds: float = 1800.0) -> dict[str, Any]:
+    """Block until `job_id` reaches a terminal state. Used by the stdio
+    transport to keep the request inline (no separate `get_job_*` calls)."""
+    deadline = time.time() + timeout_seconds
+    while True:
+        snap = get_status(job_id)
+        if snap is None:
+            return {"error": "unknown_job_id"}
+        if snap["state"] in _TERMINAL_STATES:
+            return snap
+        if time.time() > deadline:
+            return {"error": "timeout", "snapshot": snap}
+        time.sleep(0.25)
+# ---------------------------------------------------------------------------
+# Internals
+def _start_job(
+    *,
+    kind: JobKind,
+    resolved: pdf_source.Resolved,
+    pages: list[int] | None,
+    password: str | None,
+) -> tuple[str, Path]:
+    job_id = uuid.uuid4().hex[:16]
+    job_dir = outputs.prepare_job_dir(job_id)
+    now = time.time()
+    job: dict[str, Any] = {
+        "job_id": job_id,
+        "kind": kind,
+        "source_path": str(resolved.path),
+        "sha256": resolved.sha256,
+        "size": resolved.size,
+        "pages": pages or [],
+        "password": password,  # not serialized to disk; only kept in-memory
+        "job_dir": str(job_dir),
+        "status": "pending",
+        "started_at": now,
+        "finished_at": None,
+        "error": None,
+        "_cancel": False,
+    }
+    with _lock:
+        _jobs[job_id] = job
+        _evict_if_full(now_inserting_id=job_id)
+    threading.Thread(target=_run, args=(job_id,), name=f"pdf-job-{job_id}", daemon=True).start()
+    return job_id, job_dir
+def _evict_if_full(*, now_inserting_id: str) -> None:
+    cap = max(1, config.JOB_HISTORY_MAX)
+    while len(_jobs) > cap:
+        for jid, job in _jobs.items():
+            if jid != now_inserting_id and job["status"] in _TERMINAL_STATES:
+                del _jobs[jid]
+                break
+        else:
+            return
+def _set_status(job_id: str, status: str, *, error: str | None = None) -> None:
+    """Update a job's status under `_lock`; unknown ids are ignored.
+    A terminal status also stamps `finished_at`. `error` is stored only
+    when it is not None.
+    """
+    with _lock:
+        record = _jobs.get(job_id)
+        if record is not None:
+            record["status"] = status
+            if status in _TERMINAL_STATES:
+                record["finished_at"] = time.time()
+            if error is not None:
+                record["error"] = error
+def _run(job_id: str) -> None:
+    with _lock:
+        job = _jobs[job_id]
+        kind: JobKind = job["kind"]
+        path = Path(job["source_path"])
+        pages = job["pages"] or None
+        password = job["password"]
+        job_dir = Path(job["job_dir"])
+    log_path = job_dir / "log.jsonl"
+    _emit(log_path, {"event": "started", "kind": kind})
+    _set_status(job_id, "running")
+    try:
+        if kind == "read_markdown":
+            result = pdf_reader.read_markdown(path, pages=pages, password=password)
+            outputs.write_result_markdown(job_dir, result["markdown"])
+            outputs.write_result_json(job_dir, json.dumps({"page_count": result["page_count"]}))
+        elif kind == "read_chunks":
+            result = pdf_reader.read_chunks(path, pages=pages, password=password)
+            outputs.write_result_json(job_dir, json.dumps(result))
+        else:
+            raise ValueError(f"unknown job kind: {kind}")
+        _emit(log_path, {"event": "done"})
+        _set_status(job_id, "done")
+    except pdf_reader.EncryptedPdfError as e:
+        _emit(log_path, {"event": "failed", "error": str(e)})
+        _set_status(job_id, "failed", error=f"encrypted: {e}")
+    except Exception as e:
+        log.exception("job %s failed", job_id)
+        _emit(log_path, {"event": "failed", "error": str(e)})
+        _set_status(
+            job_id,
+            "failed",
+            error=f"{type(e).__name__}: {e}\n{traceback.format_exc()}",
+        )
+def _public_view(job: dict[str, Any]) -> dict[str, Any]:
+    job_id = job["job_id"]
+    state = job["status"]
+    output_url: str | None = None
+    if state == "done" and _mode == "http":
+        suffix = "result.md" if job["kind"] == "read_markdown" else "result.json"
+        output_url = f"{config.PUBLIC_BASE_URL}/outputs/{job_id}/{suffix}"
+    return {
+        "job_id": job_id,
+        "kind": job["kind"],
+        "state": state,
+        "output_url": output_url,
+        "started_at": job["started_at"],
+        "finished_at": job["finished_at"],
+        "error": job["error"],
+    }
+def _emit(log_path: Path, event: dict[str, Any]) -> None:
+    line = {"ts": time.time(), **event}
+    try:
+        with open(log_path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(line) + "\n")
+    except OSError:
+        pass

flint_slating/mcp_server.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Transport-agnostic MCP `Server` instance.
+Both the stdio entry point (`__main__.py`) and the HTTP routes (`routes.py`)
+import the `mcp` object from here. Tool registration happens at import
+time so either transport sees the same tool catalog.
+"""
+from __future__ import annotations
+from mcp import types
+from mcp.server import Server
+from flint_slating import tools
+from flint_slating.config import VERSION
+# The server instance on which the two handlers below are registered.
+mcp: Server = Server("flint-slating", version=VERSION)
+@mcp.list_tools()
+async def _list_tools() -> list[types.Tool]:
+    """Return the tool catalog, `tools.TOOLS`."""
+    return tools.TOOLS
+@mcp.call_tool()
+async def _call_tool(name: str, arguments: dict) -> list[types.TextContent]:
+    """Forward a tool call to `tools.dispatch` and return its result."""
+    return await tools.dispatch(name, arguments)

flint_slating/outputs.py ADDED Viewed

@@ -0,0 +1,139 @@
+"""Per-job output directory layout + path-safe artifact reads.
+Each job gets `config.OUTPUT_ROOT/{job_id}/` with:
+    result.md      — full Markdown (when produced)
+    result.json    — chunked / structured output (when produced)
+    log.jsonl      — append-only event log
+"""
+from __future__ import annotations
+import shutil
+import time
+from pathlib import Path
+from typing import Any, TypedDict
+from flint_slating import config
+class OutputError(Exception):
+    """Raised when a /outputs/... request can't be served safely.
+    Also raised by `prepare_job_dir` and `safe_subpath` when a path would
+    fall outside its allowed directory.
+    """
+class ArtifactMissing(OutputError):
+    """A specific artifact (e.g. result.md) hasn't been written yet.
+    Raised by `read_result_markdown` and `read_result_json`.
+    """
+class JobDirRow(TypedDict):
+    """One row of `list_outputs_root`: a job directory and its size on disk."""
+    # Directory name under OUTPUT_ROOT.
+    job_id: str
+    # Sum of the sizes of all regular files below the directory.
+    size: int
+    # `st_mtime` of the directory itself.
+    mtime: float
+def prepare_job_dir(job_id: str) -> Path:
+    """Allocate `OUTPUT_ROOT/{job_id}/` for a fresh job."""
+    root = config.OUTPUT_ROOT
+    root.mkdir(parents=True, exist_ok=True)
+    job_dir = (root / job_id).resolve(strict=False)
+    if not _is_under(job_dir, root.resolve()):
+        raise OutputError(f"refusing to create job dir outside OUTPUT_ROOT: {job_id!r}")
+    job_dir.mkdir(parents=True, exist_ok=True)
+    return job_dir
+def resolve_job_dir(job_id: str) -> Path | None:
+    """Find a job's output dir on disk under `config.OUTPUT_ROOT`."""
+    root = config.OUTPUT_ROOT.resolve()
+    candidate = (config.OUTPUT_ROOT / job_id).resolve(strict=False)
+    if candidate.is_dir() and _is_under(candidate, root):
+        return candidate
+    return None
+def safe_subpath(job_dir: Path, rel: str) -> Path:
+    """Resolve `rel` under `job_dir` and reject anything that escapes."""
+    cleaned = rel.lstrip("/").lstrip("\\")
+    if not cleaned:
+        return job_dir
+    target = (job_dir / cleaned).resolve(strict=False)
+    if not _is_under(target, job_dir.resolve()):
+        raise OutputError(f"path escapes job dir: {rel!r}")
+    return target
+def read_result_markdown(job_dir: Path) -> str:
+    target = job_dir / "result.md"
+    if not target.is_file():
+        raise ArtifactMissing("result.md not present")
+    return target.read_text(encoding="utf-8")
+def read_result_json(job_dir: Path) -> str:
+    target = job_dir / "result.json"
+    if not target.is_file():
+        raise ArtifactMissing("result.json not present")
+    return target.read_text(encoding="utf-8")
+def write_result_markdown(job_dir: Path, markdown: str) -> Path:
+    target = job_dir / "result.md"
+    target.write_text(markdown, encoding="utf-8")
+    return target
+def write_result_json(job_dir: Path, payload: str) -> Path:
+    target = job_dir / "result.json"
+    target.write_text(payload, encoding="utf-8")
+    return target
+def list_outputs_root() -> list[JobDirRow]:
+    root = config.OUTPUT_ROOT
+    if not root.is_dir():
+        return []
+    rows: list[JobDirRow] = []
+    for child in root.iterdir():
+        if not child.is_dir():
+            continue
+        size = sum(p.stat().st_size for p in child.rglob("*") if p.is_file())
+        rows.append(JobDirRow(job_id=child.name, size=size, mtime=child.stat().st_mtime))
+    rows.sort(key=lambda r: r["mtime"], reverse=True)
+    return rows
+def remove_job_dir(job_dir: Path) -> None:
+    """Delete `job_dir` and everything below it.
+    Errors are not suppressed (`ignore_errors=False`), so a failed
+    removal propagates to the caller.
+    """
+    shutil.rmtree(job_dir, ignore_errors=False)
+def sweep_expired(now: float | None = None) -> int:
+    """Remove job dirs older than `OUTPUT_EXPIRY_DAYS`. Returns count removed."""
+    if config.OUTPUT_EXPIRY_DAYS <= 0:
+        return 0
+    now = now if now is not None else time.time()
+    cutoff = now - (config.OUTPUT_EXPIRY_DAYS * 86400)
+    removed = 0
+    if not config.OUTPUT_ROOT.is_dir():
+        return 0
+    for child in config.OUTPUT_ROOT.iterdir():
+        if not child.is_dir():
+            continue
+        try:
+            if child.stat().st_mtime < cutoff:
+                shutil.rmtree(child, ignore_errors=True)
+                removed += 1
+        except OSError:
+            continue
+    return removed
+def _is_under(child: Path, parent: Path) -> bool:
+    try:
+        child.relative_to(parent)
+    except ValueError:
+        return False
+    return True
+def _ensure_dict(_o: Any) -> dict[str, Any]:
+    """Stub for future schema validation hook.
+    Currently returns `_o` unchanged, without checking that it is a dict.
+    """
+    return _o