npm - @oriro/orirocli - Versions diffs - 0.1.7 → 0.1.9 - Mend

@oriro/orirocli 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (351) hide show

package/skills/graphify/llm.py ADDED Viewed

@@ -0,0 +1,1896 @@
+# Direct LLM backend for semantic extraction — supports Claude, Kimi K2.6,
+# Gemini, and OpenAI.
+# Used by `graphify extract . --backend gemini` and the benchmark scripts.
+# The default graphify pipeline uses Claude Code subagents via skill.md;
+# this module provides a direct API path for non-Claude-Code environments.
+from __future__ import annotations
+import base64
+import json
+import os
+import re
+import sys
+import time
+from collections.abc import Callable
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, replace
+from pathlib import Path
+# `_read_files` truncates each file at this many characters before joining into
+# the user message. Token estimates use the same cap so packing matches reality.
+_FILE_CHAR_CAP = 20_000
+# `_read_files` also wraps each file in a `=== {rel} ===\n...\n\n` separator;
+# this is roughly the per-file overhead in characters that the prompt adds.
+_PER_FILE_OVERHEAD_CHARS = 80
+# Coarse fallback used only when `tiktoken` is not installed. 1 token ≈ 4 chars
+# is the standard heuristic for English/code on BPE tokenizers.
+_CHARS_PER_TOKEN = 4
+def _get_tokenizer():
+    """Return a tiktoken encoder for accurate token counts, or None if tiktoken
+    is not installed. We use `cl100k_base` (GPT-4 / GPT-3.5-turbo) as a proxy:
+    Kimi-K2 ships a tiktoken-based tokenizer with very similar BPE behaviour,
+    and Claude's tokenizer has a comparable token-to-char ratio for prose/code.
+    Estimates only need to be within ~5%, not exact.
+    """
+    try:
+        import tiktoken
+    except ImportError:
+        return None
+    try:
+        return tiktoken.get_encoding("cl100k_base")
+    except Exception:  # network failure on first-use download, etc.
+        return None
+# Cached at import time. None if tiktoken is unavailable; consumers must handle.
+_TOKENIZER = _get_tokenizer()
+BACKENDS: dict[str, dict] = {
+    "claude": {
+        "base_url": "https://api.anthropic.com",
+        "default_model": "claude-sonnet-4-6",
+        "env_key": "ANTHROPIC_API_KEY",
+        "pricing": {"input": 3.0, "output": 15.0},  # USD per 1M tokens
+        "temperature": 0,
+        "max_tokens": 16384,
+        "vision": True,
+    },
+    "kimi": {
+        "base_url": "https://api.moonshot.ai/v1",
+        "default_model": "kimi-k2.6",
+        "env_key": "MOONSHOT_API_KEY",
+        # kimi-k2.6 is natively multimodal (MoonViT) and accepts the same
+        # OpenAI image_url data-URI block via Moonshot's compat endpoint.
+        "vision": True,
+        "pricing": {"input": 0.74, "output": 4.66},  # USD per 1M tokens
+        "temperature": None,  # kimi-k2.6 enforces its own fixed temperature; sending any value raises 400
+        "max_tokens": 16384,
+    },
+    "ollama": {
+        "base_url": os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434/v1"),
+        "default_model": os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b"),
+        "env_key": "OLLAMA_API_KEY",
+        "pricing": {"input": 0.0, "output": 0.0},
+        "temperature": 0,
+        "max_tokens": 16384,
+    },
+    "gemini": {
+        "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
+        "default_model": "gemini-3-flash-preview",
+        "env_keys": ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
+        "model_env_key": "GRAPHIFY_GEMINI_MODEL",
+        "pricing": {"input": 0.50, "output": 3.00},  # USD per 1M tokens
+        "temperature": 0,
+        "reasoning_effort": "low",
+        "max_completion_tokens": 16384,
+        "vision": True,
+    },
+    "openai": {
+        "base_url": "https://api.openai.com/v1",
+        "default_model": "gpt-4.1-mini",
+        "env_key": "OPENAI_API_KEY",
+        "model_env_key": "GRAPHIFY_OPENAI_MODEL",
+        "pricing": {"input": 0.40, "output": 1.60},  # USD per 1M tokens
+        "temperature": 0,
+        "vision": True,
+    },
+    "deepseek": {
+        "base_url": "https://api.deepseek.com",
+        "default_model": "deepseek-v4-flash",
+        "env_key": "DEEPSEEK_API_KEY",
+        "model_env_key": "GRAPHIFY_DEEPSEEK_MODEL",
+        "pricing": {"input": 0.14, "output": 0.28},  # USD per 1M tokens (v4-flash)
+        # deepseek-reasoner / thinking-mode models silently ignore temperature;
+        # deepseek-chat / v4-flash (non-thinking) accept 0-2. Safe to send 0.
+        "temperature": 0,
+        "max_tokens": 16384,
+    },
+    "azure": {
+        # Azure OpenAI Service — uses AzureOpenAI SDK client, not the standard
+        # OpenAI client, so it has its own call path (_call_azure).
+        # Required env vars: AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT.
+        # Optional: AZURE_OPENAI_API_VERSION (defaults to 2024-12-01-preview),
+        #           AZURE_OPENAI_DEPLOYMENT or GRAPHIFY_AZURE_MODEL (deployment name).
+        # base_url is intentionally absent — prevents accidental routing through
+        # _call_openai_compat, which requires it and uses the wrong SDK client class.
+        "default_model": os.environ.get("AZURE_OPENAI_DEPLOYMENT", os.environ.get("GRAPHIFY_AZURE_MODEL", "gpt-4o")),
+        "env_key": "AZURE_OPENAI_API_KEY",
+        "model_env_key": "GRAPHIFY_AZURE_MODEL",
+        "pricing": {"input": 2.50, "output": 10.00},  # USD per 1M tokens (gpt-4o; may mis-estimate other deployments)
+        "temperature": 0,
+        "max_tokens": 16384,
+    },
+    "bedrock": {
+        "default_model": "anthropic.claude-3-5-sonnet-20241022-v2:0",
+        "model_env_key": "GRAPHIFY_BEDROCK_MODEL",
+        "pricing": {"input": 3.0, "output": 15.0},  # USD per 1M tokens
+        "temperature": 0,
+        "max_tokens": 16384,
+        "vision": True,
+    },
+    "claude-cli": {
+        # Routes through the locally-installed `claude` CLI (Claude Code) using
+        # `-p --output-format json`. Authenticates via the user's existing
+        # Pro/Max subscription instead of a separate ANTHROPIC_API_KEY — costs
+        # are billed to the plan, not pay-as-you-go API credit.
+        "default_model": "claude-code-plan",
+        "pricing": {"input": 0.0, "output": 0.0},
+        "temperature": 0,
+        "max_tokens": 16384,
+        # Claude Code is multimodal; images are passed by path and read with the
+        # CLI's Read tool rather than as inline base64 (see `_call_claude_cli`).
+        "vision": True,
+    },
+}
+def _custom_providers_path(global_: bool = True) -> Path:
+    if global_:
+        return Path.home() / ".graphify" / "providers.json"
+    return Path(".graphify") / "providers.json"
+def provider_base_url_ok(base_url: str, name: str, *, warn: bool = True) -> bool:
+    """Structural safety check for a custom-provider base_url.
+    A custom provider receives the full corpus plus the user's API key, so its
+    base_url is an exfiltration channel. We deliberately do NOT run the ingest
+    SSRF guard here: that blocks private/internal IPs, which would wrongly reject
+    legitimate on-prem corporate LLM gateways. Instead we reject non-http(s)
+    schemes outright and warn loudly when the corpus would leave over plaintext
+    http to a non-loopback host. The primary control against trusting injected
+    config is the GRAPHIFY_ALLOW_LOCAL_PROVIDERS gate on project-local files.
+    """
+    from urllib.parse import urlparse
+    try:
+        parsed = urlparse(base_url)
+    except Exception:
+        if warn:
+            print(f"[graphify] WARNING: provider {name!r} has an unparseable base_url; ignoring.", file=sys.stderr)
+        return False
+    if parsed.scheme not in ("http", "https"):
+        if warn:
+            print(
+                f"[graphify] WARNING: provider {name!r} base_url scheme {parsed.scheme!r} is not "
+                "http/https; ignoring.",
+                file=sys.stderr,
+            )
+        return False
+    host = (parsed.hostname or "").lower()
+    is_loopback = host in ("localhost", "127.0.0.1", "::1") or host.startswith("127.")
+    if warn and parsed.scheme == "http" and not is_loopback:
+        print(
+            f"[graphify] WARNING: provider {name!r} sends your corpus to {host!r} over plaintext "
+            "http. Use https unless this is a trusted local endpoint.",
+            file=sys.stderr,
+        )
+    return True
+def _load_custom_providers() -> dict[str, dict]:
+    # A project-local ./.graphify/providers.json travels with a cloned or shared
+    # repo and defines where the corpus + API key are sent, so loading it
+    # silently is a corpus/key exfiltration vector. Require an explicit opt-in;
+    # the user's own global ~/.graphify/providers.json stays trusted.
+    local_path = _custom_providers_path(global_=False)
+    global_path = _custom_providers_path(global_=True)
+    allow_local = os.environ.get("GRAPHIFY_ALLOW_LOCAL_PROVIDERS", "").strip().lower() in ("1", "true", "yes")
+    if local_path.is_file() and not allow_local:
+        print(
+            f"[graphify] WARNING: ignoring project-local {local_path} (custom providers control "
+            "where your corpus and API key are sent). Set GRAPHIFY_ALLOW_LOCAL_PROVIDERS=1 to load it.",
+            file=sys.stderr,
+        )
+    providers: dict[str, dict] = {}
+    paths = [local_path, global_path] if allow_local else [global_path]
+    for path in paths:
+        if path.is_file():
+            try:
+                data = json.loads(path.read_text(encoding="utf-8"))
+                if isinstance(data, dict):
+                    for name, cfg in data.items():
+                        if not (isinstance(name, str) and isinstance(cfg, dict)):
+                            continue
+                        if name in BACKENDS or name in providers:
+                            continue
+                        if not provider_base_url_ok(str(cfg.get("base_url", "")), name):
+                            continue
+                        if "pricing" not in cfg:
+                            cfg = dict(cfg, pricing={"input": 0.0, "output": 0.0})
+                        providers[name] = cfg
+            except Exception:
+                pass
+    return providers
+BACKENDS.update(_load_custom_providers())
+def _resolve_max_tokens(default: int) -> int:
+    """Honour GRAPHIFY_MAX_OUTPUT_TOKENS env var override, else use backend default."""
+    raw = os.environ.get("GRAPHIFY_MAX_OUTPUT_TOKENS", "").strip()
+    if raw:
+        try:
+            v = int(raw)
+            if v > 0:
+                return v
+        except ValueError:
+            pass
+    return default
+def _resolve_api_timeout(default: float = 600.0) -> float:
+    """Honour GRAPHIFY_API_TIMEOUT env var override, else use default (seconds)."""
+    raw = os.environ.get("GRAPHIFY_API_TIMEOUT", "").strip()
+    if raw:
+        try:
+            v = float(raw)
+            if v > 0:
+                return v
+        except ValueError:
+            pass
+    return default
+_EXTRACTION_SYSTEM = """\
+You are a graphify semantic extraction agent. Extract a knowledge graph fragment from the files provided.
+Output ONLY valid JSON — no explanation, no markdown fences, no preamble.
+Rules:
+- EXTRACTED: relationship explicit in source (import, call, citation, reference)
+- INFERRED: reasonable inference (shared data structure, implied dependency)
+- AMBIGUOUS: uncertain — flag for review, do not omit
+Node ID format: lowercase, only [a-z0-9_], no dots or slashes.
+Format: {stem}_{entity} where stem = filename without extension, entity = symbol name (both normalised).
+Output exactly this schema:
+{"nodes":[{"id":"stem_entity","label":"Human Readable Name","file_type":"code|document|paper|image|rationale|concept","source_file":"relative/path","source_location":null,"source_url":null,"captured_at":null,"author":null,"contributor":null}],"edges":[{"source":"node_id","target":"node_id","relation":"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to","confidence":"EXTRACTED|INFERRED|AMBIGUOUS","confidence_score":1.0,"source_file":"relative/path","source_location":null,"weight":1.0}],"hyperedges":[],"input_tokens":0,"output_tokens":0}
+"""
+_DEEP_EXTRACTION_SUFFIX = """\
+DEEP_MODE: include additional INFERRED edges only for concrete architectural
+signals (shared data contracts, explicit lifecycle coupling, or multi-step flow
+dependencies visible in the sources). Avoid broad conceptual similarity edges.
+Mark uncertain ones AMBIGUOUS instead of omitting.
+"""
+def _extraction_system(*, deep: bool = False) -> str:
+    """Return the semantic-extraction system prompt, optionally in deep mode."""
+    if not deep:
+        return _EXTRACTION_SYSTEM
+    return _EXTRACTION_SYSTEM + _DEEP_EXTRACTION_SUFFIX
+def _file_to_text(path: Path) -> str:
+    """Return a text-like file's content for the extraction prompt.
+    Most files are read directly. PDFs are binary, so reading them with
+    `read_text` yields garbage (the same failure images had); route them through
+    pypdf instead. A scanned PDF with no text layer extracts to an empty string,
+    which still produces a reference node rather than noise.
+    """
+    if path.suffix.lower() == ".pdf":
+        from graphify.detect import extract_pdf_text
+        return extract_pdf_text(path)
+    return path.read_text(encoding="utf-8", errors="replace")
+def _read_files(paths: list[Path], root: Path) -> str:
+    """Return file contents formatted for the extraction prompt."""
+    parts: list[str] = []
+    for p in paths:
+        try:
+            rel = p.relative_to(root)
+        except ValueError:
+            rel = p
+        try:
+            content = _file_to_text(p)
+        except OSError:
+            continue
+        parts.append(f"=== {rel} ===\n{content[:20000]}")
+    return "\n\n".join(parts)
+# ── Image (vision) handling ───────────────────────────────────────────────────
+# Raster image types a vision model can actually look at. `.svg` is intentionally
+# excluded: it is XML markup, so `_read_files` reads it as text (the model parses
+# the source directly), which is more useful than rasterising it. Before this,
+# every image was fed through `path.read_text(errors="replace")`, turning binary
+# pixels into garbage text — noise for API backends and an outright `exit 1` for
+# the claude-cli backend.
+_VISION_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
+_IMAGE_MEDIA_TYPES = {
+    ".png": "image/png",
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".gif": "image/gif",
+    ".webp": "image/webp",
+}
+# Per-image byte ceiling. Anthropic caps a request at 32 MB and Bedrock images
+# at ~5 MB; 5 MB per image keeps every backend within limits. Oversized images
+# fall back to a text reference (the node is still created, just unseen).
+_MAX_IMAGE_BYTES = 5 * 1024 * 1024
+# Flat token estimate per image for chunk packing. Vision models bill an image
+# at a roughly fixed cost regardless of file size, so estimating by byte size
+# (as the generic path does) would force every large PNG into its own chunk.
+_IMAGE_TOKEN_ESTIMATE = 1_600
+# Hard cap on images per chunk, independent of the token budget. A large
+# token budget would otherwise pack hundreds of images into one request —
+# past provider per-request image limits (Anthropic allows 100), and far too
+# many for the claude-cli Read-tool loop to work through. Keeps memory and
+# request size bounded on image-dense corpora.
+_MAX_IMAGES_PER_CHUNK = 20
+# Backends that read an image by file path (claude-cli's Read tool)
+# instead of inlining base64. They open the file themselves and downsample as
+# needed, so `_MAX_IMAGE_BYTES` does not apply and the bytes never need loading.
+_PATH_IMAGE_BACKENDS = {"claude-cli"}
+@dataclass
+class _ImageRef:
+    """A single image destined for a vision request.
+    `raw` is None when the image is unreadable or exceeds `_MAX_IMAGE_BYTES`, or
+    when the target backend has no vision support — in every such case the
+    renderers emit a text reference instead of pixels, so the image still
+    becomes a graph node.
+    """
+    path: Path        # absolute path (claude-cli reads it via the Read tool)
+    rel: str          # path relative to the corpus root (the node's source_file)
+    media_type: str   # e.g. "image/png"
+    raw: bytes | None
+    @property
+    def b64(self) -> str:
+        return base64.standard_b64encode(self.raw).decode("ascii") if self.raw else ""
+    @property
+    def bedrock_format(self) -> str:
+        # Converse wants a bare format token, not a media type.
+        return self.media_type.split("/", 1)[-1]
+def _is_vision_image(path: Path) -> bool:
+    return path.suffix.lower() in _VISION_IMAGE_EXTENSIONS
+def _partition_semantic_files(files: list[Path]) -> tuple[list[Path], list[Path]]:
+    """Split a chunk into (text-like files, raster-image files)."""
+    text_files = [f for f in files if not _is_vision_image(f)]
+    image_files = [f for f in files if _is_vision_image(f)]
+    return text_files, image_files
+def _build_image_refs(image_files: list[Path], root: Path, *, read_bytes: bool = True) -> list[_ImageRef]:
+    """Build `_ImageRef`s for raster images.
+    `read_bytes=True` (base64 backends) loads the pixels and drops any image over
+    `_MAX_IMAGE_BYTES` to a reference, because a base64 request body has a hard
+    size ceiling. `read_bytes=False` (path-based backends — claude-cli)
+    skips the read entirely: those backends open the file themselves and
+    downsample as needed, so there is no per-image size limit and no reason to
+    load (potentially tens of MB of) bytes that would never be used.
+    """
+    refs: list[_ImageRef] = []
+    for p in image_files:
+        try:
+            rel = str(p.relative_to(root))
+        except ValueError:
+            rel = str(p)
+        media = _IMAGE_MEDIA_TYPES.get(p.suffix.lower(), "image/png")
+        raw: bytes | None = None
+        if read_bytes:
+            try:
+                raw = p.read_bytes()
+            except OSError as exc:
+                print(f"[graphify] could not read image {rel}: {exc}", file=sys.stderr)
+                raw = None
+            if raw is not None and len(raw) > _MAX_IMAGE_BYTES:
+                print(
+                    f"[graphify] image {rel} is {len(raw) // 1024} KB, over the "
+                    f"{_MAX_IMAGE_BYTES // (1024 * 1024)} MB inline-image limit for this "
+                    "backend; sending it as a reference node without inline pixels.",
+                    file=sys.stderr,
+                )
+                raw = None
+        try:
+            abs_path = p.resolve()
+        except OSError:
+            abs_path = p
+        refs.append(_ImageRef(abs_path, rel, media, raw))
+    return refs
+def _strip_pixels(refs: list[_ImageRef]) -> list[_ImageRef]:
+    """Return refs with pixel data dropped (for non-vision backends)."""
+    return [replace(r, raw=None) for r in refs]
+def _backend_supports_vision(backend: str) -> bool:
+    """Whether `backend`'s configured model can see images.
+    Ollama is special-cased: its default model is text-only, so vision is
+    opt-in via GRAPHIFY_OLLAMA_VISION=1 once the user selects a vision model
+    (e.g. --model llama3.2-vision).
+    """
+    if backend == "ollama":
+        return os.environ.get("GRAPHIFY_OLLAMA_VISION", "").strip() == "1"
+    return bool(BACKENDS.get(backend, {}).get("vision", False))
+def _image_notes(refs: list[_ImageRef], *, with_paths: bool = False) -> str:
+    """Text block listing the images so the model emits one node per image.
+    Always included alongside the visual payload (and used on its own when the
+    backend can't see pixels), so an image becomes a graph node either way.
+    `with_paths=True` also lists the absolute path and asks the model to open it
+    with the Read tool — used by the claude-cli backend.
+    """
+    if not refs:
+        return ""
+    if with_paths:
+        header = (
+            "Use the Read tool to open and view each image file at the path below, "
+            "then emit one node per image"
+        )
+    else:
+        header = (
+            "The following image file(s) are attached as visual input. Emit one "
+            "node per image"
+        )
+    lines = [
+        "=== IMAGES ===",
+        f"{header} with \"file_type\":\"image\" and the listed source_file, a label "
+        "describing what it depicts (diagram, screenshot, chart, photo, UI, logo), "
+        "and edges to any code/doc nodes the image clearly references.",
+    ]
+    for i, r in enumerate(refs, 1):
+        note = f"[image {i}] source_file: {r.rel}"
+        if with_paths:
+            note += f"  path: {r.path}"
+        if r.raw is None and not with_paths:
+            note += " (not shown: unreadable or exceeds size limit)"
+        lines.append(note)
+    return "\n".join(lines)
+def _with_image_notes(user_message: str, refs: list[_ImageRef], *, with_paths: bool = False) -> str:
+    notes = _image_notes(refs, with_paths=with_paths)
+    if not notes:
+        return user_message
+    if not user_message.strip():
+        return notes
+    return f"{user_message}\n\n{notes}"
+def _anthropic_content(user_message: str, refs: list[_ImageRef]):
+    """Build the Anthropic `messages[].content` value (str, or block list with images)."""
+    blocks = [
+        {"type": "image", "source": {"type": "base64", "media_type": r.media_type, "data": r.b64}}
+        for r in refs
+        if r.raw
+    ]
+    text = _with_image_notes(user_message, refs)
+    if not blocks:
+        return text
+    return [*blocks, {"type": "text", "text": text}]
+def _openai_content(user_message: str, refs: list[_ImageRef]):
+    """Build the OpenAI-compatible user `content` value (str, or part list with images)."""
+    parts: list[dict] = [
+        {
+            "type": "image_url",
+            "image_url": {"url": f"data:{r.media_type};base64,{r.b64}", "detail": "auto"},
+        }
+        for r in refs
+        if r.raw
+    ]
+    text = _with_image_notes(user_message, refs)
+    if not parts:
+        return text
+    return [{"type": "text", "text": text}, *parts]
+def _bedrock_content(user_message: str, refs: list[_ImageRef]) -> list[dict]:
+    """Build the Bedrock Converse user content list (raw bytes, not base64)."""
+    content: list[dict] = [
+        {"image": {"format": r.bedrock_format, "source": {"bytes": r.raw}}}
+        for r in refs
+        if r.raw
+    ]
+    content.append({"text": _with_image_notes(user_message, refs)})
+    return content
+_LLM_JSON_MAX_BYTES = 10 * 1024 * 1024  # 10 MB hard cap before json.loads (F-016)
+def _parse_llm_json(raw: str) -> dict:
+    """Strip optional markdown fences and parse JSON. Returns empty fragment on failure.
+    Caps the input at `_LLM_JSON_MAX_BYTES` so a hostile or runaway model
+    response cannot exhaust memory inside `json.loads` (F-016).
+    """
+    if len(raw) > _LLM_JSON_MAX_BYTES:
+        print(
+            f"[graphify] LLM response exceeds {_LLM_JSON_MAX_BYTES} bytes "
+            f"({len(raw)} bytes); refusing to parse and dropping chunk.",
+            file=sys.stderr,
+        )
+        return {"nodes": [], "edges": [], "hyperedges": []}
+    # Strategy 1: strip whitespace, then handle markdown fences anywhere in the
+    # text (not only at offset 0 — the original code only stripped fences when
+    # `raw.startswith("```")`, missing the common case where Claude prepends a
+    # preamble like "Here's the extracted entities:\n\n```json\n{...}\n```").
+    stripped = raw.strip()
+    fence_start = stripped.find("```")
+    if fence_start != -1:
+        after_fence = stripped[fence_start + 3 :]
+        # Optional language tag (json, JSON, javascript, etc.) up to newline.
+        nl = after_fence.find("\n")
+        if nl != -1 and after_fence[:nl].strip().lower() in {"json", "javascript", "js", ""}:
+            after_fence = after_fence[nl + 1 :]
+        fence_end = after_fence.rfind("```")
+        if fence_end != -1:
+            stripped = after_fence[:fence_end].strip()
+        else:
+            stripped = after_fence.strip()
+    try:
+        return json.loads(stripped)
+    except json.JSONDecodeError:
+        pass
+    # Strategy 2: extract the first balanced JSON object found anywhere in
+    # the text. Handles the case where Claude wraps the JSON in prose without
+    # any markdown fence ("The extracted graph is { ... }. Hope this helps!").
+    start = stripped.find("{")
+    if start != -1:
+        depth = 0
+        in_string = False
+        escape = False
+        for i in range(start, len(stripped)):
+            ch = stripped[i]
+            if escape:
+                escape = False
+                continue
+            if ch == "\\":
+                escape = True
+                continue
+            if ch == '"':
+                in_string = not in_string
+                continue
+            if in_string:
+                continue
+            if ch == "{":
+                depth += 1
+            elif ch == "}":
+                depth -= 1
+                if depth == 0:
+                    try:
+                        return json.loads(stripped[start : i + 1])
+                    except json.JSONDecodeError:
+                        break
+    print(
+        f"[graphify] LLM returned invalid JSON, skipping chunk "
+        f"(first 200 chars: {raw[:200]!r})",
+        file=sys.stderr,
+    )
+    return {"nodes": [], "edges": [], "hyperedges": []}
+def _response_is_hollow(raw_content: str | None, parsed: dict) -> bool:
+    """Detect a successful HTTP response that yielded no usable extraction.
+    A local model under load (most often Ollama) can return HTTP 200 with an
+    empty / null `message.content`, with whitespace, or with a half-generated
+    JSON prefix that fails to parse. All of these collapse to a "successful"
+    call producing zero nodes and zero edges. Without this check the chunk
+    is silently dropped from the corpus because no exception is raised and
+    `finish_reason` is `"stop"` rather than `"length"`. By flagging the
+    result as hollow, callers can re-route it through the same bisection
+    path used for context-window overflow and `finish_reason="length"`.
+    """
+    if raw_content is None or not raw_content.strip():
+        return True
+    nodes = parsed.get("nodes")
+    edges = parsed.get("edges")
+    hyperedges = parsed.get("hyperedges")
+    return not nodes and not edges and not hyperedges
+def _backend_env_keys(backend: str) -> list[str]:
+    """Return accepted API-key environment variables for a backend."""
+    cfg = BACKENDS[backend]
+    keys = cfg.get("env_keys")
+    if keys:
+        return list(keys)
+    env_key = cfg.get("env_key")
+    if env_key:
+        return [env_key]
+    return []
+def _get_backend_api_key(backend: str) -> str:
+    """Return the first configured API key for backend, or an empty string."""
+    for env_key in _backend_env_keys(backend):
+        value = os.environ.get(env_key)
+        if value:
+            return value
+    return ""
+def _format_backend_env_keys(backend: str) -> str:
+    """Return user-facing accepted API-key variable names."""
+    keys = _backend_env_keys(backend)
+    return " or ".join(keys) if keys else "AWS_PROFILE or AWS_REGION"
+def _default_model_for_backend(backend: str) -> str:
+    """Return configured model override or backend default model."""
+    cfg = BACKENDS[backend]
+    model_env_key = cfg.get("model_env_key")
+    if model_env_key:
+        model = os.environ.get(model_env_key)
+        if model:
+            return model
+    return cfg["default_model"]
+def _backend_pkg_hint(pkg: str, extra: str) -> str:
+    """Package-missing message that works for the recommended `uv tool` install.
+    `uv tool install graphifyy` puts graphify in an isolated venv, so a plain
+    `pip install <pkg>` never reaches it - the friction a user hits when a
+    backend needs anthropic/openai/boto3 and the only advice was "pip install".
+    Point at the extra and the uv path first, then the pip/venv fallback.
+    """
+    return (
+        f"the '{pkg}' package is required for this backend but is not installed. "
+        f"Install it with:  uv tool install \"graphifyy[{extra}]\" --force  "
+        f"(uv tool), or  pip install {pkg}  (pip/venv install)."
+    )
+def _call_openai_compat(
+    base_url: str,
+    api_key: str,
+    model: str,
+    user_message: str,
+    temperature: float | None = 0,
+    reasoning_effort: str | None = None,
+    max_completion_tokens: int = 8192,
+    *,
+    backend: str = "",
+    deep_mode: bool = False,
+    images: list[_ImageRef] | None = None,
+) -> dict:
+    """Call any OpenAI-compatible API (Kimi, OpenAI, etc.) and return parsed JSON."""
+    try:
+        from openai import OpenAI
+    except ImportError as exc:
+        extra = backend if backend in ("kimi", "gemini", "openai", "ollama") else "openai"
+        raise ImportError(_backend_pkg_hint("openai", extra)) from exc
+    # Local backends (ollama, llama.cpp, vLLM) routinely take >60s for a
+    # single chunk on a large model — far longer than the openai SDK's
+    # default. Honour GRAPHIFY_API_TIMEOUT (seconds) for explicit override;
+    # default to 600s, which is long enough for a 31B model on a 16k chunk
+    # but still bounds runaway connections (issue #792 addendum).
+    client = OpenAI(api_key=api_key, base_url=base_url, timeout=_resolve_api_timeout())
+    kwargs: dict = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": _extraction_system(deep=deep_mode)},
+            {"role": "user", "content": _openai_content(user_message, images or [])},
+        ],
+        "max_completion_tokens": max_completion_tokens,
+    }
+    if temperature is not None:
+        kwargs["temperature"] = temperature
+    if reasoning_effort is not None:
+        kwargs["reasoning_effort"] = reasoning_effort
+    # Kimi-k2.6 is a reasoning model — disable thinking so content isn't empty
+    if "moonshot" in base_url:
+        kwargs["extra_body"] = {"thinking": {"type": "disabled"}}
+    # Ollama defaults num_ctx to 2048 and silently truncates prompts larger
+    # than that — the symptom is hollow 200 OK responses after the first few
+    # chunks (#798). We derive num_ctx from the actual prompt size so we don't
+    # over-allocate KV-cache VRAM. Over-allocation (e.g. 128k slots for an 8k
+    # prompt on a 31B model) exhausts VRAM by chunk 4 and produces the same
+    # hollow-200 symptom — just from a different direction (#798 follow-up).
+    # Formula: actual input tokens + output cap + system prompt headroom.
+    # Capped at 131072 (enough for the default 60k token_budget); env var wins.
+    if backend == "ollama":
+        num_ctx_raw = os.environ.get("GRAPHIFY_OLLAMA_NUM_CTX", "").strip()
+        # Auto-derive num_ctx from actual chunk size regardless — used as the
+        # fallback and for the mismatch check below.
+        estimated_input = len(user_message) // _CHARS_PER_TOKEN + 400
+        auto_num_ctx = min(estimated_input + max_completion_tokens + 2000, 131072)
+        auto_num_ctx = max(auto_num_ctx, 8192)
+        if num_ctx_raw:
+            try:
+                num_ctx = int(num_ctx_raw)
+            except ValueError:
+                # Bad env var: fall through to auto-derivation (not 131072 —
+                # hardcoding the cap is what causes OOM on constrained VRAM).
+                print(
+                    f"[graphify] GRAPHIFY_OLLAMA_NUM_CTX={num_ctx_raw!r} is not a valid integer; "
+                    f"using auto-derived value ({auto_num_ctx}).",
+                    file=sys.stderr,
+                )
+                num_ctx = auto_num_ctx
+            else:
+                # Warn when the pinned value is smaller than the estimated input —
+                # Ollama silently truncates the prompt and returns empty responses.
+                if num_ctx < estimated_input:
+                    print(
+                        f"[graphify] warning: GRAPHIFY_OLLAMA_NUM_CTX={num_ctx} is smaller than "
+                        f"the estimated chunk input (~{estimated_input} tokens). Ollama will "
+                        f"silently truncate the prompt and return empty responses. "
+                        f"Try --token-budget {max(1024, num_ctx // 3)} or increase NUM_CTX.",
+                        file=sys.stderr,
+                    )
+        else:
+            # Estimate input tokens: user_message chars / 4 (standard BPE
+            # heuristic) + 400 for the system prompt, then add output headroom.
+            num_ctx = auto_num_ctx
+        keep_alive = os.environ.get("GRAPHIFY_OLLAMA_KEEP_ALIVE", "30m")
+        kwargs["extra_body"] = {"options": {"num_ctx": num_ctx}, "keep_alive": keep_alive}
+    resp = client.chat.completions.create(**kwargs)
+    if not resp.choices or resp.choices[0].message is None:
+        raise ValueError("LLM returned empty or filtered response")
+    raw_content = resp.choices[0].message.content
+    result = _parse_llm_json(raw_content or "{}")
+    result["input_tokens"] = resp.usage.prompt_tokens if resp.usage else 0
+    result["output_tokens"] = resp.usage.completion_tokens if resp.usage else 0
+    result["model"] = model
+    # `finish_reason == "length"` means the model hit max_completion_tokens
+    # mid-generation. The JSON we got back is truncated; callers should
+    # treat this as a signal to retry with smaller input.
+    result["finish_reason"] = resp.choices[0].finish_reason
+    # An overwhelmed local model (typically Ollama) can return HTTP 200 with
+    # empty / null content or unparseable half-generated JSON. The call looks
+    # successful, `finish_reason` is `"stop"`, and the chunk would be silently
+    # dropped from the corpus. Re-label as `"length"` so the adaptive retry
+    # layer bisects the chunk — same recovery as a true truncation.
+    if _response_is_hollow(raw_content, result) and result["finish_reason"] != "length":
+        print(
+            f"[graphify] {backend or 'backend'} returned a hollow response "
+            f"(content={'empty' if not (raw_content or '').strip() else 'no nodes/edges'}, "
+            f"output_tokens={result['output_tokens']}); "
+            "treating as truncation so adaptive retry can bisect the chunk.",
+            file=sys.stderr,
+        )
+        result["finish_reason"] = "length"
+    output_tokens = result["output_tokens"]
+    if output_tokens < 50 and backend == "ollama":
+        print(
+            "[graphify] warning: ollama returned very few tokens — likely causes: "
+            "(1) VRAM pressure: check `nvidia-smi` and reduce chunk size with "
+            "--token-budget (e.g. --token-budget 4096) or set "
+            "GRAPHIFY_OLLAMA_NUM_CTX to a smaller value; "
+            "(2) model too small for JSON instruction following — "
+            "try a larger model with --model (e.g. --model qwen2.5-coder:14b).",
+            file=sys.stderr,
+        )
+    return result
+def _call_claude(api_key: str, model: str, user_message: str, max_tokens: int = 8192, *, deep_mode: bool = False, images: list[_ImageRef] | None = None) -> dict:
+    """Call Anthropic Claude directly (not via OpenAI compat layer)."""
+    try:
+        import anthropic
+    except ImportError as exc:
+        raise ImportError(_backend_pkg_hint("anthropic", "anthropic")) from exc
+    client = anthropic.Anthropic(api_key=api_key, timeout=_resolve_api_timeout())
+    resp = client.messages.create(
+        model=model,
+        max_tokens=max_tokens,
+        system=_extraction_system(deep=deep_mode),
+        messages=[{"role": "user", "content": _anthropic_content(user_message, images or [])}],
+    )
+    raw_content = resp.content[0].text if resp.content else None
+    result = _parse_llm_json(raw_content or "{}")
+    result["input_tokens"] = resp.usage.input_tokens if resp.usage else 0
+    result["output_tokens"] = resp.usage.output_tokens if resp.usage else 0
+    result["model"] = model
+    # Normalise Anthropic's `stop_reason` to the OpenAI-compat `finish_reason`
+    # vocabulary so the adaptive-retry layer doesn't have to know which
+    # backend produced the result.
+    result["finish_reason"] = "length" if resp.stop_reason == "max_tokens" else "stop"
+    if _response_is_hollow(raw_content, result) and result["finish_reason"] != "length":
+        print(
+            "[graphify] claude returned a hollow response; treating as "
+            "truncation so adaptive retry can bisect the chunk.",
+            file=sys.stderr,
+        )
+        result["finish_reason"] = "length"
+    return result
+def _call_claude_cli(user_message: str, max_tokens: int = 8192, *, deep_mode: bool = False, images: list[_ImageRef] | None = None) -> dict:
+    """Call Claude via the locally-installed Claude Code CLI (`claude -p`).
+    Routes through the user's Claude Code subscription auth instead of a separate
+    ANTHROPIC_API_KEY. Useful for Pro/Max subscribers who don't want to provision
+    a pay-as-you-go API key just to run graphify's semantic pass.
+    Images are passed by absolute path rather than inline base64: the prompt asks
+    the model to open each one with its Read tool, and each containing directory
+    is allowlisted with `--add-dir` so the read is permitted.
+    """
+    import platform
+    import shutil
+    import subprocess
+    # On Windows, npm installs `claude` as both `claude.ps1` and `claude.cmd`
+    # alongside each other. When PATHEXT lists `.PS1` before `.CMD`,
+    # `shutil.which("claude")` returns `claude.ps1`, which `CreateProcess`
+    # cannot execute directly — it raises `[WinError 2] The system cannot
+    # find the file specified`. `claude.cmd` IS executable by CreateProcess,
+    # so prefer it explicitly on Windows. See issue #1072.
+    claude_cmd = "claude"
+    if platform.system() == "Windows":
+        cmd_path = shutil.which("claude.cmd")
+        if cmd_path:
+            claude_cmd = cmd_path
+        elif shutil.which("claude") is None:
+            raise RuntimeError(
+                "Claude Code CLI not found on $PATH. Install from "
+                "https://claude.ai/code and run `claude` once to authenticate."
+            )
+    elif shutil.which("claude") is None:
+        raise RuntimeError(
+            "Claude Code CLI not found on $PATH. Install from "
+            "https://claude.ai/code and run `claude` once to authenticate."
+        )
+    # Use --system-prompt (replaces) instead of --append-system-prompt (adds
+    # to Claude Code's default coding-agent prompt). The default prompt
+    # pushes the model towards markdown + prose explanations, which conflict
+    # with the "raw JSON only" extraction instruction and cause ~30-50% of
+    # responses to come back wrapped in ```json fences or prefixed with a
+    # preamble — both of which fail the strict json.loads in _parse_llm_json.
+    # Replacing the default prompt eliminates the conflict at the source.
+    # Side benefit: cache-creation tokens per call drop ~19% in practice.
+    # When images are present, append the Read-the-paths instruction and
+    # allowlist each containing directory so the CLI's Read tool can open them.
+    add_dir_args: list[str] = []
+    if images:
+        user_message = _with_image_notes(user_message, images, with_paths=True)
+        seen_dirs: set[str] = set()
+        for r in images:
+            d = str(r.path.parent)
+            if d not in seen_dirs:
+                seen_dirs.add(d)
+                add_dir_args.extend(["--add-dir", d])
+    cli_args = [
+        claude_cmd, "-p",
+        "--output-format", "json",
+        "--no-session-persistence",
+        *add_dir_args,
+        "--system-prompt", _extraction_system(deep=deep_mode),
+    ]
+    # claude-cli defaults to Opus, which is overkill for the structured-JSON
+    # extraction graphify performs. GRAPHIFY_CLAUDE_CLI_MODEL=haiku (or
+    # sonnet, or a full model ID like claude-haiku-4-5-20251001) lets users
+    # opt into a cheaper / faster model. Default behaviour unchanged when
+    # the env var is unset.
+    cli_model = os.environ.get("GRAPHIFY_CLAUDE_CLI_MODEL", "").strip()
+    if cli_model:
+        cli_args.extend(["--model", cli_model])
+    proc = subprocess.run(
+        cli_args,
+        input=user_message,
+        capture_output=True,
+        text=True,
+        encoding="utf-8",  # Force UTF-8 — prevents UnicodeEncodeError on Windows cp1252
+        timeout=_resolve_api_timeout(),
+        check=False,
+    )
+    if proc.returncode != 0:
+        raise RuntimeError(
+            f"claude -p exited {proc.returncode}: {proc.stderr.strip()[:500]}"
+        )
+    try:
+        envelope = json.loads(proc.stdout)
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(
+            f"claude -p produced unparseable JSON envelope: {exc}; "
+            f"first 500 chars of stdout: {proc.stdout[:500]!r}"
+        ) from exc
+    raw_content = envelope.get("result", "")
+    result = _parse_llm_json(raw_content or "{}")
+    usage = envelope.get("usage") or {}
+    result["input_tokens"] = (
+        int(usage.get("input_tokens", 0) or 0)
+        + int(usage.get("cache_read_input_tokens", 0) or 0)
+        + int(usage.get("cache_creation_input_tokens", 0) or 0)
+    )
+    result["output_tokens"] = int(usage.get("output_tokens", 0) or 0)
+    model_usage = envelope.get("modelUsage") or {}
+    result["model"] = next(iter(model_usage), "claude-code-plan")
+    stop_reason = envelope.get("stop_reason", "")
+    result["finish_reason"] = "length" if stop_reason == "max_tokens" else "stop"
+    if _response_is_hollow(raw_content, result) and result["finish_reason"] != "length":
+        print(
+            "[graphify] claude-cli returned a hollow response; treating as "
+            "truncation so adaptive retry can bisect the chunk.",
+            file=sys.stderr,
+        )
+        result["finish_reason"] = "length"
+    return result
+def _azure_client(api_key: str, endpoint: str):
+    """Construct an AzureOpenAI client with env-driven api_version and timeout."""
+    try:
+        from openai import AzureOpenAI
+    except ImportError as exc:
+        raise ImportError(
+            "Azure OpenAI requires the openai package. Run: pip install openai"
+        ) from exc
+    api_version = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-12-01-preview").strip()
+    timeout_raw = os.environ.get("GRAPHIFY_API_TIMEOUT", "").strip()
+    timeout_s: float = 600.0
+    if timeout_raw:
+        try:
+            v = float(timeout_raw)
+            if v > 0:
+                timeout_s = v
+        except ValueError:
+            pass
+    return AzureOpenAI(api_key=api_key, azure_endpoint=endpoint, api_version=api_version, timeout=timeout_s)
+def _call_azure(
+    api_key: str,
+    endpoint: str,
+    model: str,
+    user_message: str,
+    temperature: float | None = 0,
+    max_tokens: int = 8192,
+    *,
+    deep_mode: bool = False,
+) -> dict:
+    """Call Azure OpenAI Service via the AzureOpenAI SDK client."""
+    client = _azure_client(api_key, endpoint)
+    kwargs: dict = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": _extraction_system(deep=deep_mode)},
+            {"role": "user", "content": user_message},
+        ],
+        "max_completion_tokens": max_tokens,
+    }
+    if temperature is not None:
+        kwargs["temperature"] = temperature
+    resp = client.chat.completions.create(**kwargs)
+    if not resp.choices or resp.choices[0].message is None:
+        raise ValueError("Azure OpenAI returned empty or filtered response")
+    raw_content = resp.choices[0].message.content
+    result = _parse_llm_json(raw_content or "{}")
+    result["input_tokens"] = resp.usage.prompt_tokens if resp.usage else 0
+    result["output_tokens"] = resp.usage.completion_tokens if resp.usage else 0
+    result["model"] = model
+    result["finish_reason"] = resp.choices[0].finish_reason
+    if _response_is_hollow(raw_content, result) and result["finish_reason"] != "length":
+        print(
+            "[graphify] azure returned a hollow response; treating as "
+            "truncation so adaptive retry can bisect the chunk.",
+            file=sys.stderr,
+        )
+        result["finish_reason"] = "length"
+    return result
+def _call_bedrock(model: str, user_message: str, max_tokens: int = 8192, *, deep_mode: bool = False, images: list[_ImageRef] | None = None) -> dict:
+    """Call AWS Bedrock via boto3 Converse API using the standard AWS credential chain."""
+    try:
+        import boto3
+        import botocore.exceptions
+    except ImportError as exc:
+        raise ImportError(
+            "AWS Bedrock extraction requires boto3. Run: pip install graphifyy[bedrock]"
+        ) from exc
+    region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") or "us-east-1"
+    profile = os.environ.get("AWS_PROFILE")
+    session = boto3.Session(profile_name=profile, region_name=region)
+    client = session.client("bedrock-runtime")
+    try:
+        resp = client.converse(
+            modelId=model,
+            system=[{"text": _extraction_system(deep=deep_mode)}],
+            messages=[{"role": "user", "content": _bedrock_content(user_message, images or [])}],
+            inferenceConfig={"maxTokens": max_tokens, "temperature": 0},
+        )
+    except botocore.exceptions.ClientError as exc:
+        code = exc.response["Error"]["Code"]
+        msg = exc.response["Error"]["Message"]
+        raise RuntimeError(f"Bedrock API error ({code}): {msg}") from exc
+    text = resp.get("output", {}).get("message", {}).get("content", [{}])[0].get("text", "{}")
+    result = _parse_llm_json(text)
+    usage = resp.get("usage", {})
+    result["input_tokens"] = usage.get("inputTokens", 0)
+    result["output_tokens"] = usage.get("outputTokens", 0)
+    result["model"] = model
+    result["finish_reason"] = "length" if resp.get("stopReason") == "max_tokens" else "stop"
+    if _response_is_hollow(text, result) and result["finish_reason"] != "length":
+        print(
+            "[graphify] bedrock returned a hollow response; treating as "
+            "truncation so adaptive retry can bisect the chunk.",
+            file=sys.stderr,
+        )
+        result["finish_reason"] = "length"
+    return result
+def extract_files_direct(
+    files: list[Path],
+    backend: str | None = None,
+    api_key: str | None = None,
+    model: str | None = None,
+    root: Path = Path("."),
+    *,
+    deep_mode: bool = False,
+) -> dict:
+    """Extract semantic nodes/edges from a list of files using the given backend.
+    Returns dict with nodes, edges, hyperedges, input_tokens, output_tokens.
+    Raises ValueError for unknown backends or when no API key is configured.
+    Raises ImportError if SDK missing.
+    """
+    if backend is None:
+        backend = detect_backend()
+        if backend is None:
+            raise ValueError(
+                "No LLM backend configured. Set one of: GEMINI_API_KEY, ANTHROPIC_API_KEY, "
+                "OPENAI_API_KEY, DEEPSEEK_API_KEY, MOONSHOT_API_KEY, "
+                "AZURE_OPENAI_API_KEY+AZURE_OPENAI_ENDPOINT, OLLAMA_BASE_URL, "
+                "or AWS credentials. Pass backend= explicitly to select a provider."
+            )
+    if backend not in BACKENDS:
+        raise ValueError(f"Unknown backend {backend!r}. Available: {sorted(BACKENDS)}")
+    cfg = BACKENDS[backend]
+    key = api_key or _get_backend_api_key(backend)
+    if not key and backend == "ollama":
+        # Ollama ignores auth but the OpenAI client library requires a non-empty
+        # string. Use a placeholder and surface a visible warning so this never
+        # silently routes traffic without the user realising — see F-029.
+        ollama_url = os.environ.get("OLLAMA_BASE_URL", cfg.get("base_url", ""))
+        _validate_ollama_base_url(ollama_url)
+        print(
+            "[graphify] WARNING: ollama backend selected with no OLLAMA_API_KEY set; "
+            f"sending corpus to {ollama_url}. Set OLLAMA_API_KEY (any non-empty value) "
+            "to suppress this warning.",
+            file=sys.stderr,
+        )
+        key = "ollama"
+    if not key and backend not in ("bedrock", "claude-cli"):
+        raise ValueError(
+            f"No API key for backend '{backend}'. "
+            f"Set {_format_backend_env_keys(backend)} or pass api_key=."
+        )
+    mdl = model or _default_model_for_backend(backend)
+    # Separate raster images from text-like files. Text goes through _read_files
+    # as before; images become structured refs the backend renders as pixels
+    # (vision backends) or as a text reference node (everything else).
+    text_files, image_files = _partition_semantic_files(files)
+    user_msg = _read_files(text_files, root)
+    vision = _backend_supports_vision(backend)
+    # Only base64 (inline) vision backends need the bytes loaded + size-capped;
+    # path-based backends (claude-cli) and non-vision backends do not.
+    read_bytes = vision and backend not in _PATH_IMAGE_BACKENDS
+    image_refs = _build_image_refs(image_files, root, read_bytes=read_bytes) if image_files else []
+    if image_refs and not vision:
+        image_refs = _strip_pixels(image_refs)
+    max_out = _resolve_max_tokens(cfg.get("max_tokens", 8192))
+    if backend == "claude":
+        return _call_claude(key, mdl, user_msg, max_tokens=max_out, deep_mode=deep_mode, images=image_refs)
+    if backend == "claude-cli":
+        return _call_claude_cli(user_msg, max_tokens=max_out, deep_mode=deep_mode, images=image_refs)
+    if backend == "bedrock":
+        return _call_bedrock(mdl, user_msg, max_tokens=max_out, deep_mode=deep_mode, images=image_refs)
+    if backend == "azure":
+        endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "").strip()
+        if not endpoint:
+            raise ValueError(
+                "Azure OpenAI backend requires AZURE_OPENAI_ENDPOINT to be set "
+                "(e.g. https://my-resource.openai.azure.com/)."
+            )
+        return _call_azure(
+            key,
+            endpoint,
+            mdl,
+            user_msg,
+            temperature=cfg.get("temperature", 0),
+            max_tokens=max_out,
+            deep_mode=deep_mode,
+        )
+    return _call_openai_compat(
+        cfg["base_url"],
+        key,
+        mdl,
+        user_msg,
+        temperature=cfg.get("temperature", 0),
+        reasoning_effort=cfg.get("reasoning_effort"),
+        max_completion_tokens=_resolve_max_tokens(cfg.get("max_completion_tokens", 8192)),
+        backend=backend,
+        deep_mode=deep_mode,
+        images=image_refs,
+    )
+def _estimate_file_tokens(path: Path) -> int:
+    """Estimate the prompt-token cost of a single file under `_read_files` rules.
+    Uses tiktoken (`cl100k_base`) when available for accurate counts. Falls back
+    to the chars/4 heuristic if tiktoken is not installed. Both paths cap at
+    `_FILE_CHAR_CAP` to match `_read_files`'s truncation, plus a constant for
+    the `=== rel ===` separator. Returns 0 for unreadable paths so they don't
+    blow up packing.
+    """
+    # Raster images are not read as text; a vision model bills them at a roughly
+    # fixed token cost, so estimate by image count rather than (binary) byte size.
+    if _is_vision_image(path):
+        return _IMAGE_TOKEN_ESTIMATE
+    if _TOKENIZER is None:
+        try:
+            size = path.stat().st_size
+        except OSError:
+            return 0
+        chars = min(size, _FILE_CHAR_CAP) + _PER_FILE_OVERHEAD_CHARS
+        return chars // _CHARS_PER_TOKEN
+    try:
+        content = path.read_text(encoding="utf-8", errors="replace")[:_FILE_CHAR_CAP]
+    except OSError:
+        return 0
+    return len(_TOKENIZER.encode(content)) + (_PER_FILE_OVERHEAD_CHARS // _CHARS_PER_TOKEN)
+def _pack_chunks_by_tokens(
+    files: list[Path],
+    token_budget: int,
+) -> list[list[Path]]:
+    """Greedily pack files into chunks that fit a token budget.
+    Files are first grouped by parent directory so related artifacts share a
+    chunk (cross-file edges are more likely to be extracted within a chunk
+    than across chunks). Within each directory, files are added one at a
+    time; a chunk is closed when adding the next file would exceed the
+    budget. A single file larger than the budget gets its own chunk and the
+    caller is expected to handle the API error if it actually overflows the
+    model's context window — packing can't shrink one big file.
+    """
+    if token_budget <= 0:
+        raise ValueError(f"token_budget must be positive, got {token_budget}")
+    by_dir: dict[Path, list[Path]] = {}
+    for f in files:
+        by_dir.setdefault(f.parent, []).append(f)
+    chunks: list[list[Path]] = []
+    current: list[Path] = []
+    current_tokens = 0
+    current_images = 0
+    for directory in sorted(by_dir):
+        for path in by_dir[directory]:
+            cost = _estimate_file_tokens(path)
+            is_image = _is_vision_image(path)
+            over_budget = current_tokens + cost > token_budget
+            over_images = is_image and current_images >= _MAX_IMAGES_PER_CHUNK
+            if current and (over_budget or over_images):
+                chunks.append(current)
+                current = []
+                current_tokens = 0
+                current_images = 0
+            current.append(path)
+            current_tokens += cost
+            current_images += is_image
+    if current:
+        chunks.append(current)
+    return chunks
+_CONTEXT_EXCEEDED_MARKERS = (
+    "context size",
+    "context length",
+    "context_length",
+    "context window",
+    "n_keep",
+    "exceeds the available",
+    "n_ctx",
+    "maximum context",
+    "too many tokens",
+    "prompt is too long",
+    "context_length_exceeded",
+)
+def _looks_like_context_exceeded(exc: BaseException) -> bool:
+    """Heuristically classify an exception as a context-window overflow.
+    Different backends raise different exception types and messages for the
+    same underlying problem ("the prompt + max_completion_tokens did not fit
+    in the model's context window"). We match on substrings of the stringified
+    exception so the retry layer can recover without depending on a specific
+    SDK class. False positives are cheap (we'll re-extract on halves and
+    likely recover); false negatives are expensive (chunk fails entirely).
+    """
+    msg = str(exc).lower()
+    return any(marker in msg for marker in _CONTEXT_EXCEEDED_MARKERS)
+def _extract_with_adaptive_retry(
+    chunk: list[Path],
+    backend: str,
+    api_key: str | None,
+    model: str | None,
+    root: Path,
+    max_depth: int,
+    _depth: int = 0,
+    *,
+    deep_mode: bool = False,
+) -> dict:
+    """Extract a chunk; if the response is truncated (`finish_reason="length"`)
+    or the API rejects the prompt as too large for the model's context window,
+    split the chunk in half and recurse.
+    Three signals drive the retry, all funnelled through the same code:
+    - `finish_reason == "length"` — the model accepted the input but ran out of
+      `max_completion_tokens` mid-output. The truncated JSON is unparseable, so
+      we discard it and re-extract on smaller inputs that produce shorter
+      outputs.
+    - context-window-exceeded API errors — the model rejected the input
+      outright (HTTP 400 from LM Studio, llama.cpp, vLLM, OpenAI, etc.).
+      Without a retry the whole chunk would fail with no output. Splitting in
+      half is the same recovery as for the `length` case and works for the
+      same reason.
+    - hollow successful responses — the model returned HTTP 200 with empty,
+      null, or unparseable content (typical of a local Ollama under load).
+      `_call_openai_compat` re-labels these as `finish_reason="length"` so they
+      take the same recovery path; without that the chunk would be silently
+      dropped from the corpus.
+    Recursion is capped at `max_depth` to bound worst-case cost. A chunk of N
+    files can split into up to 2**max_depth pieces — at depth=3 that's 8x. If
+    still failing at the cap, we surface the (likely empty) result with a
+    warning rather than infinite-loop.
+    A single-file chunk that overflows is unrecoverable here — we can't make
+    one file smaller than itself, so we return what we got and warn.
+    """
+    try:
+        result = extract_files_direct(
+            chunk, backend=backend, api_key=api_key, model=model, root=root, deep_mode=deep_mode
+        )
+    except Exception as exc:  # noqa: BLE001 — re-raise unless it's a known context overflow
+        if not _looks_like_context_exceeded(exc):
+            raise
+        if len(chunk) <= 1:
+            print(
+                f"[graphify] single-file chunk {chunk[0]} exceeds model context "
+                f"and cannot be split further: {exc}",
+                file=sys.stderr,
+            )
+            return {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0, "model": model, "finish_reason": "stop"}
+        if _depth >= max_depth:
+            print(
+                f"[graphify] chunk of {len(chunk)} still overflows context at "
+                f"recursion depth {_depth} (max {max_depth}) — dropping",
+                file=sys.stderr,
+            )
+            return {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0, "model": model, "finish_reason": "stop"}
+        print(
+            f"[graphify] chunk of {len(chunk)} exceeded context at depth "
+            f"{_depth} ({type(exc).__name__}); splitting in half and retrying",
+            file=sys.stderr,
+        )
+        mid = len(chunk) // 2
+        left = _extract_with_adaptive_retry(
+            chunk[:mid], backend, api_key, model, root, max_depth, _depth + 1, deep_mode=deep_mode
+        )
+        right = _extract_with_adaptive_retry(
+            chunk[mid:], backend, api_key, model, root, max_depth, _depth + 1, deep_mode=deep_mode
+        )
+        return {
+            "nodes": left.get("nodes", []) + right.get("nodes", []),
+            "edges": left.get("edges", []) + right.get("edges", []),
+            "hyperedges": left.get("hyperedges", []) + right.get("hyperedges", []),
+            "input_tokens": left.get("input_tokens", 0) + right.get("input_tokens", 0),
+            "output_tokens": left.get("output_tokens", 0) + right.get("output_tokens", 0),
+            "model": model,
+            "finish_reason": "stop",
+        }
+    if result.get("finish_reason") != "length":
+        return result
+    if len(chunk) <= 1:
+        print(
+            f"[graphify] single-file chunk {chunk[0]} truncated at "
+            f"max_completion_tokens — partial result kept",
+            file=sys.stderr,
+        )
+        return result
+    if _depth >= max_depth:
+        print(
+            f"[graphify] chunk of {len(chunk)} still truncated at recursion "
+            f"depth {_depth} (max {max_depth}) — partial result kept",
+            file=sys.stderr,
+        )
+        return result
+    print(
+        f"[graphify] chunk of {len(chunk)} truncated at depth {_depth}, "
+        f"splitting into halves of {len(chunk) // 2} and "
+        f"{len(chunk) - len(chunk) // 2}",
+        file=sys.stderr,
+    )
+    mid = len(chunk) // 2
+    left = _extract_with_adaptive_retry(
+        chunk[:mid], backend, api_key, model, root, max_depth, _depth + 1, deep_mode=deep_mode
+    )
+    right = _extract_with_adaptive_retry(
+        chunk[mid:], backend, api_key, model, root, max_depth, _depth + 1, deep_mode=deep_mode
+    )
+    return {
+        "nodes": left.get("nodes", []) + right.get("nodes", []),
+        "edges": left.get("edges", []) + right.get("edges", []),
+        "hyperedges": left.get("hyperedges", []) + right.get("hyperedges", []),
+        "input_tokens": left.get("input_tokens", 0) + right.get("input_tokens", 0),
+        "output_tokens": left.get("output_tokens", 0) + right.get("output_tokens", 0),
+        "model": result.get("model"),
+        # Both halves either succeeded or have already surfaced their own
+        # truncation warning; the merged result is no longer truncated as a
+        # logical unit.
+        "finish_reason": "stop",
+    }
+def extract_corpus_parallel(
+    files: list[Path],
+    backend: str = "kimi",
+    api_key: str | None = None,
+    model: str | None = None,
+    root: Path = Path("."),
+    chunk_size: int = 20,
+    on_chunk_done: Callable | None = None,
+    token_budget: int | None = 60_000,
+    max_concurrency: int = 4,
+    max_retry_depth: int = 3,
+    deep_mode: bool = False,
+) -> dict:
+    """Extract a corpus in chunks, merging results.
+    Chunking strategy:
+        - If `token_budget` is set (default 60_000), files are packed to fit
+          the budget and grouped by parent directory. This avoids the worst
+          case where 20 randomly-grouped files exceed a model's context
+          window in a single request.
+        - If `token_budget=None`, falls back to the legacy fixed-count
+          `chunk_size` packing for backwards compatibility.
+    Concurrency:
+        - Chunks run in parallel via a thread pool capped at `max_concurrency`
+          (default 4 — conservative to stay under provider rate limits).
+        - Set `max_concurrency=1` to force sequential execution.
+    Adaptive retry on truncation:
+        - When the LLM returns `finish_reason="length"` (output truncated at
+          `max_completion_tokens`), the chunk is split in half and each half
+          re-extracted recursively, up to `max_retry_depth` levels deep
+          (default 3 → max 8x expansion of one chunk).
+        - This is signal-driven: chunks too dense to fit in one response
+          self-heal by splitting until they do, while well-sized chunks pay
+          no extra cost. Set `max_retry_depth=0` to disable retries.
+    `on_chunk_done(idx, total, chunk_result)` fires once per chunk as it
+    completes (in completion order, not submission order). `idx` is the
+    chunk's submission index so callers can correlate progress. The
+    callback fires once per top-level chunk; recursive splits are merged
+    transparently before the callback is invoked.
+    Returns merged dict with nodes, edges, hyperedges, input_tokens,
+    output_tokens. Failed chunks are logged to stderr and skipped — one bad
+    chunk does not abort the run.
+    """
+    if token_budget is not None:
+        chunks = _pack_chunks_by_tokens(files, token_budget=token_budget)
+    else:
+        chunks = [files[i:i + chunk_size] for i in range(0, len(files), chunk_size)]
+    merged: dict = {
+        "nodes": [], "edges": [], "hyperedges": [],
+        "input_tokens": 0, "output_tokens": 0,
+        "failed_chunks": 0,  # count of chunks that raised — loud failure on chunk errors
+    }
+    total = len(chunks)
+    def _run_one(idx: int, chunk: list[Path]) -> tuple[int, dict | None, Exception | None]:
+        t0 = time.time()
+        try:
+            result = _extract_with_adaptive_retry(
+                chunk,
+                backend=backend,
+                api_key=api_key,
+                model=model,
+                root=root,
+                max_depth=max_retry_depth,
+                deep_mode=deep_mode,
+            )
+            result["elapsed_seconds"] = round(time.time() - t0, 2)
+            return idx, result, None
+        except Exception as exc:  # noqa: BLE001 — caller-facing surface, log + continue
+            return idx, None, exc
+    # Ollama serves one request at a time per loaded model on a single GPU.
+    # Four concurrent 60k-token requests cause VRAM pressure and hollow
+    # responses after 3-4 chunks (#798). Force serial unless the user opts in.
+    if backend == "ollama" and os.environ.get("GRAPHIFY_OLLAMA_PARALLEL", "").strip() != "1":
+        max_concurrency = 1
+    # claude-cli shells out to a Claude Code session; parallel subprocesses conflict
+    # over session state. Force serial unless the user explicitly opts in.
+    if backend == "claude-cli" and os.environ.get("GRAPHIFY_CLAUDE_CLI_PARALLEL", "").strip() != "1":
+        max_concurrency = 1
+    workers = max(1, min(max_concurrency, total))
+    if workers == 1:
+        # Avoid thread pool overhead for single-worker runs (and keep
+        # callback ordering identical to the pre-refactor sequential path).
+        for idx, chunk in enumerate(chunks):
+            _, result, exc = _run_one(idx, chunk)
+            if exc is not None:
+                print(f"[graphify] chunk {idx + 1}/{total} failed: {exc}", file=sys.stderr)
+                merged["failed_chunks"] += 1
+                continue
+            assert result is not None
+            _merge_into(merged, result)
+            if callable(on_chunk_done):
+                on_chunk_done(idx, total, result)
+    else:
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = [pool.submit(_run_one, idx, chunk) for idx, chunk in enumerate(chunks)]
+            for future in as_completed(futures):
+                idx, result, exc = future.result()
+                if exc is not None:
+                    print(
+                        f"[graphify] chunk {idx + 1}/{total} failed: {exc}",
+                        file=sys.stderr,
+                    )
+                    merged["failed_chunks"] += 1
+                    continue
+                assert result is not None
+                _merge_into(merged, result)
+                if callable(on_chunk_done):
+                    on_chunk_done(idx, total, result)
+    # Loud failure summary — surface chunk failures at end so they're never
+    # buried mid-log. Exit 0 preserved for caller compatibility; the
+    # summary block makes the problem visible.
+    if merged["failed_chunks"] > 0:
+        print(
+            f"[graphify] WARNING: {merged['failed_chunks']}/{total} semantic chunk(s) failed"
+            " — see errors above. Partial results returned.",
+            file=sys.stderr,
+        )
+    return merged
+def _merge_into(merged: dict, result: dict) -> None:
+    """Append a chunk result into the running merged accumulator."""
+    merged["nodes"].extend(result.get("nodes", []))
+    merged["edges"].extend(result.get("edges", []))
+    merged["hyperedges"].extend(result.get("hyperedges", []))
+    merged["input_tokens"] += result.get("input_tokens", 0)
+    merged["output_tokens"] += result.get("output_tokens", 0)
+def _call_llm(prompt: str, *, backend: str, max_tokens: int = 200) -> str:
+    """Send a plain-text prompt to `backend` and return the model's text reply.
+    Used by lightweight callers (e.g. `graphify.dedup` LLM tiebreaker) that
+    don't need the full extraction prompt or JSON-shaped output. Mirrors the
+    backend dispatch logic of `extract_files_direct` but skips the
+    `_EXTRACTION_SYSTEM` prompt and JSON parsing.
+    Previously `graphify.dedup` imported a `_call_llm` symbol that did not
+    exist in this module, so the LLM tiebreaker silently no-op'd on
+    `ImportError` (F-038). Adding the function here re-enables it.
+    """
+    if backend not in BACKENDS:
+        raise ValueError(f"Unknown backend {backend!r}")
+    cfg = BACKENDS[backend]
+    key = _get_backend_api_key(backend)
+    if not key and backend == "ollama":
+        ollama_url = os.environ.get("OLLAMA_BASE_URL", cfg.get("base_url", ""))
+        _validate_ollama_base_url(ollama_url)
+        key = "ollama"
+    if not key and backend not in ("bedrock", "claude-cli"):
+        raise ValueError(
+            f"No API key for backend '{backend}'. Set {_format_backend_env_keys(backend)}."
+        )
+    mdl = _default_model_for_backend(backend)
+    if backend == "claude":
+        try:
+            import anthropic
+        except ImportError as exc:
+            raise ImportError(_backend_pkg_hint("anthropic", "anthropic")) from exc
+        client = anthropic.Anthropic(api_key=key)
+        resp = client.messages.create(
+            model=mdl,
+            max_tokens=max_tokens,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return resp.content[0].text if resp.content else ""
+    if backend == "claude-cli":
+        import shutil, subprocess
+        if shutil.which("claude") is None:
+            raise RuntimeError("Claude Code CLI not found on $PATH")
+        proc = subprocess.run(
+            ["claude", "-p", "--output-format", "json", "--no-session-persistence"],
+            input=prompt,
+            capture_output=True,
+            text=True,
+            encoding="utf-8",  # Force UTF-8 — prevents UnicodeEncodeError on Windows cp1252
+            timeout=_resolve_api_timeout(),
+            check=False,
+        )
+        if proc.returncode != 0:
+            raise RuntimeError(f"claude -p exited {proc.returncode}: {proc.stderr.strip()[:500]}")
+        try:
+            envelope = json.loads(proc.stdout)
+        except json.JSONDecodeError as exc:
+            raise RuntimeError(f"claude -p produced unparseable JSON envelope: {exc}") from exc
+        return envelope.get("result", "")
+    if backend == "bedrock":
+        try:
+            import boto3
+        except ImportError as exc:
+            raise ImportError(_backend_pkg_hint("boto3", "bedrock")) from exc
+        region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") or "us-east-1"
+        profile = os.environ.get("AWS_PROFILE")
+        session = boto3.Session(profile_name=profile, region_name=region)
+        client = session.client("bedrock-runtime")
+        resp = client.converse(
+            modelId=mdl,
+            messages=[{"role": "user", "content": [{"text": prompt}]}],
+            inferenceConfig={"maxTokens": max_tokens, "temperature": 0},
+        )
+        return resp.get("output", {}).get("message", {}).get("content", [{}])[0].get("text", "")
+    if backend == "azure":
+        endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "").strip()
+        if not endpoint:
+            raise ValueError(
+                "Azure OpenAI backend requires AZURE_OPENAI_ENDPOINT to be set."
+            )
+        azure_client = _azure_client(key, endpoint)
+        resp = azure_client.chat.completions.create(
+            model=mdl,
+            messages=[{"role": "user", "content": prompt}],
+            max_completion_tokens=max_tokens,
+            temperature=cfg.get("temperature", 0),
+        )
+        if not resp.choices or resp.choices[0].message is None:
+            raise ValueError("Azure OpenAI returned empty or filtered response")
+        return resp.choices[0].message.content or ""
+    # OpenAI-compatible (kimi, openai, gemini, ollama)
+    try:
+        from openai import OpenAI
+    except ImportError as exc:
+        raise ImportError(_backend_pkg_hint("openai", "openai")) from exc
+    client = OpenAI(api_key=key, base_url=cfg["base_url"])
+    kwargs: dict = {
+        "model": mdl,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_completion_tokens": max_tokens,
+    }
+    temperature = cfg.get("temperature", 0)
+    if temperature is not None:
+        kwargs["temperature"] = temperature
+    if cfg.get("reasoning_effort"):
+        kwargs["reasoning_effort"] = cfg["reasoning_effort"]
+    if "moonshot" in cfg["base_url"]:
+        kwargs["extra_body"] = {"thinking": {"type": "disabled"}}
+    resp = client.chat.completions.create(**kwargs)
+    if not resp.choices or resp.choices[0].message is None:
+        raise ValueError("LLM returned empty or filtered response")
+    return resp.choices[0].message.content or ""
+def estimate_cost(backend: str, input_tokens: int, output_tokens: int) -> float:
+    """Estimate USD cost for a given token count using published pricing."""
+    if backend not in BACKENDS:
+        return 0.0
+    p = BACKENDS[backend]["pricing"]
+    return (input_tokens * p["input"] + output_tokens * p["output"]) / 1_000_000
+def _ollama_host_is_link_local_or_metadata(host: str) -> bool:
+    """True if *host* is, or resolves to, a link-local / cloud-metadata address.
+    Resolves the name so an alias pointing at 169.254.169.254 is caught too, not
+    just a literal IP. General private/LAN addresses are deliberately NOT treated
+    as metadata: people do run Ollama on trusted LAN boxes, so those only warn.
+    """
+    import ipaddress
+    import socket
+    if host in ("metadata.google.internal", "metadata.google.com", "0.0.0.0", "::", "[::]"):  # nosec B104 - blocklist, not a bind
+        return True
+    if host.startswith("169.254."):  # link-local literal, includes the metadata IP
+        return True
+    try:
+        infos = socket.getaddrinfo(host, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
+    except (socket.gaierror, UnicodeError, OSError):
+        return False
+    for info in infos:
+        try:
+            ip = ipaddress.ip_address(info[4][0])
+        except ValueError:
+            continue
+        if ip.is_link_local:  # 169.254.0.0/16 and fe80::/10 (includes the metadata IP)
+            return True
+    return False
+def _validate_ollama_base_url(url: str, *, warn: bool = True) -> None:
+    """Warn if OLLAMA_BASE_URL looks unsafe; hard-block link-local/metadata (F3).
+    Sending an entire corpus to a non-loopback http:// endpoint silently leaks
+    proprietary code, but some users genuinely run Ollama on a LAN host they
+    trust, so a general non-loopback target only warns. A link-local or cloud
+    metadata address (169.254.x, metadata.google.*, or any host that resolves to
+    one) is never a legitimate Ollama host and is a classic SSRF target, so we
+    fail closed with a ValueError there regardless of *warn*. Pass warn=False for
+    an early gate that should hard-block but leave the user-facing warning to the
+    later in-flow call.
+    """
+    try:
+        from urllib.parse import urlparse
+        parsed = urlparse(url)
+    except Exception:
+        if warn:
+            print(
+                f"[graphify] WARNING: OLLAMA_BASE_URL={url!r} is not a parseable URL.",
+                file=sys.stderr,
+            )
+        return
+    if parsed.scheme not in ("http", "https"):
+        if warn:
+            print(
+                f"[graphify] WARNING: OLLAMA_BASE_URL has unexpected scheme {parsed.scheme!r}; "
+                "expected http or https.",
+                file=sys.stderr,
+            )
+        return
+    host = (parsed.hostname or "").lower()
+    if _ollama_host_is_link_local_or_metadata(host):
+        raise ValueError(
+            f"OLLAMA_BASE_URL points at a link-local/metadata address ({host!r}); refusing to "
+            "send the corpus there. Set it to a real Ollama host."
+        )
+    is_loopback = host in ("localhost", "127.0.0.1", "::1") or host.startswith("127.")
+    if warn and not is_loopback:
+        scheme_note = " (UNENCRYPTED)" if parsed.scheme == "http" else ""
+        print(
+            f"[graphify] WARNING: OLLAMA_BASE_URL points to non-loopback host {host!r}{scheme_note}. "
+            "Your full corpus will be sent to that endpoint. "
+            "Set OLLAMA_BASE_URL=http://localhost:11434/v1 to keep extraction local.",
+            file=sys.stderr,
+        )
+def detect_backend() -> str | None:
+    """Return the name of whichever backend has an API key set, or None.
+    Priority: gemini → kimi → claude → openai → deepseek → azure → bedrock → ollama (last, opt-in).
+    Ollama is intentionally checked LAST so a paid API key (Anthropic/OpenAI/etc.)
+    is never silently shadowed by an incidental OLLAMA_BASE_URL in the environment
+    — see security finding F-002/F-029. Setting OLLAMA_BASE_URL alongside a paid
+    key now keeps you on the paid backend; remove the paid key (or pass
+    --backend ollama explicitly) to route to the local model.
+    """
+    for backend in ("gemini", "kimi", "claude", "openai", "deepseek"):
+        if _get_backend_api_key(backend):
+            return backend
+    if _get_backend_api_key("azure") and os.environ.get("AZURE_OPENAI_ENDPOINT"):
+        return "azure"
+    if os.environ.get("AWS_PROFILE") or os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION"):
+        return "bedrock"
+    ollama_url = os.environ.get("OLLAMA_BASE_URL")
+    if ollama_url:
+        _validate_ollama_base_url(ollama_url)
+        return "ollama"
+    for name in BACKENDS:
+        if name not in ("gemini", "kimi", "claude", "openai", "deepseek", "azure", "bedrock", "ollama", "claude-cli"):
+            if _get_backend_api_key(name):
+                return name
+    return None
+# ── Community labeling ────────────────────────────────────────────────────────
+# When graphify runs inside an orchestrating agent (Claude Code / Gemini CLI),
+# the agent names communities itself per skill.md Step 5 - it reads the analysis
+# file and writes 2-5 word names with its own reasoning, no API call. When
+# graphify is run as a bare CLI (``graphify extract . --backend X``), there is no
+# agent to do that step, so community labels stay ``Community 0/1/2...``. These
+# helpers fill that gap: ask the configured backend to name communities in ONE
+# batched call and return a complete ``{cid: name}`` map (#1097).
+_LABEL_FENCE_RE = re.compile(r"^\s*```(?:json)?\s*|\s*```\s*$", re.IGNORECASE)
+_LABEL_MAX_COMMUNITIES = 200   # cap LLM-named communities; tail stays placeholder
+_LABEL_TOP_K = 12              # node labels sampled per community for the prompt
+_LABEL_MAXLEN = 60             # truncate individual labels to keep the prompt small
+def _placeholder_community_labels(communities) -> dict[int, str]:
+    return {int(cid): f"Community {cid}" for cid in communities}
+def _community_label_lines(G, communities, gods, max_communities, top_k):
+    """One prompt line per community (largest first), sampling up to ``top_k``
+    representative node labels (god nodes first). Returns (lines, labeled_cids);
+    skips communities with no resolvable nodes."""
+    # gods may be node-id strings or god_nodes() dicts ({"id": ..., "label": ...}).
+    god_set = {g["id"] if isinstance(g, dict) else g for g in (gods or [])}
+    ordered = sorted(communities.items(), key=lambda kv: -len(kv[1]))
+    lines: list[str] = []
+    labeled_cids: list[int] = []
+    for cid, members in ordered[:max_communities]:
+        ranked = [m for m in members if m in god_set] + [m for m in members if m not in god_set]
+        names: list[str] = []
+        seen: set[str] = set()
+        for nid in ranked:
+            label = str(G.nodes[nid].get("label", nid)) if nid in G.nodes else str(nid)
+            label = label.strip().strip("()")[:_LABEL_MAXLEN]
+            if label and label.lower() not in seen:
+                seen.add(label.lower())
+                names.append(label)
+            if len(names) >= top_k:
+                break
+        if names:
+            lines.append(f"Community {cid}: {', '.join(names)}")
+            labeled_cids.append(int(cid))
+    return lines, labeled_cids
+def _parse_label_response(text: str, labeled_cids: list[int]) -> dict[int, str]:
+    """Parse the backend's JSON ``{cid: name}`` reply. Raises on non-JSON or a
+    non-object payload; silently ignores cids it didn't name."""
+    cleaned = _LABEL_FENCE_RE.sub("", text.strip())
+    if not cleaned.startswith("{"):
+        start, end = cleaned.find("{"), cleaned.rfind("}")
+        if start != -1 and end > start:
+            cleaned = cleaned[start:end + 1]
+    data = json.loads(cleaned)
+    if not isinstance(data, dict):
+        raise ValueError("label response is not a JSON object")
+    out: dict[int, str] = {}
+    for cid in labeled_cids:
+        name = data.get(str(cid))
+        if name is None:
+            name = data.get(cid)
+        if isinstance(name, str) and name.strip():
+            out[cid] = name.strip()
+    return out
+def label_communities(
+    G,
+    communities,
+    *,
+    backend: str,
+    gods=None,
+    max_communities: int = _LABEL_MAX_COMMUNITIES,
+    top_k: int = _LABEL_TOP_K,
+) -> dict[int, str]:
+    """Return a complete ``{cid: name}`` map using ``backend`` for naming.
+    Placeholders (``Community N``) are used for any community the backend did not
+    name. Raises on backend/parse failure - callers that want graceful
+    degradation should use :func:`generate_community_labels`.
+    """
+    labels = _placeholder_community_labels(communities)
+    lines, labeled_cids = _community_label_lines(G, communities, gods, max_communities, top_k)
+    if not lines:
+        return labels
+    prompt = (
+        "You are naming clusters in a knowledge graph. For each community below, "
+        "return a concise 2-5 word plain-language name describing what it is about "
+        "(e.g. \"Order Management\", \"Payment Flow\", \"Auth Middleware\"). "
+        "Respond ONLY with a JSON object mapping the community id (as a string) to "
+        "its name - no prose, no markdown fences.\n\n" + "\n".join(lines)
+    )
+    max_tokens = min(40 + 16 * len(labeled_cids), 4096)
+    text = _call_llm(prompt, backend=backend, max_tokens=max_tokens)
+    labels.update(_parse_label_response(text, labeled_cids))
+    return labels
+def generate_community_labels(
+    G,
+    communities,
+    *,
+    backend: str | None = None,
+    gods=None,
+    quiet: bool = False,
+) -> tuple[dict[int, str], str]:
+    """CLI entry point: resolve a backend, name communities, and degrade to
+    ``Community N`` placeholders on any failure (no backend, API error, malformed
+    reply). Returns ``(labels, source)`` where source is ``"llm"`` or
+    ``"placeholder"``. Never raises."""
+    if backend is None:
+        try:
+            backend = detect_backend()
+        except Exception:
+            backend = None
+    if not backend:
+        if not quiet:
+            print(
+                "[graphify label] no LLM backend configured; keeping Community N "
+                "placeholders. Set an API key (e.g. GOOGLE_API_KEY) or pass --backend.",
+                file=sys.stderr,
+            )
+        return _placeholder_community_labels(communities), "placeholder"
+    try:
+        labels = label_communities(G, communities, backend=backend, gods=gods)
+        return labels, "llm"
+    except Exception as exc:
+        if not quiet:
+            print(
+                f"[graphify label] warning: community labeling failed ({exc}); "
+                "using Community N placeholders.",
+                file=sys.stderr,
+            )
+        return _placeholder_community_labels(communities), "placeholder"