PyPI - docsgraph - Versions diffs - 0.1.0a2__py3-none-any.whl - Mend

docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

cairn/__init__.py +5 -0
cairn/bench/__init__.py +37 -0
cairn/bench/baseline.py +236 -0
cairn/bench/dataset.py +109 -0
cairn/bench/judge.py +126 -0
cairn/bench/metrics.py +32 -0
cairn/bench/report.py +143 -0
cairn/bench/runner.py +219 -0
cairn/cli/__init__.py +5 -0
cairn/cli/app.py +776 -0
cairn/cli/config.py +105 -0
cairn/core/__init__.py +41 -0
cairn/core/errors.py +68 -0
cairn/core/types.py +147 -0
cairn/embed/__init__.py +17 -0
cairn/embed/base.py +31 -0
cairn/embed/doubao.py +167 -0
cairn/embed/fake.py +36 -0
cairn/embed/openai_compatible.py +155 -0
cairn/engine/__init__.py +18 -0
cairn/engine/indexer.py +298 -0
cairn/engine/manifest.py +83 -0
cairn/entity/__init__.py +21 -0
cairn/entity/base.py +52 -0
cairn/entity/fake.py +34 -0
cairn/entity/heuristic.py +148 -0
cairn/index/__init__.py +39 -0
cairn/index/entities.py +244 -0
cairn/index/summaries.py +269 -0
cairn/index/tree.py +274 -0
cairn/index/vectors.py +287 -0
cairn/index/xrefs.py +195 -0
cairn/ingest/__init__.py +36 -0
cairn/ingest/base.py +46 -0
cairn/ingest/markdown.py +244 -0
cairn/ingest/markitdown.py +145 -0
cairn/ingest/pdf.py +357 -0
cairn/inspection.py +971 -0
cairn/mcp/__init__.py +12 -0
cairn/mcp/schemas.py +547 -0
cairn/mcp/server.py +363 -0
cairn/providers.py +50 -0
cairn/py.typed +0 -0
cairn/repo.py +1486 -0
cairn/repo_search.py +1505 -0
cairn/summarize/__init__.py +18 -0
cairn/summarize/base.py +56 -0
cairn/summarize/cache.py +66 -0
cairn/summarize/fake.py +43 -0
cairn/summarize/openai_compatible.py +148 -0
cairn/summarize/prompts.py +73 -0
cairn/tools/__init__.py +31 -0
cairn/tools/base.py +126 -0
cairn/tools/find_mentions.py +93 -0
cairn/tools/get_related.py +140 -0
cairn/tools/get_section.py +130 -0
cairn/tools/outline.py +75 -0
cairn/tools/read_range.py +94 -0
cairn/tools/search_keyword.py +94 -0
cairn/tools/search_semantic.py +181 -0
cairn/xref/__init__.py +24 -0
cairn/xref/base.py +50 -0
cairn/xref/fake.py +40 -0
cairn/xref/heuristic.py +217 -0
docsgraph-0.1.0a2.dist-info/METADATA +688 -0
docsgraph-0.1.0a2.dist-info/RECORD +69 -0
docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0

cairn/summarize/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""Summarization layer — pluggable LLM-backed summarizers + cache.
+Used by the index layer (`cairn.index.summaries.SummaryBuilder`) at indexing
+time. Never invoked at query time. See ARCHITECTURE.md §2.2.
+"""
+from cairn.summarize.base import Summarizer, SummaryLevel
+from cairn.summarize.cache import SummaryCache
+from cairn.summarize.fake import FakeSummarizer
+from cairn.summarize.openai_compatible import OpenAICompatibleSummarizer
+# Names re-exported as the package's public API (the classes imported above),
+# listed in sorted order.
+__all__ = [
+    "FakeSummarizer",
+    "OpenAICompatibleSummarizer",
+    "Summarizer",
+    "SummaryCache",
+    "SummaryLevel",
+]

cairn/summarize/base.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Summarizer protocol and level enum.
+A `Summarizer` produces a single summary string for a section at a given
+granularity level. Pre-computed during indexing; never invoked at query time.
+"""
+from __future__ import annotations
+from enum import StrEnum
+from typing import Protocol, runtime_checkable
+class SummaryLevel(StrEnum):
+    """The three granularity levels Cairn supports.
+    Members are ``StrEnum`` values, so each one is also a plain ``str`` equal
+    to its lowercase value (for example ``SummaryLevel.GIST == "gist"``).
+    - ``GIST``: ≤ 20 words. The "scent" in IFT terms; used by ``outline``.
+    - ``SYNOPSIS``: ≤ 80 words. Used by ``get_section`` (default) and search hits.
+    - ``DIGEST``: ≤ 300 words. Used by ``expand`` and ``get_section(level="digest")``.
+    """
+    GIST = "gist"  # shortest level
+    SYNOPSIS = "synopsis"  # mid-length level
+    DIGEST = "digest"  # longest level
+@runtime_checkable
+class Summarizer(Protocol):
+    """A pluggable summarizer.
+    This is a structural interface: any object exposing a ``name`` attribute
+    and an async ``summarize`` method satisfies it. Because the protocol is
+    ``@runtime_checkable``, ``isinstance(obj, Summarizer)`` works, but it only
+    checks that the members exist, not their signatures.
+    Implementations should be deterministic for ``(title, body, level)`` when
+    possible — use ``temperature=0`` and fixed prompts. The ``name`` attribute
+    must encode both the implementation family and the model identifier so
+    cache keys correctly invalidate when either changes.
+    Examples of valid ``name`` values::
+        "fake:words"
+        "openai-compat:gpt-4o-mini"
+        "openai-compat:llama3.2:3b"
+    """
+    # Identifier of the form "<family>:<model>"; see the examples in the class docstring.
+    name: str
+    async def summarize(
+        self,
+        *,
+        title: str,
+        body: str,
+        level: SummaryLevel,
+    ) -> str:
+        """Produce a summary of ``body`` (titled ``title``) at ``level``.
+        All arguments are keyword-only.
+        Args:
+            title: Title of the section being summarized.
+            body: Text of the section to summarize.
+            level: Granularity level of the requested summary.
+        Returns:
+            The summary text.
+        Implementations must enforce the level's word budget on the output
+        (see ``cairn.summarize.prompts.WORD_BUDGETS``).
+        """
+        ...

cairn/summarize/cache.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""File-system cache for summarizer outputs.
+Keyed by ``sha256(model || level || section_hash)``. Each entry is a single
+UTF-8 text file under ``<root>/<first2hex>/<remaining>.txt``. Writes are
+atomic: temp-file + rename. Concurrent writers may race; the winner's
+content is kept (acceptable because identical inputs should yield identical
+outputs from a deterministic summarizer).
+"""
+from __future__ import annotations
+import hashlib
+import os
+from pathlib import Path
+class SummaryCache:
+    """Local file-system cache for ``Summarizer`` outputs."""
+    def __init__(self, root: Path) -> None:
+        self.root = root
+    # -- key / path helpers -------------------------------------------------
+    @staticmethod
+    def key(*, model: str, level: str, section_hash: str) -> str:
+        """Compute the cache key for one (model, level, section) tuple."""
+        h = hashlib.sha256()
+        h.update(model.encode("utf-8"))
+        h.update(b"\x00")
+        h.update(level.encode("utf-8"))
+        h.update(b"\x00")
+        h.update(section_hash.encode("utf-8"))
+        return h.hexdigest()
+    def _path_for(self, key: str) -> Path:
+        return self.root / key[:2] / f"{key[2:]}.txt"
+    # -- public API ---------------------------------------------------------
+    def get(self, key: str) -> str | None:
+        """Return the cached summary or ``None`` if absent."""
+        path = self._path_for(key)
+        if not path.exists():
+            return None
+        return path.read_text(encoding="utf-8")
+    def put(self, key: str, value: str) -> None:
+        """Write a cache entry atomically."""
+        path = self._path_for(key)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        tmp = path.with_suffix(path.suffix + ".tmp")
+        tmp.write_text(value, encoding="utf-8")
+        os.replace(tmp, path)
+    def clear(self) -> None:
+        """Remove the entire cache directory. Safe if it doesn't exist."""
+        if not self.root.exists():
+            return
+        for path in sorted(self.root.rglob("*"), reverse=True):
+            if path.is_file():
+                path.unlink()
+            elif path.is_dir():
+                path.rmdir()
+        if self.root.exists() and self.root.is_dir():
+            self.root.rmdir()

cairn/summarize/fake.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""Deterministic, network-free summarizer for tests and dry runs.
+Not for production use. Output is a word-truncated prefix of the body, which
+preserves enough structure for downstream sanity checks while requiring no
+LLM and no network.
+"""
+from __future__ import annotations
+import re
+from typing import ClassVar
+from cairn.summarize.base import SummaryLevel
+_WORD = re.compile(r"\S+")
+class FakeSummarizer:
+    """Word-truncation summarizer. Deterministic; no network."""
+    name = "fake:words"
+    _BUDGETS: ClassVar[dict[SummaryLevel, int]] = {
+        SummaryLevel.GIST: 15,
+        SummaryLevel.SYNOPSIS: 60,
+        SummaryLevel.DIGEST: 200,
+    }
+    async def summarize(
+        self,
+        *,
+        title: str,
+        body: str,
+        level: SummaryLevel,
+    ) -> str:
+        budget = self._BUDGETS[level]
+        words = _WORD.findall(body)
+        if not words:
+            return f"{title.strip() or 'Section'}."
+        truncated = " ".join(words[:budget])
+        if len(words) > budget:
+            truncated += "…"
+        return truncated

cairn/summarize/openai_compatible.py ADDED Viewed

@@ -0,0 +1,148 @@
+"""OpenAI-compatible HTTP summarizer.
+Works with any endpoint that implements the OpenAI ``/v1/chat/completions``
+contract: OpenAI itself, Ollama (``http://localhost:11434/v1``), vLLM,
+Together, Anyscale, etc.
+This is the **default** summarizer for production indexing in v0.1. It must
+remain usable without proprietary credentials (point it at a local Ollama
+instance) — per CLAUDE.md P4 "local-first must always work".
+"""
+from __future__ import annotations
+import asyncio
+from typing import Any
+import httpx
+from cairn.core.errors import IndexBuildError
+from cairn.summarize.base import SummaryLevel
+from cairn.summarize.prompts import SYSTEM_PROMPT, enforce_word_budget, user_prompt
+class OpenAICompatibleSummarizer:
+    """OpenAI-compatible chat-completions client."""
+    def __init__(
+        self,
+        *,
+        base_url: str = "http://localhost:11434/v1",
+        model: str = "llama3.2:3b",
+        api_key: str | None = None,
+        timeout: float = 60.0,
+        temperature: float = 0.0,
+        max_retries: int = 2,
+        retry_base_delay: float = 0.5,
+    ) -> None:
+        if max_retries < 0:
+            msg = f"max_retries must be >= 0; got {max_retries}"
+            raise ValueError(msg)
+        if retry_base_delay < 0:
+            msg = f"retry_base_delay must be >= 0; got {retry_base_delay}"
+            raise ValueError(msg)
+        self.base_url = base_url.rstrip("/")
+        self.model = model
+        self.api_key = api_key
+        self.timeout = timeout
+        self.temperature = temperature
+        self.max_retries = max_retries
+        self.retry_base_delay = retry_base_delay
+        self.name = f"openai-compat:{model}"
+    async def summarize(
+        self,
+        *,
+        title: str,
+        body: str,
+        level: SummaryLevel,
+    ) -> str:
+        headers = {"Content-Type": "application/json"}
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "temperature": self.temperature,
+            "messages": [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_prompt(title, body, level)},
+            ],
+        }
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            response = await self._post_with_retries(client, payload, headers)
+            data = response.json()
+        try:
+            text = str(data["choices"][0]["message"]["content"]).strip()
+        except (KeyError, IndexError, TypeError) as exc:
+            msg = "summarizer response did not match OpenAI chat-completions shape"
+            raise IndexBuildError(msg, details={"response": data}) from exc
+        return enforce_word_budget(text, level)
+    async def _post_with_retries(
+        self,
+        client: httpx.AsyncClient,
+        payload: dict[str, Any],
+        headers: dict[str, str],
+    ) -> httpx.Response:
+        last_exc: httpx.HTTPError | None = None
+        for attempt in range(self.max_retries + 1):
+            try:
+                response = await client.post(
+                    f"{self.base_url}/chat/completions",
+                    json=payload,
+                    headers=headers,
+                )
+            except httpx.HTTPError as exc:
+                last_exc = exc
+                if attempt < self.max_retries:
+                    await self._sleep_before_retry(attempt)
+                    continue
+                msg = f"summarizer request failed: {exc}"
+                raise IndexBuildError(
+                    msg,
+                    details={
+                        "model": self.model,
+                        "base_url": self.base_url,
+                        "error_type": type(exc).__name__,
+                        "attempts": attempt + 1,
+                    },
+                ) from exc
+            if response.status_code in (429, 500, 502, 503, 504) and attempt < self.max_retries:
+                await self._sleep_before_retry(attempt)
+                continue
+            if response.status_code >= 400:
+                msg = (
+                    f"summarizer endpoint returned HTTP {response.status_code}: "
+                    f"{response.text[:200]}"
+                )
+                raise IndexBuildError(
+                    msg,
+                    details={
+                        "status": response.status_code,
+                        "model": self.model,
+                        "base_url": self.base_url,
+                        "attempts": attempt + 1,
+                    },
+                )
+            return response
+        # Unreachable, but keeps strict type-checkers honest if the loop changes.
+        msg = "summarizer request failed without a response"
+        raise IndexBuildError(
+            msg,
+            details={
+                "model": self.model,
+                "base_url": self.base_url,
+                "error_type": type(last_exc).__name__ if last_exc else None,
+            },
+        )
+    async def _sleep_before_retry(self, attempt: int) -> None:
+        if self.retry_base_delay == 0:
+            return
+        await asyncio.sleep(self.retry_base_delay * (2**attempt))

cairn/summarize/prompts.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""Prompt templates and word budgets for summarization.
+The prompts are deliberately terse. We trust word budgets more than
+elaborately worded instructions; the budget is enforced at the call site
+regardless of model compliance.
+"""
+from __future__ import annotations
+import re
+from typing import Final
+from cairn.summarize.base import SummaryLevel
+# Maximum number of words allowed for a summary at each level. The value is
+# quoted in the user prompt and applied again when truncating model output.
+WORD_BUDGETS: Final[dict[SummaryLevel, int]] = {
+    SummaryLevel.GIST: 20,
+    SummaryLevel.SYNOPSIS: 80,
+    SummaryLevel.DIGEST: 300,
+}
+# Level-independent instructions; the per-level wording and the budget number
+# are supplied separately by ``user_prompt``.
+SYSTEM_PROMPT: Final = (
+    "You write structural document summaries for a hierarchical retrieval "
+    "system.\n"
+    "- Be precise and factual. Do not interpret, extrapolate, or add opinions.\n"
+    "- Do not begin with 'This section…', 'The author…', or similar preamble.\n"
+    "- Output ONLY the summary text. No headers, labels, or quotation marks.\n"
+    "- Stay strictly within the word budget.\n"
+)
+def user_prompt(title: str, body: str, level: SummaryLevel) -> str:
+    """Build the user-role prompt for one summary request."""
+    budget = WORD_BUDGETS[level]
+    if level is SummaryLevel.GIST:
+        instruction = (
+            f"Summarize the section below in a single sentence of at most "
+            f"{budget} words. Capture the single most important fact or claim."
+        )
+    elif level is SummaryLevel.SYNOPSIS:
+        instruction = (
+            f"Summarize the section below in one paragraph of at most "
+            f"{budget} words. Cover the main idea, key specifics, and what "
+            "the reader will learn."
+        )
+    else:  # DIGEST
+        instruction = (
+            f"Summarize the section below in 2 to 3 short paragraphs, totaling "
+            f"at most {budget} words. Preserve structural ordering and any "
+            "concrete facts (names, numbers, code identifiers)."
+        )
+    body_excerpt = body.strip() or "(empty section body)"
+    return (
+        f"{instruction}\n\n"
+        f"SECTION TITLE: {title}\n\n"
+        f"SECTION BODY:\n{body_excerpt}"
+    )
+_WORD = re.compile(r"\S+")
+def enforce_word_budget(text: str, level: SummaryLevel) -> str:
+    """Soft-truncate ``text`` to the level's word budget at a word boundary.
+    Appends a horizontal ellipsis (``…``) when truncation occurred. Returns
+    the original text untouched when already within budget.
+    """
+    budget = WORD_BUDGETS[level]
+    words = _WORD.findall(text)
+    if len(words) <= budget:
+        return text
+    return " ".join(words[:budget]) + "…"

cairn/tools/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Retrieval tools — the public API consumed by the MCP server.
+Each tool corresponds 1:1 to an MCP tool documented in
+``docs/specs/mcp-tools.md``. Tools accept a :class:`DocumentIndex` plus typed
+arguments and return a :class:`ToolResponse`. They do not speak MCP
+themselves; the ``cairn.mcp`` layer translates :class:`ToolResponse` and
+:class:`cairn.core.errors.CairnError` into the MCP wire envelope.
+"""
+from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens
+from cairn.tools.find_mentions import find_mentions
+from cairn.tools.get_related import get_related
+from cairn.tools.get_section import expand, get_section
+from cairn.tools.outline import outline
+from cairn.tools.read_range import read_range
+from cairn.tools.search_keyword import search_keyword
+from cairn.tools.search_semantic import search_semantic
+# Names re-exported as the package's public API (imported above), listed in
+# sorted order.
+__all__ = [
+    "DocumentIndex",
+    "ToolResponse",
+    "estimate_tokens",
+    "expand",
+    "find_mentions",
+    "get_related",
+    "get_section",
+    "outline",
+    "read_range",
+    "search_keyword",
+    "search_semantic",
+]

cairn/tools/base.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""Shared types and helpers for retrieval tools."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+from pydantic import BaseModel, ConfigDict, Field
+from cairn.core.errors import IndexBuildError, IndexNotFoundError
+from cairn.index.entities import Entities
+from cairn.index.summaries import Summaries
+from cairn.index.tree import Tree
+from cairn.index.vectors import Vectors
+from cairn.index.xrefs import XRefs
+class DocumentIndex:
+    """All sub-indexes loaded for a single document.
+    Tree, Summaries, and Vectors are required. Entities (v0.2.0+) and XRefs
+    (v0.2.2+) are optional; v0.1/early v0.2 indexes don't have them and
+    tools that need them must check ``index.entities`` / ``index.xrefs``
+    against ``None``.
+    """
+    def __init__(
+        self,
+        *,
+        tree: Tree,
+        summaries: Summaries,
+        vectors: Vectors,
+        entities: Entities | None = None,
+        xrefs: XRefs | None = None,
+    ) -> None:
+        doc_ids = {
+            "tree": tree.doc_id,
+            "summaries": summaries.doc_id,
+            "vectors": vectors.doc_id,
+        }
+        if entities is not None:
+            doc_ids["entities"] = entities.doc_id
+        if xrefs is not None:
+            doc_ids["xrefs"] = xrefs.doc_id
+        if len(set(doc_ids.values())) > 1:
+            msg = "sub-index doc_id mismatch: " + ", ".join(
+                f"{k}={v!r}" for k, v in doc_ids.items()
+            )
+            raise IndexBuildError(msg, details=doc_ids)
+        self.tree = tree
+        self.summaries = summaries
+        self.vectors = vectors
+        self.entities = entities
+        self.xrefs = xrefs
+        self.doc_id = tree.doc_id
+    @classmethod
+    def load(cls, doc_dir: Path) -> DocumentIndex:
+        """Load all sub-indexes from a single document directory.
+        Entities and XRefs are optional: older indexes don't have them, and
+        we degrade gracefully rather than refuse to load.
+        """
+        entities: Entities | None
+        try:
+            entities = Entities.load(doc_dir)
+        except IndexNotFoundError:
+            entities = None
+        xrefs: XRefs | None
+        try:
+            xrefs = XRefs.load(doc_dir)
+        except IndexNotFoundError:
+            xrefs = None
+        return cls(
+            tree=Tree.load(doc_dir),
+            summaries=Summaries.load(doc_dir),
+            vectors=Vectors.load(doc_dir),
+            entities=entities,
+            xrefs=xrefs,
+        )
+    def anchor(self, section_id: str) -> str:
+        """Build the canonical ``cairn://`` anchor for a section."""
+        return f"cairn://{self.doc_id}/{section_id}"
+class ToolResponse(BaseModel):
+    """Successful result of a tool invocation.
+    Errors are signaled by raising :class:`cairn.core.errors.CairnError` from
+    the tool function; the MCP server wraps them in the structured envelope
+    documented in ``docs/specs/mcp-tools.md`` §0.
+    The model is configured as frozen and forbids extra fields.
+    """
+    model_config = ConfigDict(frozen=True, extra="forbid")
+    # Tool-specific result payload.
+    data: dict[str, Any]
+    # Token count reported for the payload; constrained to be >= 0.
+    tokens_returned: int = Field(ge=0)
+def estimate_tokens(text: str) -> int:
+    """Estimate the token cost of a text payload.
+    Approximation: 1.3 tokens per whitespace-separated word, which tracks
+    common English tokenizers within ~10%. Good enough for budget reporting;
+    not a substitute for a real tokenizer.
+    """
+    if not text:
+        return 0
+    return max(1, int(len(text.split()) * 1.3))
+def estimate_tokens_of_payload(payload: Any) -> int:
+    """Estimate token cost of every string anywhere in ``payload``."""
+    return estimate_tokens(_flatten_text(payload))
+def _flatten_text(obj: Any) -> str:
+    if isinstance(obj, str):
+        return obj
+    if isinstance(obj, dict):
+        return " ".join(_flatten_text(v) for v in obj.values())
+    if isinstance(obj, list | tuple):
+        return " ".join(_flatten_text(item) for item in obj)
+    return ""

cairn/tools/find_mentions.py ADDED Viewed

@@ -0,0 +1,93 @@
+"""``find_mentions`` retrieval tool.
+Spec: ``docs/specs/mcp-tools.md`` §6.
+Returns every section where a named entity occurs, with stable anchors back
+into the source. When the entity is unknown to the index, returns a
+successful envelope with an empty ``mentions`` array — "no mentions" is a
+valid answer, not an error condition.
+"""
+from __future__ import annotations
+from collections.abc import Sequence
+from typing import Any, Literal
+from cairn.core.errors import IndexNotFoundError, ToolError
+from cairn.core.types import EntityKind
+from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
+Kind = Literal["term", "code", "proper", "defined"]
+async def find_mentions(
+    index: DocumentIndex,
+    *,
+    entity: str,
+    scope: str | None = None,
+    kinds: Sequence[Kind] | None = None,
+) -> ToolResponse:
+    """Locate every section that mentions ``entity``.
+    The lookup matches by canonical form first, then by registered surface
+    forms. When ``kinds`` is supplied, only entities of those kinds are
+    considered.
+    """
+    if not entity.strip():
+        msg = "entity must be a non-empty string"
+        raise ToolError(msg)
+    if index.entities is None:
+        msg = (
+            "entities sub-index not built for this document; "
+            "re-index with v0.2 to enable find_mentions"
+        )
+        raise IndexNotFoundError(msg, details={"missing": "entities"})
+    kinds_tuple: tuple[EntityKind, ...] | None = None
+    if kinds is not None:
+        kinds_tuple = tuple(kinds)
+    ent = index.entities.lookup(entity, kinds=kinds_tuple)
+    if ent is None:
+        return ToolResponse(
+            data={
+                "entity": entity,
+                "canonical": None,
+                "kind": None,
+                "mentions": [],
+            },
+            tokens_returned=0,
+        )
+    mentions: list[dict[str, Any]] = []
+    for m in ent.mentions:
+        if scope is not None and not _matches_scope(m.section_id, scope):
+            continue
+        node = index.tree.get(m.section_id)
+        if node is None:
+            # Stale extractor output — skip rather than fail the whole call.
+            continue
+        mentions.append(
+            {
+                "section_id": m.section_id,
+                "title": node.title,
+                "anchor": index.anchor(m.section_id),
+                "span": [m.span.start, m.span.end],
+            }
+        )
+    payload: dict[str, Any] = {
+        "entity": entity,
+        "canonical": ent.canonical,
+        "kind": ent.kind,
+        "mentions": mentions,
+    }
+    return ToolResponse(
+        data=payload,
+        tokens_returned=estimate_tokens_of_payload(payload),
+    )
+def _matches_scope(section_id: str, scope: str) -> bool:
+    return section_id == scope or section_id.startswith(scope + "/")