docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Cairn — structure-aware, MCP-native retrieval for large documents."""
2
+
3
+ __version__ = "0.1.0a2"
4
+
5
+ __all__ = ["__version__"]
@@ -0,0 +1,37 @@
1
+ """cairn-bench — evaluate Cairn against a naive vector-RAG baseline.
2
+
3
+ A small framework for measuring retrieval recall and token cost on
4
+ hand-curated question sets. The shipped starter dataset is small;
5
+ production-grade evaluation requires more curation than fits in this
6
+ codebase. The framework is the contribution; the dataset is a template.
7
+ """
8
+
9
+ from cairn.bench.dataset import (
10
+ BenchDocument,
11
+ BenchQuestion,
12
+ BenchSuite,
13
+ load_suite,
14
+ )
15
+ from cairn.bench.judge import LLMJudge
16
+ from cairn.bench.metrics import recall_at_k
17
+ from cairn.bench.report import (
18
+ BenchSummary,
19
+ QuestionResult,
20
+ format_markdown_report,
21
+ write_json_report,
22
+ )
23
+ from cairn.bench.runner import BenchRunner
24
+
25
+ __all__ = [
26
+ "BenchDocument",
27
+ "BenchQuestion",
28
+ "BenchRunner",
29
+ "BenchSuite",
30
+ "BenchSummary",
31
+ "LLMJudge",
32
+ "QuestionResult",
33
+ "format_markdown_report",
34
+ "load_suite",
35
+ "recall_at_k",
36
+ "write_json_report",
37
+ ]
@@ -0,0 +1,236 @@
1
+ """Naive RAG baseline.
2
+
3
+ Splits the source text into fixed N-word chunks, embeds each chunk into a
4
+ LanceDB table, and serves cosine-similarity retrieval. **Crucially, the
5
+ chunker is structure-blind** — chunks straddle headings, paragraphs, and
6
+ section boundaries. This is the failure mode Cairn is built to fix; the
7
+ baseline lets us measure the cost of that failure.
8
+
9
+ For comparable recall reporting against Cairn (which returns section ids),
10
+ each chunk is mapped to "the section whose span contains the chunk's
11
+ midpoint byte" at index time. Deepest section wins on overlap.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import asyncio
17
+ import re
18
+ import shutil
19
+ from collections.abc import Sequence
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ import lancedb
24
+ import pyarrow as pa
25
+ from pydantic import BaseModel, ConfigDict
26
+
27
+ from cairn.core.types import Document, SectionNode
28
+ from cairn.embed.base import Embedder
29
+ from cairn.index.vectors import l2_normalize
30
+
31
+ _WORD = re.compile(r"\S+")
32
+
33
+
34
class _Chunk(BaseModel):
    """One slice of the source text as the naive baseline stores it."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Position of the chunk in chunking order.
    chunk_id: int
    # The chunk's own text.
    text: str
    # Start / end offsets of the chunk within the source text.
    byte_start: int
    byte_end: int
    # Section the chunk was mapped to, or None when no section was assigned.
    section_id: str | None
42
+
43
+
44
class NaiveHit(BaseModel):
    """A single chunk returned by a naive-baseline retrieval."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Identifier of the matching chunk.
    chunk_id: int
    # Section the chunk was mapped to, or None when it has none.
    section_id: str | None
    # Text of the matching chunk.
    text: str
    # Similarity score attached to the hit.
    score: float
53
+
54
+
55
def chunk_text(text: str, *, chunk_size_words: int = 512) -> list[tuple[int, int, str]]:
    """Cut ``text`` into consecutive runs of at most ``chunk_size_words`` words.

    A word is any maximal run of non-whitespace characters. Each returned
    tuple is ``(start, end, chunk)`` where ``start`` and ``end`` are indices
    into ``text`` (``str`` offsets as reported by the regex matches) and
    ``chunk`` is ``text[start:end]`` — so whitespace between the words of a
    chunk is kept, while whitespace between chunks is dropped.

    Returns an empty list when ``text`` contains no words.

    Raises:
        ValueError: if ``chunk_size_words`` is smaller than 1.
    """
    if chunk_size_words < 1:
        msg = f"chunk_size_words must be >= 1; got {chunk_size_words}"
        raise ValueError(msg)

    matches = list(re.finditer(r"\S+", text))
    total = len(matches)
    pieces: list[tuple[int, int, str]] = []
    for first in range(0, total, chunk_size_words):
        # The final chunk may hold fewer words than requested.
        last = min(first + chunk_size_words, total) - 1
        begin = matches[first].start()
        finish = matches[last].end()
        pieces.append((begin, finish, text[begin:finish]))
    return pieces
73
+
74
+
75
def assign_section(
    chunk_start: int,
    chunk_end: int,
    sections: Sequence[SectionNode],
) -> str | None:
    """Pick the section a chunk belongs to, judged by the chunk's midpoint.

    The midpoint is ``(chunk_start + chunk_end) // 2``. Among the sections
    whose half-open span ``[span.start, span.end)`` contains it, the one with
    the greatest ``level`` wins; on a tie the earliest in ``sections`` is
    kept. Returns that section's ``id``, or ``None`` when no span contains
    the midpoint.
    """
    midpoint = (chunk_start + chunk_end) // 2
    containing = [
        candidate
        for candidate in sections
        if candidate.span.start <= midpoint < candidate.span.end
    ]
    if not containing:
        return None
    # max() keeps the first of several equally deep sections.
    return max(containing, key=lambda candidate: candidate.level).id
89
+
90
+
91
class NaiveRAG:
    """Structure-blind chunk + vector search baseline.

    ``index`` splits the source text into fixed-size word chunks, embeds them
    and stores them in a LanceDB table under ``<out_dir>/naive.lance``;
    ``retrieve`` embeds a query and returns the nearest chunks by cosine
    distance.
    """

    name = "naive-rag"
    table_name = "chunks"

    def __init__(
        self,
        embedder: Embedder,
        *,
        chunk_size_words: int = 512,
        batch_size: int = 32,
    ) -> None:
        """Store the embedder and chunking / batching parameters.

        Raises:
            ValueError: if ``chunk_size_words`` or ``batch_size`` is below 1.
        """
        # Fail fast: an invalid chunk size would otherwise only surface once
        # ``index`` reaches the chunker.
        if chunk_size_words < 1:
            msg = f"chunk_size_words must be >= 1; got {chunk_size_words}"
            raise ValueError(msg)
        if batch_size < 1:
            msg = f"batch_size must be >= 1; got {batch_size}"
            raise ValueError(msg)
        self.embedder = embedder
        self.chunk_size_words = chunk_size_words
        self.batch_size = batch_size

    async def index(
        self,
        document: Document,
        source_text: str,
        *,
        out_dir: Path,
    ) -> None:
        """Chunk ``source_text``, embed the chunks and (re)write the table.

        Chunks are embedded ``batch_size`` at a time and each vector is
        L2-normalized before storage. When the text yields no chunks an empty
        table is written instead. Any existing ``naive.lance`` directory in
        ``out_dir`` is replaced.
        """
        chunks = self._build_chunks(document, source_text)
        if not chunks:
            await asyncio.to_thread(self._write_empty_table, out_dir)
            return

        vectors: list[list[float]] = []
        for i in range(0, len(chunks), self.batch_size):
            batch = chunks[i : i + self.batch_size]
            raw = await self.embedder.embed([c.text for c in batch])
            vectors.extend(l2_normalize(v) for v in raw)

        await asyncio.to_thread(self._write_table, out_dir, chunks, vectors)

    def _build_chunks(self, document: Document, source_text: str) -> list[_Chunk]:
        """Chunk ``source_text`` and tag each chunk with its section id."""
        sections = document.sections
        return [
            _Chunk(
                chunk_id=chunk_id,
                text=text,
                byte_start=start,
                byte_end=end,
                section_id=assign_section(start, end, sections),
            )
            for chunk_id, (start, end, text) in enumerate(
                chunk_text(source_text, chunk_size_words=self.chunk_size_words)
            )
        ]

    def _create_table(self, out_dir: Path) -> Any:
        """Drop any existing ``naive.lance`` store and create an empty table."""
        db_path = out_dir / "naive.lance"
        if db_path.exists():
            shutil.rmtree(db_path)
        db = lancedb.connect(str(db_path))
        schema = pa.schema(
            [
                pa.field("chunk_id", pa.int64()),
                pa.field("section_id", pa.string()),
                pa.field("text", pa.string()),
                pa.field("vector", pa.list_(pa.float32(), self.embedder.dim)),
            ]
        )
        return db.create_table(self.table_name, schema=schema)

    def _write_table(
        self,
        out_dir: Path,
        chunks: list[_Chunk],
        vectors: list[list[float]],
    ) -> None:
        """Replace the table with one row per ``(chunk, vector)`` pair.

        Raises:
            ValueError: if ``chunks`` and ``vectors`` differ in length.
        """
        # Build the rows before touching the disk: zip(strict=True) raises on
        # a chunk/vector count mismatch, and that must not happen after the
        # previous table has already been deleted.
        records: list[dict[str, Any]] = [
            {
                "chunk_id": c.chunk_id,
                # A missing section is stored as an empty string.
                "section_id": c.section_id or "",
                "text": c.text,
                "vector": v,
            }
            for c, v in zip(chunks, vectors, strict=True)
        ]
        table = self._create_table(out_dir)
        table.add(records)

    def _write_empty_table(self, out_dir: Path) -> None:
        """Replace the table with an empty one that has the same schema."""
        self._create_table(out_dir)

    async def retrieve(
        self,
        query: str,
        *,
        out_dir: Path,
        k: int = 8,
    ) -> list[NaiveHit]:
        """Return up to ``k`` chunks nearest to ``query``.

        Returns an empty list when the embedder yields no vector for the
        query.

        Raises:
            ValueError: if ``k`` is below 1.
        """
        if k < 1:
            msg = f"k must be >= 1; got {k}"
            raise ValueError(msg)
        embedded = await self.embedder.embed([query])
        if not embedded:
            return []
        query_vec = l2_normalize(embedded[0])
        return await asyncio.to_thread(self._sync_retrieve, out_dir, query_vec, k)

    def _sync_retrieve(
        self,
        out_dir: Path,
        query_vec: list[float],
        k: int,
    ) -> list[NaiveHit]:
        """Run the cosine search against the stored table (blocking)."""
        db = lancedb.connect(str(out_dir / "naive.lance"))
        table = db.open_table(self.table_name)
        rows = (
            table.search(query_vec)
            .distance_type("cosine")
            .limit(k)
            .to_list()
        )
        hits: list[NaiveHit] = []
        for row in rows:
            distance = float(row["_distance"])
            # Turn the distance into a similarity and clamp it to [0, 1].
            score = max(0.0, min(1.0, 1.0 - distance))
            hits.append(
                NaiveHit(
                    chunk_id=int(row["chunk_id"]),
                    # The empty-string placeholder maps back to None.
                    section_id=row.get("section_id") or None,
                    text=str(row["text"]),
                    score=score,
                )
            )
        return hits
cairn/bench/dataset.py ADDED
@@ -0,0 +1,109 @@
1
+ """Bench dataset format — Pydantic models + TOML loader.
2
+
3
+ A suite lives in a single TOML file (loaded via stdlib ``tomllib``). The
4
+ shape::
5
+
6
+ name = "Cairn Architecture v0.2.2"
7
+
8
+ [[documents]]
9
+ id = "architecture"
10
+ source = "ARCHITECTURE.md"
11
+
12
+ [[documents.questions]]
13
+ id = "vector-store"
14
+ question = "What is the default vector store?"
15
+ expected_anchors = ["2-5-vectors-v-semantic-overlay", "7-tech-stack"]
16
+ tags = ["definition", "tech-stack"]
17
+
18
+ ``expected_anchors`` use substring matching against retrieved section_ids
19
+ so authors can use short suffixes instead of full hierarchical slugs.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import tomllib
25
+ from pathlib import Path
26
+
27
+ from pydantic import BaseModel, ConfigDict, Field
28
+
29
+ from cairn.core.errors import ConfigError
30
+
31
+
32
class BenchQuestion(BaseModel):
    """A single question together with the retrieval targets used to score it."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    id: str
    question: str
    expected_anchors: tuple[str, ...] = Field(
        description=(
            "Substrings to match against retrieved section_ids. "
            "Empty tuple = no recall computed for this question."
        ),
        default=(),
    )
    tags: tuple[str, ...] = ()
    reference: str | None = Field(
        description="Optional free-form reference answer for LLM-judged QA.",
        default=None,
    )
51
+
52
+
53
class BenchDocument(BaseModel):
    """A source document paired with the questions asked about it."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    id: str
    source: Path = Field(
        description="Path to the source document, relative to the suite file.",
    )
    questions: tuple[BenchQuestion, ...]
63
+
64
+
65
class BenchSuite(BaseModel):
    """A named collection of bench documents."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    name: str
    documents: tuple[BenchDocument, ...]
72
+
73
+
74
def load_suite(path: Path) -> BenchSuite:
    """Parse a TOML bench file into a :class:`BenchSuite`.

    Each document's ``source`` is resolved relative to the directory that
    contains ``path``. The suite name falls back to the file stem when the
    file has no top-level ``name``.

    Raises:
        ConfigError: if ``path`` is not an existing file, is not valid TOML,
            or a document / question entry lacks a required key (``id``,
            ``source``, ``question``).
    """
    # is_file() rather than exists(): a directory would pass exists() and
    # then fail inside open() with an unrelated OS error.
    if not path.is_file():
        msg = f"bench suite not found: {path}"
        raise ConfigError(msg, details={"path": str(path)})

    try:
        with path.open("rb") as fh:
            payload = tomllib.load(fh)
    except tomllib.TOMLDecodeError as exc:
        msg = f"bench suite is not valid TOML: {path}: {exc}"
        raise ConfigError(msg, details={"path": str(path)}) from exc

    suite_dir = path.parent
    documents: list[BenchDocument] = []
    try:
        for doc in payload.get("documents", []):
            questions = tuple(
                BenchQuestion(
                    id=q["id"],
                    question=q["question"],
                    expected_anchors=tuple(q.get("expected_anchors", ())),
                    tags=tuple(q.get("tags", ())),
                    reference=q.get("reference"),
                )
                for q in doc.get("questions", [])
            )
            documents.append(
                BenchDocument(
                    id=doc["id"],
                    source=(suite_dir / doc["source"]).resolve(),
                    questions=questions,
                )
            )
    except KeyError as exc:
        # Report a malformed entry as a configuration problem instead of
        # letting a bare KeyError escape.
        missing = exc.args[0] if exc.args else "?"
        msg = f"bench suite entry is missing required key {missing!r}: {path}"
        raise ConfigError(
            msg, details={"path": str(path), "missing_key": str(missing)}
        ) from exc

    return BenchSuite(
        name=payload.get("name", path.stem),
        documents=tuple(documents),
    )
cairn/bench/judge.py ADDED
@@ -0,0 +1,126 @@
1
+ """LLM-as-judge: optional QA accuracy dimension for cairn-bench.
2
+
3
+ For each question + retrieved context the judge:
4
+
5
+ 1. Asks an LLM to answer the question using only the provided context.
6
+ 2. Asks the same LLM to evaluate whether that answer matches the
7
+ reference answer the suite author supplied.
8
+
9
+ Both calls go through an OpenAI-compatible ``/v1/chat/completions``
10
+ endpoint, so the judge runs against Ollama (default), OpenAI, vLLM, etc.
11
+
12
+ The judge is **optional**: when no ``LLMJudge`` is configured on the
13
+ :class:`cairn.bench.runner.BenchRunner`, QA accuracy is simply not
14
+ reported. Recall and token cost still come out either way.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from typing import Any
20
+
21
+ import httpx
22
+
23
+ from cairn.core.errors import IndexBuildError
24
+
25
+ _ANSWER_SYSTEM = (
26
+ "You answer questions strictly from the provided context. "
27
+ 'If the context is insufficient, reply with "I don\'t know."'
28
+ )
29
+
30
+ _JUDGE_SYSTEM = (
31
+ "You evaluate whether an AI assistant's answer is correct given a "
32
+ "reference answer. Reply with YES or NO on the first line, then one "
33
+ "sentence of justification."
34
+ )
35
+
36
+
37
class LLMJudge:
    """Generates answers from retrieved context and judges them against a reference.

    Both operations are chat-completion calls to
    ``<base_url>/chat/completions``.
    """

    def __init__(
        self,
        *,
        base_url: str = "http://localhost:11434/v1",
        model: str = "llama3.2:3b",
        api_key: str | None = None,
        timeout: float = 60.0,
        temperature: float = 0.0,
    ) -> None:
        """Store endpoint settings; trailing slashes on ``base_url`` are dropped."""
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.timeout = timeout
        self.temperature = temperature
        self.name = f"openai-compat:{model}"

    async def answer(self, question: str, context: str) -> str:
        """Produce an answer to ``question`` using only ``context``.

        A blank ``context`` is replaced by a ``(no context provided)``
        placeholder in the prompt.
        """
        prompt = (
            "Context:\n"
            f"{context.strip() or '(no context provided)'}\n\n"
            f"Question: {question}\n\nAnswer:"
        )
        return await self._chat(
            [
                {"role": "system", "content": _ANSWER_SYSTEM},
                {"role": "user", "content": prompt},
            ]
        )

    async def judge(
        self,
        question: str,
        reference: str,
        answer: str,
    ) -> tuple[bool, str]:
        """Return ``(is_correct, raw_response)`` for one (question, answer) pair.

        The verdict is positive when the first line of the reply, stripped
        and upper-cased, starts with ``YES``.
        """
        prompt = (
            f"Question: {question}\n\n"
            f"Reference answer: {reference}\n\n"
            f"Assistant's answer: {answer}\n\n"
            "Is the assistant's answer correct?"
        )
        raw = await self._chat(
            [
                {"role": "system", "content": _JUDGE_SYSTEM},
                {"role": "user", "content": prompt},
            ]
        )
        first_line = raw.strip().split("\n", 1)[0].strip().upper()
        is_correct = first_line.startswith("YES")
        return is_correct, raw

    async def _chat(self, messages: list[dict[str, str]]) -> str:
        """POST ``messages`` to the chat endpoint and return the reply text.

        Raises:
            IndexBuildError: on an HTTP status >= 400, a body that is not
                JSON, or JSON that lacks ``choices[0].message.content``.
        """
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        payload: dict[str, Any] = {
            "model": self.model,
            "temperature": self.temperature,
            "messages": messages,
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(
                f"{self.base_url}/chat/completions",
                json=payload,
                headers=headers,
            )

        endpoint = {
            "status": response.status_code,
            "model": self.model,
            "base_url": self.base_url,
        }
        if response.status_code >= 400:
            msg = (
                f"judge endpoint returned HTTP {response.status_code}: "
                f"{response.text[:200]}"
            )
            raise IndexBuildError(msg, details=endpoint)

        try:
            data = response.json()
        except ValueError as exc:
            # A success status can still carry a non-JSON body (for example
            # an HTML page from a proxy); report it like other bad responses.
            msg = f"judge endpoint returned a non-JSON body: {response.text[:200]}"
            raise IndexBuildError(msg, details=endpoint) from exc

        try:
            content = data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as exc:
            msg = "judge response did not match OpenAI chat-completions shape"
            raise IndexBuildError(msg, details={"response": data}) from exc
        # A null content must not become the literal text "None".
        return "" if content is None else str(content).strip()
cairn/bench/metrics.py ADDED
@@ -0,0 +1,32 @@
1
+ """Recall@k computation for bench questions.
2
+
3
+ An ``expected_anchor`` is considered matched if it appears as a substring
4
+ of any retrieved section id within the top ``k`` results. This is a
5
+ deliberate convenience: authoring full hierarchical slugs in TOML is
6
+ brittle, so we accept short suffixes that uniquely identify a section.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Sequence
12
+
13
+
14
def recall_at_k(
    retrieved_section_ids: Sequence[str],
    expected_anchors: Sequence[str],
    *,
    k: int,
) -> float:
    """Return the fraction of expected anchors found in the top-k retrieval.

    An anchor counts as found when it is a substring of at least one of the
    first ``k`` entries of ``retrieved_section_ids``. Each entry of
    ``expected_anchors`` is counted separately, duplicates included.

    Returns ``1.0`` when ``expected_anchors`` is empty (vacuously true). A
    ``k`` of zero or less selects no results, so the recall is ``0.0``.
    """
    if not expected_anchors:
        return 1.0

    # Clamp k: a negative value would otherwise slice from the end
    # (``ids[:-1]``) and count hits that are not in any top-k.
    top_k = retrieved_section_ids[: max(k, 0)]
    matched = sum(
        1
        for anchor in expected_anchors
        if any(anchor in section_id for section_id in top_k)
    )
    return matched / len(expected_anchors)
cairn/bench/report.py ADDED
@@ -0,0 +1,143 @@
1
+ """Bench result types + JSON / markdown report writers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import statistics
7
+ from collections.abc import Iterable
8
+ from datetime import UTC, datetime
9
+ from pathlib import Path
10
+ from typing import Literal
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+
14
+ System = Literal["cairn", "naive"]
15
+
16
+
17
class SystemResult(BaseModel):
    """What one system returned for one question, and how it scored."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Which system produced this result.
    system: System
    # Section ids the system returned.
    section_ids: tuple[str, ...]
    # Recall score, constrained to the range [0, 1].
    recall_at_k: float = Field(le=1.0, ge=0.0)
    # Token count of the returned content; never negative.
    tokens_returned: int = Field(ge=0)
26
+
27
+
28
class QuestionResult(BaseModel):
    """One question's outcome under both compared systems."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Identity and content of the question.
    document_id: str
    question_id: str
    question: str
    expected_anchors: tuple[str, ...]
    tags: tuple[str, ...]
    # Per-system results for this question.
    cairn: SystemResult
    naive: SystemResult
40
+
41
+
42
class BenchSummary(BaseModel):
    """All question results of one bench run, with headline averages."""

    # Immutable once built; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    suite_name: str
    k: int
    questions: tuple[QuestionResult, ...]

    def cairn_mean_recall(self) -> float:
        """Average ``recall_at_k`` of the Cairn results over all questions."""
        recalls = [result.cairn.recall_at_k for result in self.questions]
        return _mean(recalls)

    def naive_mean_recall(self) -> float:
        """Average ``recall_at_k`` of the naive results over all questions."""
        recalls = [result.naive.recall_at_k for result in self.questions]
        return _mean(recalls)

    def cairn_mean_tokens(self) -> float:
        """Average ``tokens_returned`` of the Cairn results over all questions."""
        counts = [float(result.cairn.tokens_returned) for result in self.questions]
        return _mean(counts)

    def naive_mean_tokens(self) -> float:
        """Average ``tokens_returned`` of the naive results over all questions."""
        counts = [float(result.naive.tokens_returned) for result in self.questions]
        return _mean(counts)
62
+
63
+
64
+ def _mean(values: Iterable[float]) -> float:
65
+ materialized = list(values)
66
+ return statistics.mean(materialized) if materialized else 0.0
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Report writers
71
+ # ---------------------------------------------------------------------------
72
+
73
+
74
+ def write_json_report(summary: BenchSummary, path: Path) -> Path:
75
+ """Write a deterministic JSON report next to the bench's working dir."""
76
+ path.parent.mkdir(parents=True, exist_ok=True)
77
+ payload = {
78
+ "generated_at": datetime.now(UTC).isoformat(),
79
+ "suite": summary.suite_name,
80
+ "k": summary.k,
81
+ "summary": {
82
+ "cairn_mean_recall_at_k": summary.cairn_mean_recall(),
83
+ "naive_mean_recall_at_k": summary.naive_mean_recall(),
84
+ "cairn_mean_tokens": summary.cairn_mean_tokens(),
85
+ "naive_mean_tokens": summary.naive_mean_tokens(),
86
+ },
87
+ "questions": [q.model_dump(mode="json") for q in summary.questions],
88
+ }
89
+ with path.open("w", encoding="utf-8") as fh:
90
+ json.dump(payload, fh, ensure_ascii=False, indent=2)
91
+ fh.write("\n")
92
+ return path
93
+
94
+
95
def format_markdown_report(summary: BenchSummary) -> str:
    """Render a human-readable markdown comparison of the two systems.

    The report has a headline table (mean recall@k and mean tokens returned,
    with Cairn's tokens also given as a share of the naive baseline's) and a
    per-question table. Question text is truncated to 60 characters and made
    safe for a table cell. When the naive token mean is zero the share is
    shown as ``n/a``.
    """
    n = len(summary.questions)
    cairn_recall = summary.cairn_mean_recall()
    naive_recall = summary.naive_mean_recall()
    cairn_tokens = summary.cairn_mean_tokens()
    naive_tokens = summary.naive_mean_tokens()

    # Without naive tokens there is no meaningful ratio; say so rather than
    # printing "nan%".
    ratio_text = f"{cairn_tokens / naive_tokens:.1%}" if naive_tokens > 0 else "n/a"

    lines: list[str] = [
        f"# {summary.suite_name}",
        "",
        f"Questions: **{n}** · k = **{summary.k}**",
        "",
        "## Headline",
        "",
        "| metric | naive vector RAG | Cairn |",
        "|---|---:|---:|",
        f"| mean recall@{summary.k} | {naive_recall:.2%} | **{cairn_recall:.2%}** |",
        (
            f"| mean tokens returned | {naive_tokens:,.0f} | **{cairn_tokens:,.0f}** "
            f"({ratio_text} of naive) |"
        ),
        "",
        "## Per-question",
        "",
        "| question | recall (cairn) | recall (naive) | tokens (cairn / naive) |",
        "|---|---:|---:|---:|",
    ]
    lines.extend(
        f"| `{q.question_id}` {_table_cell(_truncate(q.question, 60))} "
        f"| {q.cairn.recall_at_k:.0%} "
        f"| {q.naive.recall_at_k:.0%} "
        f"| {q.cairn.tokens_returned} / {q.naive.tokens_returned} |"
        for q in summary.questions
    )

    return "\n".join(lines) + "\n"


def _table_cell(text: str) -> str:
    """Make ``text`` safe inside one markdown table cell.

    Line breaks become single spaces and ``|`` is backslash-escaped, so
    question text cannot split a row or add columns.
    """
    return " ".join(text.splitlines()).replace("|", "\\|")


def _truncate(text: str, width: int) -> str:
    """Shorten ``text`` to ``width`` characters, ending in ``…`` when cut."""
    if len(text) <= width:
        return text
    return text[: width - 1] + "…"