docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/__init__.py
ADDED
cairn/bench/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""cairn-bench — evaluate Cairn against a naive vector-RAG baseline.
|
|
2
|
+
|
|
3
|
+
A small framework for measuring retrieval recall and token cost on
|
|
4
|
+
hand-curated question sets. The shipped starter dataset is small;
|
|
5
|
+
production-grade evaluation requires more curation than fits in this
|
|
6
|
+
codebase. The framework is the contribution; the dataset is a template.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from cairn.bench.dataset import (
|
|
10
|
+
BenchDocument,
|
|
11
|
+
BenchQuestion,
|
|
12
|
+
BenchSuite,
|
|
13
|
+
load_suite,
|
|
14
|
+
)
|
|
15
|
+
from cairn.bench.judge import LLMJudge
|
|
16
|
+
from cairn.bench.metrics import recall_at_k
|
|
17
|
+
from cairn.bench.report import (
|
|
18
|
+
BenchSummary,
|
|
19
|
+
QuestionResult,
|
|
20
|
+
format_markdown_report,
|
|
21
|
+
write_json_report,
|
|
22
|
+
)
|
|
23
|
+
from cairn.bench.runner import BenchRunner
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"BenchDocument",
|
|
27
|
+
"BenchQuestion",
|
|
28
|
+
"BenchRunner",
|
|
29
|
+
"BenchSuite",
|
|
30
|
+
"BenchSummary",
|
|
31
|
+
"LLMJudge",
|
|
32
|
+
"QuestionResult",
|
|
33
|
+
"format_markdown_report",
|
|
34
|
+
"load_suite",
|
|
35
|
+
"recall_at_k",
|
|
36
|
+
"write_json_report",
|
|
37
|
+
]
|
cairn/bench/baseline.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""Naive RAG baseline.
|
|
2
|
+
|
|
3
|
+
Splits the source text into fixed N-word chunks, embeds each chunk into a
|
|
4
|
+
LanceDB table, and serves cosine-similarity retrieval. **Crucially, the
|
|
5
|
+
chunker is structure-blind** — chunks straddle headings, paragraphs, and
|
|
6
|
+
section boundaries. This is the failure mode Cairn is built to fix; the
|
|
7
|
+
baseline lets us measure the cost of that failure.
|
|
8
|
+
|
|
9
|
+
For comparable recall reporting against Cairn (which returns section ids),
|
|
10
|
+
each chunk is mapped to "the section whose span contains the chunk's
|
|
11
|
+
midpoint byte" at index time. Deepest section wins on overlap.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import re
|
|
18
|
+
import shutil
|
|
19
|
+
from collections.abc import Sequence
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
import lancedb
|
|
24
|
+
import pyarrow as pa
|
|
25
|
+
from pydantic import BaseModel, ConfigDict
|
|
26
|
+
|
|
27
|
+
from cairn.core.types import Document, SectionNode
|
|
28
|
+
from cairn.embed.base import Embedder
|
|
29
|
+
from cairn.index.vectors import l2_normalize
|
|
30
|
+
|
|
31
|
+
_WORD = re.compile(r"\S+")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class _Chunk(BaseModel):
    # Internal record for one window of source text produced by the chunker.
    # Instances are immutable and reject unknown fields.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Sequential index of the chunk in chunking order.
    chunk_id: int
    # The chunk's text, sliced straight out of the source string.
    text: str
    # Start/end offsets of the chunk within the source text (end exclusive).
    byte_start: int
    byte_end: int
    # Id of the section the chunk was mapped to, or None when no section matched.
    section_id: str | None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class NaiveHit(BaseModel):
    """A single chunk returned by the naive baseline's retrieval."""

    # Immutable; unknown fields are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Index of the matching chunk in chunking order.
    chunk_id: int
    # Section the chunk was mapped to at index time, or None if unmapped.
    section_id: str | None
    # The chunk's stored text.
    text: str
    # Similarity score for the hit.
    score: float
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def chunk_text(text: str, *, chunk_size_words: int = 512) -> list[tuple[int, int, str]]:
    """Cut ``text`` into consecutive windows of ``chunk_size_words`` words.

    A "word" is any maximal run of non-whitespace characters. Every window
    except possibly the last holds exactly ``chunk_size_words`` words.

    Returns one ``(start, end, chunk)`` tuple per window, where ``chunk`` is
    ``text[start:end]``. The offsets are indices into the ``str`` as reported
    by the regex matches (end exclusive).
    NOTE(review): callers label these offsets as bytes; for non-ASCII text a
    ``str`` index is not a UTF-8 byte offset — confirm what section spans use.

    Raises:
        ValueError: if ``chunk_size_words`` is smaller than 1.
    """
    if chunk_size_words < 1:
        msg = f"chunk_size_words must be >= 1; got {chunk_size_words}"
        raise ValueError(msg)

    # (start, end) of every whitespace-delimited word, in document order.
    word_spans = [match.span() for match in re.finditer(r"\S+", text)]

    pieces: list[tuple[int, int, str]] = []
    for first in range(0, len(word_spans), chunk_size_words):
        window = word_spans[first : first + chunk_size_words]
        begin = window[0][0]
        finish = window[-1][1]
        pieces.append((begin, finish, text[begin:finish]))
    return pieces
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def assign_section(
    chunk_start: int,
    chunk_end: int,
    sections: Sequence[SectionNode],
) -> str | None:
    """Map a chunk to the id of the deepest section covering its midpoint.

    The midpoint is ``(chunk_start + chunk_end) // 2``. A section covers it
    when ``span.start <= midpoint < span.end``. Among covering sections the
    one with the greatest ``level`` wins; on equal levels the earliest one in
    ``sections`` is kept. Returns ``None`` when nothing covers the midpoint.
    """
    midpoint = (chunk_start + chunk_end) // 2
    covering = [
        candidate
        for candidate in sections
        if candidate.span.start <= midpoint < candidate.span.end
    ]
    if not covering:
        return None
    # max() returns the first maximal element, matching "earliest wins" on ties.
    return max(covering, key=lambda candidate: candidate.level).id
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class NaiveRAG:
    """Structure-blind chunk + vector search baseline.

    Source text is cut into fixed-size word windows by :func:`chunk_text`,
    each window is embedded with the supplied embedder, and the vectors are
    stored in a LanceDB table under ``<out_dir>/naive.lance``. Retrieval is a
    cosine search over that table.
    """

    name = "naive-rag"
    table_name = "chunks"

    def __init__(
        self,
        embedder: Embedder,
        *,
        chunk_size_words: int = 512,
        batch_size: int = 32,
    ) -> None:
        """Store the embedder and chunking/batching parameters.

        Args:
            embedder: Used to embed both chunks and queries.
            chunk_size_words: Words per chunk; must be >= 1.
            batch_size: Chunks sent to the embedder per call; must be >= 1.

        Raises:
            ValueError: if ``chunk_size_words`` or ``batch_size`` is below 1.
        """
        # Fail fast: chunk_text would reject this value anyway, but only once
        # index() runs, far from the misconfiguration.
        if chunk_size_words < 1:
            msg = f"chunk_size_words must be >= 1; got {chunk_size_words}"
            raise ValueError(msg)
        if batch_size < 1:
            msg = f"batch_size must be >= 1; got {batch_size}"
            raise ValueError(msg)
        self.embedder = embedder
        self.chunk_size_words = chunk_size_words
        self.batch_size = batch_size

    async def index(
        self,
        document: Document,
        source_text: str,
        *,
        out_dir: Path,
    ) -> None:
        """Chunk ``source_text``, embed the chunks and (re)write the table.

        Any existing ``naive.lance`` directory under ``out_dir`` is replaced.
        When the text yields no chunks an empty table is written instead.
        """
        chunks = self._build_chunks(document, source_text)
        if not chunks:
            await asyncio.to_thread(self._write_empty_table, out_dir)
            return

        vectors: list[list[float]] = []
        for i in range(0, len(chunks), self.batch_size):
            batch = chunks[i : i + self.batch_size]
            raw = await self.embedder.embed([c.text for c in batch])
            vectors.extend(l2_normalize(v) for v in raw)

        # Table writes are blocking; keep them off the event loop.
        await asyncio.to_thread(self._write_table, out_dir, chunks, vectors)

    def _build_chunks(self, document: Document, source_text: str) -> list[_Chunk]:
        """Chunk the text and tag each chunk with the section at its midpoint."""
        sections = document.sections
        return [
            _Chunk(
                chunk_id=chunk_id,
                text=text,
                byte_start=start,
                byte_end=end,
                section_id=assign_section(start, end, sections),
            )
            for chunk_id, (start, end, text) in enumerate(
                chunk_text(source_text, chunk_size_words=self.chunk_size_words)
            )
        ]

    def _schema(self) -> pa.Schema:
        """Arrow schema of the chunk table; vector width is ``embedder.dim``."""
        return pa.schema(
            [
                pa.field("chunk_id", pa.int64()),
                pa.field("section_id", pa.string()),
                pa.field("text", pa.string()),
                pa.field("vector", pa.list_(pa.float32(), self.embedder.dim)),
            ]
        )

    def _create_fresh_table(self, out_dir: Path) -> Any:
        """Delete any previous ``naive.lance`` directory and create an empty table."""
        db_path = out_dir / "naive.lance"
        if db_path.exists():
            shutil.rmtree(db_path)
        db = lancedb.connect(str(db_path))
        return db.create_table(self.table_name, schema=self._schema())

    def _write_table(
        self,
        out_dir: Path,
        chunks: list[_Chunk],
        vectors: list[list[float]],
    ) -> None:
        """Replace the table with one row per ``(chunk, vector)`` pair.

        ``zip(..., strict=True)`` raises ``ValueError`` if the embedder
        returned a different number of vectors than there are chunks.
        """
        table = self._create_fresh_table(out_dir)
        records: list[dict[str, Any]] = [
            {
                "chunk_id": c.chunk_id,
                # A missing section is stored as ""; _sync_retrieve maps it back to None.
                "section_id": c.section_id or "",
                "text": c.text,
                "vector": v,
            }
            for c, v in zip(chunks, vectors, strict=True)
        ]
        table.add(records)

    def _write_empty_table(self, out_dir: Path) -> None:
        """Replace the table with an empty one that has the usual schema."""
        self._create_fresh_table(out_dir)

    async def retrieve(
        self,
        query: str,
        *,
        out_dir: Path,
        k: int = 8,
    ) -> list[NaiveHit]:
        """Return up to ``k`` hits for ``query`` from the table under ``out_dir``.

        Returns an empty list when the embedder yields nothing for the query.

        Raises:
            ValueError: if ``k`` is below 1.
        """
        if k < 1:
            msg = f"k must be >= 1; got {k}"
            raise ValueError(msg)
        embedded = await self.embedder.embed([query])
        if not embedded:
            return []
        query_vec = l2_normalize(embedded[0])
        return await asyncio.to_thread(self._sync_retrieve, out_dir, query_vec, k)

    def _sync_retrieve(
        self,
        out_dir: Path,
        query_vec: list[float],
        k: int,
    ) -> list[NaiveHit]:
        """Blocking cosine search; converts result rows into ``NaiveHit`` objects."""
        db = lancedb.connect(str(out_dir / "naive.lance"))
        table = db.open_table(self.table_name)
        rows = (
            table.search(query_vec)
            .distance_type("cosine")
            .limit(k)
            .to_list()
        )
        hits: list[NaiveHit] = []
        for row in rows:
            distance = float(row["_distance"])
            # Score is 1 - cosine distance, clamped into [0, 1].
            score = max(0.0, min(1.0, 1.0 - distance))
            section_id = row.get("section_id") or None
            hits.append(
                NaiveHit(
                    chunk_id=int(row["chunk_id"]),
                    section_id=section_id,
                    text=str(row["text"]),
                    score=score,
                )
            )
        return hits
|
cairn/bench/dataset.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Bench dataset format — Pydantic models + TOML loader.
|
|
2
|
+
|
|
3
|
+
A suite lives in a single TOML file (loaded via stdlib ``tomllib``). The
|
|
4
|
+
shape::
|
|
5
|
+
|
|
6
|
+
name = "Cairn Architecture v0.2.2"
|
|
7
|
+
|
|
8
|
+
[[documents]]
|
|
9
|
+
id = "architecture"
|
|
10
|
+
source = "ARCHITECTURE.md"
|
|
11
|
+
|
|
12
|
+
[[documents.questions]]
|
|
13
|
+
id = "vector-store"
|
|
14
|
+
question = "What is the default vector store?"
|
|
15
|
+
expected_anchors = ["2-5-vectors-v-semantic-overlay", "7-tech-stack"]
|
|
16
|
+
tags = ["definition", "tech-stack"]
|
|
17
|
+
|
|
18
|
+
``expected_anchors`` use substring matching against retrieved section_ids
|
|
19
|
+
so authors can use short suffixes instead of full hierarchical slugs.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import tomllib
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
28
|
+
|
|
29
|
+
from cairn.core.errors import ConfigError
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class BenchQuestion(BaseModel):
    """A single benchmark question together with its ground-truth targets."""

    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    id: str
    question: str
    expected_anchors: tuple[str, ...] = Field(
        default=(),
        description=(
            "Substrings to match against retrieved section_ids. "
            "Empty tuple = no recall computed for this question."
        ),
    )
    tags: tuple[str, ...] = ()
    reference: str | None = Field(
        default=None,
        description="Optional free-form reference answer for LLM-judged QA.",
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class BenchDocument(BaseModel):
    """A source document plus the questions that are asked against it."""

    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    id: str
    source: Path = Field(
        description="Path to the source document, relative to the suite file.",
    )
    questions: tuple[BenchQuestion, ...]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class BenchSuite(BaseModel):
    """A named benchmark suite: an ordered collection of bench documents."""

    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    name: str
    documents: tuple[BenchDocument, ...]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def load_suite(path: Path) -> BenchSuite:
    """Parse a TOML bench file into a :class:`BenchSuite`.

    Each document's ``source`` is resolved relative to the directory that
    contains ``path``. The suite name defaults to the file stem when the TOML
    has no top-level ``name``.

    Raises:
        ConfigError: if ``path`` does not exist, the file is not valid TOML,
            or a document/question table lacks a required key (``id``,
            ``source``, ``question``).
    """
    if not path.exists():
        msg = f"bench suite not found: {path}"
        raise ConfigError(msg, details={"path": str(path)})

    try:
        with path.open("rb") as fh:
            payload = tomllib.load(fh)
    except tomllib.TOMLDecodeError as exc:
        # Report syntax errors through the same error type as a missing file
        # so callers have a single failure mode for "bad suite file".
        msg = f"bench suite is not valid TOML: {path}"
        raise ConfigError(
            msg, details={"path": str(path), "error": str(exc)}
        ) from exc

    suite_dir = path.parent
    documents: list[BenchDocument] = []
    try:
        for doc in payload.get("documents", []):
            questions = tuple(
                BenchQuestion(
                    id=q["id"],
                    question=q["question"],
                    expected_anchors=tuple(q.get("expected_anchors", ())),
                    tags=tuple(q.get("tags", ())),
                    reference=q.get("reference"),
                )
                for q in doc.get("questions", [])
            )
            documents.append(
                BenchDocument(
                    id=doc["id"],
                    source=(suite_dir / doc["source"]).resolve(),
                    questions=questions,
                )
            )
    except KeyError as exc:
        # exc.args[0] is the name of the missing TOML key.
        msg = f"bench suite entry is missing required key {exc.args[0]!r}: {path}"
        raise ConfigError(
            msg, details={"path": str(path), "key": str(exc.args[0])}
        ) from exc

    return BenchSuite(
        name=payload.get("name", path.stem),
        documents=tuple(documents),
    )
|
cairn/bench/judge.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""LLM-as-judge: optional QA accuracy dimension for cairn-bench.
|
|
2
|
+
|
|
3
|
+
For each question + retrieved context the judge:
|
|
4
|
+
|
|
5
|
+
1. Asks an LLM to answer the question using only the provided context.
|
|
6
|
+
2. Asks the same LLM to evaluate whether that answer matches the
|
|
7
|
+
reference answer the suite author supplied.
|
|
8
|
+
|
|
9
|
+
Both calls go through an OpenAI-compatible ``/v1/chat/completions``
|
|
10
|
+
endpoint, so the judge runs against Ollama (default), OpenAI, vLLM, etc.
|
|
11
|
+
|
|
12
|
+
The judge is **optional**: when no ``LLMJudge`` is configured on the
|
|
13
|
+
:class:`cairn.bench.runner.BenchRunner`, QA accuracy is simply not
|
|
14
|
+
reported. Recall and token cost still come out either way.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
import httpx
|
|
22
|
+
|
|
23
|
+
from cairn.core.errors import IndexBuildError
|
|
24
|
+
|
|
25
|
+
_ANSWER_SYSTEM = (
|
|
26
|
+
"You answer questions strictly from the provided context. "
|
|
27
|
+
'If the context is insufficient, reply with "I don\'t know."'
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
_JUDGE_SYSTEM = (
|
|
31
|
+
"You evaluate whether an AI assistant's answer is correct given a "
|
|
32
|
+
"reference answer. Reply with YES or NO on the first line, then one "
|
|
33
|
+
"sentence of justification."
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class LLMJudge:
    """Generates answers from retrieved context and judges them against a reference.

    Both steps are single chat-completion requests posted to
    ``{base_url}/chat/completions``.
    """

    def __init__(
        self,
        *,
        base_url: str = "http://localhost:11434/v1",
        model: str = "llama3.2:3b",
        api_key: str | None = None,
        timeout: float = 60.0,
        temperature: float = 0.0,
    ) -> None:
        """Record endpoint settings; no network traffic happens here.

        Args:
            base_url: Endpoint root; a trailing ``/`` is stripped.
            model: Model name sent in every request.
            api_key: When set, sent as a ``Bearer`` authorization header.
            timeout: Passed to the HTTP client as its timeout.
            temperature: Sampling temperature sent in every request.
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.timeout = timeout
        self.temperature = temperature
        self.name = f"openai-compat:{model}"

    async def answer(self, question: str, context: str) -> str:
        """Produce an answer to ``question`` using only ``context``.

        A blank context is replaced by a ``(no context provided)`` marker.
        """
        prompt = (
            "Context:\n"
            f"{context.strip() or '(no context provided)'}\n\n"
            f"Question: {question}\n\nAnswer:"
        )
        return await self._chat(
            [
                {"role": "system", "content": _ANSWER_SYSTEM},
                {"role": "user", "content": prompt},
            ]
        )

    async def judge(
        self,
        question: str,
        reference: str,
        answer: str,
    ) -> tuple[bool, str]:
        """Return ``(is_correct, raw_response)`` for one (question, answer) pair.

        ``is_correct`` is derived from the first line of the model's reply;
        see :meth:`_parse_verdict`.
        """
        prompt = (
            f"Question: {question}\n\n"
            f"Reference answer: {reference}\n\n"
            f"Assistant's answer: {answer}\n\n"
            "Is the assistant's answer correct?"
        )
        raw = await self._chat(
            [
                {"role": "system", "content": _JUDGE_SYSTEM},
                {"role": "user", "content": prompt},
            ]
        )
        return self._parse_verdict(raw), raw

    @staticmethod
    def _parse_verdict(raw: str) -> bool:
        """Return True when the first line of ``raw`` opens with the word YES.

        Matching is case-insensitive. Leading non-alphanumeric decoration
        (markdown emphasis, quotes, bullets) is skipped so ``**Yes**`` counts,
        and YES must end at a word boundary so ``Yesterday ...`` does not.
        """
        first_line = raw.strip().split("\n", 1)[0].upper()
        start = 0
        while start < len(first_line) and not first_line[start].isalnum():
            start += 1
        verdict = first_line[start : start + 3]
        following = first_line[start + 3 : start + 4]
        # "".isalnum() is False, so a bare "YES" at end of line is accepted.
        return verdict == "YES" and not following.isalnum()

    async def _chat(self, messages: list[dict[str, str]]) -> str:
        """POST one chat-completion request and return the reply text, stripped.

        A ``null`` message content yields ``""`` rather than the text "None".

        Raises:
            IndexBuildError: on an HTTP status >= 400, a body that is not
                JSON, or JSON lacking ``choices[0].message.content``.
        """
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        payload: dict[str, Any] = {
            "model": self.model,
            "temperature": self.temperature,
            "messages": messages,
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(
                f"{self.base_url}/chat/completions",
                json=payload,
                headers=headers,
            )
            if response.status_code >= 400:
                msg = (
                    f"judge endpoint returned HTTP {response.status_code}: "
                    f"{response.text[:200]}"
                )
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                    },
                )
            try:
                data = response.json()
            except ValueError as exc:
                # JSON decode errors are ValueError subclasses; report them
                # like every other endpoint failure.
                msg = "judge endpoint returned a non-JSON body"
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                    },
                ) from exc
            try:
                content = data["choices"][0]["message"]["content"]
            except (KeyError, IndexError, TypeError) as exc:
                msg = "judge response did not match OpenAI chat-completions shape"
                raise IndexBuildError(msg, details={"response": data}) from exc
            return "" if content is None else str(content).strip()
|
cairn/bench/metrics.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Recall@k computation for bench questions.
|
|
2
|
+
|
|
3
|
+
An ``expected_anchor`` is considered matched if it appears as a substring
|
|
4
|
+
of any retrieved section id within the top ``k`` results. This is a
|
|
5
|
+
deliberate convenience: authoring full hierarchical slugs in YAML/TOML is
|
|
6
|
+
brittle, so we accept short suffixes that uniquely identify a section.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def recall_at_k(
    retrieved_section_ids: Sequence[str],
    expected_anchors: Sequence[str],
    *,
    k: int,
) -> float:
    """Return the fraction of expected anchors found in the top-k retrieval.

    An anchor counts as found when it is a substring of at least one of the
    first ``k`` retrieved section ids.

    Returns ``1.0`` when ``expected_anchors`` is empty (vacuously true).
    A ``k`` of zero or less selects no results, so the recall is ``0.0``.
    """
    if not expected_anchors:
        return 1.0

    # Clamp k: a negative slice bound would mean "all but the last |k| ids",
    # which would silently inflate recall instead of selecting nothing.
    top_k = retrieved_section_ids[: max(k, 0)]
    matched = sum(
        1
        for expected in expected_anchors
        if any(expected in retrieved for retrieved in top_k)
    )
    return matched / len(expected_anchors)
|
cairn/bench/report.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Bench result types + JSON / markdown report writers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import statistics
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Literal
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
System = Literal["cairn", "naive"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SystemResult(BaseModel):
    """What one system returned for one question, with its metrics."""

    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Which of the two compared systems produced this result.
    system: System
    section_ids: tuple[str, ...]
    # Constrained to the closed interval [0, 1].
    recall_at_k: float = Field(ge=0.0, le=1.0)
    # Constrained to be non-negative.
    tokens_returned: int = Field(ge=0)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class QuestionResult(BaseModel):
    """One question's outcome, holding the result of each compared system."""

    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    document_id: str
    question_id: str
    question: str
    expected_anchors: tuple[str, ...]
    tags: tuple[str, ...]
    # Per-system results for this question.
    cairn: SystemResult
    naive: SystemResult
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class BenchSummary(BaseModel):
    """Complete output of a bench run: settings plus every question result."""

    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    suite_name: str
    k: int
    questions: tuple[QuestionResult, ...]

    def cairn_mean_recall(self) -> float:
        """Mean ``recall_at_k`` of the cairn results (0.0 with no questions)."""
        recalls = [result.cairn.recall_at_k for result in self.questions]
        return _mean(recalls)

    def naive_mean_recall(self) -> float:
        """Mean ``recall_at_k`` of the naive results (0.0 with no questions)."""
        recalls = [result.naive.recall_at_k for result in self.questions]
        return _mean(recalls)

    def cairn_mean_tokens(self) -> float:
        """Mean ``tokens_returned`` of the cairn results (0.0 with no questions)."""
        counts = [float(result.cairn.tokens_returned) for result in self.questions]
        return _mean(counts)

    def naive_mean_tokens(self) -> float:
        """Mean ``tokens_returned`` of the naive results (0.0 with no questions)."""
        counts = [float(result.naive.tokens_returned) for result in self.questions]
        return _mean(counts)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _mean(values: Iterable[float]) -> float:
|
|
65
|
+
materialized = list(values)
|
|
66
|
+
return statistics.mean(materialized) if materialized else 0.0
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Report writers
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def write_json_report(summary: BenchSummary, path: Path) -> Path:
|
|
75
|
+
"""Write a deterministic JSON report next to the bench's working dir."""
|
|
76
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
77
|
+
payload = {
|
|
78
|
+
"generated_at": datetime.now(UTC).isoformat(),
|
|
79
|
+
"suite": summary.suite_name,
|
|
80
|
+
"k": summary.k,
|
|
81
|
+
"summary": {
|
|
82
|
+
"cairn_mean_recall_at_k": summary.cairn_mean_recall(),
|
|
83
|
+
"naive_mean_recall_at_k": summary.naive_mean_recall(),
|
|
84
|
+
"cairn_mean_tokens": summary.cairn_mean_tokens(),
|
|
85
|
+
"naive_mean_tokens": summary.naive_mean_tokens(),
|
|
86
|
+
},
|
|
87
|
+
"questions": [q.model_dump(mode="json") for q in summary.questions],
|
|
88
|
+
}
|
|
89
|
+
with path.open("w", encoding="utf-8") as fh:
|
|
90
|
+
json.dump(payload, fh, ensure_ascii=False, indent=2)
|
|
91
|
+
fh.write("\n")
|
|
92
|
+
return path
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def format_markdown_report(summary: BenchSummary) -> str:
|
|
96
|
+
"""Render a human-readable comparison table."""
|
|
97
|
+
n = len(summary.questions)
|
|
98
|
+
cairn_recall = summary.cairn_mean_recall()
|
|
99
|
+
naive_recall = summary.naive_mean_recall()
|
|
100
|
+
cairn_tokens = summary.cairn_mean_tokens()
|
|
101
|
+
naive_tokens = summary.naive_mean_tokens()
|
|
102
|
+
|
|
103
|
+
token_ratio = (
|
|
104
|
+
cairn_tokens / naive_tokens if naive_tokens > 0 else float("nan")
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
lines: list[str] = []
|
|
108
|
+
lines.append(f"# {summary.suite_name}")
|
|
109
|
+
lines.append("")
|
|
110
|
+
lines.append(f"Questions: **{n}** · k = **{summary.k}**")
|
|
111
|
+
lines.append("")
|
|
112
|
+
lines.append("## Headline")
|
|
113
|
+
lines.append("")
|
|
114
|
+
lines.append("| metric | naive vector RAG | Cairn |")
|
|
115
|
+
lines.append("|---|---:|---:|")
|
|
116
|
+
lines.append(
|
|
117
|
+
f"| mean recall@{summary.k} | {naive_recall:.2%} | **{cairn_recall:.2%}** |"
|
|
118
|
+
)
|
|
119
|
+
lines.append(
|
|
120
|
+
f"| mean tokens returned | {naive_tokens:,.0f} | **{cairn_tokens:,.0f}** "
|
|
121
|
+
f"({token_ratio:.1%} of naive) |"
|
|
122
|
+
)
|
|
123
|
+
lines.append("")
|
|
124
|
+
|
|
125
|
+
lines.append("## Per-question")
|
|
126
|
+
lines.append("")
|
|
127
|
+
lines.append("| question | recall (cairn) | recall (naive) | tokens (cairn / naive) |")
|
|
128
|
+
lines.append("|---|---:|---:|---:|")
|
|
129
|
+
for q in summary.questions:
|
|
130
|
+
lines.append(
|
|
131
|
+
f"| `{q.question_id}` {_truncate(q.question, 60)} "
|
|
132
|
+
f"| {q.cairn.recall_at_k:.0%} "
|
|
133
|
+
f"| {q.naive.recall_at_k:.0%} "
|
|
134
|
+
f"| {q.cairn.tokens_returned} / {q.naive.tokens_returned} |"
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
return "\n".join(lines) + "\n"
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _truncate(text: str, width: int) -> str:
|
|
141
|
+
if len(text) <= width:
|
|
142
|
+
return text
|
|
143
|
+
return text[: width - 1] + "…"
|