docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/bench/runner.py ADDED
@@ -0,0 +1,219 @@
1
+ """Bench orchestrator — runs a suite end-to-end against Cairn and the baseline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Callable
6
+ from pathlib import Path
7
+
8
+ from pydantic import BaseModel, ConfigDict, Field
9
+
10
+ from cairn.bench.baseline import NaiveHit, NaiveRAG
11
+ from cairn.bench.dataset import BenchDocument, BenchQuestion, BenchSuite
12
+ from cairn.bench.judge import LLMJudge
13
+ from cairn.bench.metrics import recall_at_k
14
+ from cairn.bench.report import BenchSummary, QuestionResult, SystemResult
15
+ from cairn.embed.base import Embedder
16
+ from cairn.engine.indexer import Indexer
17
+ from cairn.entity.heuristic import HeuristicExtractor
18
+ from cairn.ingest.markdown import MarkdownParser
19
+ from cairn.summarize.base import Summarizer
20
+ from cairn.tools.base import DocumentIndex, estimate_tokens
21
+ from cairn.tools.search_semantic import search_semantic
22
+ from cairn.xref.heuristic import HeuristicXRefExtractor
23
+
24
+
25
class BenchOptions(BaseModel):
    """Tuning knobs for a single bench run.

    Instances are frozen and unknown fields are rejected (see
    ``model_config``).
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    # Number of hits requested from each system; also the cut-off handed to
    # ``recall_at_k``. Constrained to 1..32.
    k: int = Field(8, ge=1, le=32)
    # Chunk size, in words, passed to the naive baseline.
    naive_chunk_size_words: int = Field(512, ge=1)
    # Concurrency limit passed to the Cairn indexer for summaries.
    summary_concurrency: int = Field(4, ge=1)
    # Embedding batch size shared by the Cairn indexer and the naive baseline.
    embed_batch_size: int = Field(32, ge=1)
32
+
33
+
34
class BenchRunner:
    """Runs a :class:`BenchSuite` against Cairn and a naive baseline.

    For every document in the suite the runner builds a Cairn index and a
    naive index, retrieves ``options.k`` hits per question from both systems
    and scores them with ``recall_at_k``. When a judge is configured and the
    question carries a reference answer, the judge also answers the question
    from each system's retrieved context and grades that answer.
    """

    def __init__(
        self,
        *,
        summarizer: Summarizer,
        embedder: Embedder,
        judge: LLMJudge | None = None,
        options: BenchOptions | None = None,
        progress: Callable[[str], None] | None = None,
    ) -> None:
        """Store the collaborators used by :meth:`run`.

        Args:
            summarizer: Summarizer handed to the Cairn indexer.
            embedder: Embedder shared by the Cairn indexer, the semantic
                search and the naive baseline.
            judge: Optional judge; QA grading is skipped when ``None``.
            options: Run options; defaults to ``BenchOptions()``.
            progress: Optional callback receiving human-readable progress
                messages.
        """
        self.summarizer = summarizer
        self.embedder = embedder
        self.judge = judge
        self.options = options if options is not None else BenchOptions()
        self.progress = progress

    async def run(self, suite: BenchSuite, *, work_dir: Path) -> BenchSummary:
        """Run every document of ``suite`` and collect the per-question results.

        Args:
            suite: The suite to evaluate.
            work_dir: Directory (created if missing) under which one
                sub-directory per document id is used for index artefacts.

        Returns:
            A :class:`BenchSummary` holding the suite name, the configured
            ``k`` and all question results in suite order.
        """
        work_dir.mkdir(parents=True, exist_ok=True)
        results: list[QuestionResult] = []
        for document in suite.documents:
            self._emit(f"document {document.id}: starting")
            results.extend(await self._run_document(document, work_dir / document.id))
            self._emit(f"document {document.id}: done")
        return BenchSummary(
            suite_name=suite.name,
            k=self.options.k,
            questions=tuple(results),
        )

    async def _run_document(
        self,
        document: BenchDocument,
        doc_dir: Path,
    ) -> list[QuestionResult]:
        """Index one document with both systems and evaluate its questions.

        Args:
            document: The bench document (source path, id and questions).
            doc_dir: Working directory for this document; ``cairn`` and
                ``naive`` sub-directories are created inside it.

        Returns:
            One :class:`QuestionResult` per question, in question order.
        """
        doc_dir.mkdir(parents=True, exist_ok=True)
        source_text = document.source.read_text(encoding="utf-8")

        cairn_dir = doc_dir / "cairn"
        naive_dir = doc_dir / "naive"
        cairn_dir.mkdir(parents=True, exist_ok=True)
        naive_dir.mkdir(parents=True, exist_ok=True)

        parser = MarkdownParser()
        parsed = parser.parse(document.source, doc_id=document.id)

        indexer = Indexer(
            parser=parser,
            summarizer=self.summarizer,
            embedder=self.embedder,
            entity_extractor=HeuristicExtractor(),
            xref_extractor=HeuristicXRefExtractor(),
            summary_concurrency=self.options.summary_concurrency,
            embed_batch_size=self.options.embed_batch_size,
            progress=lambda message: self._emit(f"cairn index: {message}"),
        )
        self._emit("cairn index: starting")
        await indexer.index_document(parsed, out_dir=cairn_dir)
        cairn_index = DocumentIndex.load(cairn_dir)
        # Report "loaded" only once the load has actually succeeded, so a
        # failing load is never announced as complete.
        self._emit("cairn index: loaded")

        naive = NaiveRAG(
            self.embedder,
            chunk_size_words=self.options.naive_chunk_size_words,
            batch_size=self.options.embed_batch_size,
        )
        self._emit("naive index: starting")
        await naive.index(parsed, source_text, out_dir=naive_dir)
        self._emit("naive index: done")

        results: list[QuestionResult] = []
        total_questions = len(document.questions)
        for question_no, question in enumerate(document.questions, start=1):
            self._emit(
                f"question {question_no}/{total_questions} {question.id}: retrieving"
            )
            results.append(
                await self._run_question(
                    document,
                    question,
                    cairn_index=cairn_index,
                    naive=naive,
                    naive_dir=naive_dir,
                )
            )
        return results

    async def _run_question(
        self,
        document: BenchDocument,
        question: BenchQuestion,
        *,
        cairn_index: DocumentIndex,
        naive: NaiveRAG,
        naive_dir: Path,
    ) -> QuestionResult:
        """Retrieve with both systems, optionally judge, and pair the results."""
        cairn_result, cairn_context = await self._run_cairn(cairn_index, question)
        naive_result, naive_context = await self._run_naive(
            naive, question, naive_dir
        )

        # ``_judge_result`` returns its input untouched when there is no judge
        # or no reference answer, so it is the single place that guard lives.
        cairn_result = await self._judge_result(
            cairn_result, question, cairn_context
        )
        naive_result = await self._judge_result(
            naive_result, question, naive_context
        )

        return QuestionResult(
            document_id=document.id,
            question_id=question.id,
            question=question.question,
            expected_anchors=question.expected_anchors,
            tags=question.tags,
            cairn=cairn_result,
            naive=naive_result,
        )

    def _emit(self, message: str) -> None:
        """Forward ``message`` to the progress callback, if one was given."""
        if self.progress is not None:
            self.progress(message)

    async def _judge_result(
        self,
        result: SystemResult,
        question: BenchQuestion,
        context: str,
    ) -> SystemResult:
        """Attach the judge's answer and verdict to ``result``.

        Args:
            result: Retrieval result to annotate.
            question: Question being evaluated; needs a reference answer.
            context: Retrieved context the judge answers from.

        Returns:
            ``result`` unchanged when no judge is configured or the question
            has no reference; otherwise a copy with ``qa_correct`` and
            ``qa_answer`` set.
        """
        if self.judge is None or question.reference is None:
            return result
        answer = await self.judge.answer(question.question, context)
        # The second element of the verdict tuple is not used here.
        is_correct, _ = await self.judge.judge(
            question.question, question.reference, answer
        )
        return result.model_copy(
            update={"qa_correct": is_correct, "qa_answer": answer}
        )

    async def _run_cairn(
        self,
        index: DocumentIndex,
        question: BenchQuestion,
    ) -> tuple[SystemResult, str]:
        """Query Cairn's semantic search and score the returned hits.

        Returns:
            The ``"cairn"`` :class:`SystemResult` and the context string
            built from the hits for the judge.
        """
        response = await search_semantic(
            index,
            embedder=self.embedder,
            query=question.question,
            k=self.options.k,
        )
        hits = response.data["hits"]
        section_ids = [hit["id"] for hit in hits]
        recall = recall_at_k(
            section_ids, question.expected_anchors, k=self.options.k
        )
        result = SystemResult(
            system="cairn",
            section_ids=tuple(section_ids),
            recall_at_k=recall,
            tokens_returned=response.tokens_returned,
        )
        return result, _format_cairn_context(hits)

    async def _run_naive(
        self,
        naive: NaiveRAG,
        question: BenchQuestion,
        naive_dir: Path,
    ) -> tuple[SystemResult, str]:
        """Query the naive baseline and score the returned chunks.

        Returns:
            The ``"naive"`` :class:`SystemResult` and the context string
            built from the chunk texts for the judge.
        """
        hits = await naive.retrieve(
            question.question, out_dir=naive_dir, k=self.options.k
        )
        # Hits without a section id keep their rank as an empty string.
        section_ids = [hit.section_id or "" for hit in hits]
        recall = recall_at_k(
            section_ids, question.expected_anchors, k=self.options.k
        )
        tokens = sum(estimate_tokens(hit.text) for hit in hits)
        result = SystemResult(
            system="naive",
            section_ids=tuple(section_ids),
            recall_at_k=recall,
            tokens_returned=tokens,
        )
        return result, _format_naive_context(hits)
205
+
206
+
207
def _format_cairn_context(hits: list[dict[str, object]]) -> str:
    """Render Cairn search hits as a single context string.

    Each hit becomes a ``## <title>`` heading followed by its ``synopsis``,
    or by its ``head`` when the synopsis is missing or empty. The rendered
    hits are joined with a ``---`` rule.

    Fields that are absent or explicitly ``None`` are treated as empty text,
    so a ``None`` synopsis falls back to ``head`` instead of producing the
    literal text ``"None"``.

    Args:
        hits: Hit mappings; only ``title``, ``synopsis`` and ``head`` are read.

    Returns:
        The joined context, or an empty string when ``hits`` is empty.
    """

    def _text(hit: dict[str, object], key: str) -> str:
        # ``str(None)`` would yield "None", which is truthy and would both
        # pollute the context and defeat the synopsis -> head fallback.
        value = hit.get(key)
        return "" if value is None else str(value)

    parts: list[str] = []
    for hit in hits:
        title = _text(hit, "title")
        body = _text(hit, "synopsis") or _text(hit, "head")
        parts.append(f"## {title}\n\n{body}".strip())
    return "\n\n---\n\n".join(parts)
216
+
217
+
218
def _format_naive_context(hits: list[NaiveHit]) -> str:
    """Join the text of each naive hit into one context string.

    Chunks keep their retrieval order and are separated by a ``---`` rule;
    an empty hit list yields an empty string.
    """
    separator = "\n\n---\n\n"
    chunk_texts = [hit.text for hit in hits]
    return separator.join(chunk_texts)
cairn/cli/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Cairn CLI — typer-based entry point."""
2
+
3
+ from cairn.cli.app import app
4
+
5
+ __all__ = ["app"]