athenaeum-kb 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- athenaeum/__init__.py +28 -0
- athenaeum/athenaeum.py +286 -0
- athenaeum/chunker.py +77 -0
- athenaeum/config.py +16 -0
- athenaeum/document_store.py +53 -0
- athenaeum/models.py +88 -0
- athenaeum/ocr/__init__.py +48 -0
- athenaeum/ocr/base.py +28 -0
- athenaeum/ocr/custom.py +26 -0
- athenaeum/ocr/docling.py +30 -0
- athenaeum/ocr/lighton.py +40 -0
- athenaeum/ocr/markitdown.py +29 -0
- athenaeum/ocr/mistral.py +44 -0
- athenaeum/search/__init__.py +7 -0
- athenaeum/search/bm25.py +72 -0
- athenaeum/search/hybrid.py +37 -0
- athenaeum/search/vector.py +72 -0
- athenaeum/storage.py +68 -0
- athenaeum/toc.py +49 -0
- athenaeum_kb-0.1.0.dist-info/METADATA +165 -0
- athenaeum_kb-0.1.0.dist-info/RECORD +22 -0
- athenaeum_kb-0.1.0.dist-info/WHEEL +4 -0
athenaeum/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Athenaeum - Tools for intelligent interaction with knowledge bases."""
|
|
2
|
+
|
|
3
|
+
from athenaeum.athenaeum import Athenaeum
|
|
4
|
+
from athenaeum.config import AthenaeumConfig
|
|
5
|
+
from athenaeum.models import (
|
|
6
|
+
ChunkMetadata,
|
|
7
|
+
ContentSearchHit,
|
|
8
|
+
Document,
|
|
9
|
+
Excerpt,
|
|
10
|
+
Metadata,
|
|
11
|
+
SearchHit,
|
|
12
|
+
TOCEntry,
|
|
13
|
+
)
|
|
14
|
+
from athenaeum.ocr import OCRProvider, get_ocr_provider
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"Athenaeum",
|
|
18
|
+
"AthenaeumConfig",
|
|
19
|
+
"ChunkMetadata",
|
|
20
|
+
"ContentSearchHit",
|
|
21
|
+
"Document",
|
|
22
|
+
"Excerpt",
|
|
23
|
+
"Metadata",
|
|
24
|
+
"OCRProvider",
|
|
25
|
+
"SearchHit",
|
|
26
|
+
"TOCEntry",
|
|
27
|
+
"get_ocr_provider",
|
|
28
|
+
]
|
athenaeum/athenaeum.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""Main orchestrator class for Athenaeum."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shutil
|
|
6
|
+
import uuid
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Literal
|
|
9
|
+
|
|
10
|
+
from langchain_core.embeddings import Embeddings
|
|
11
|
+
|
|
12
|
+
from athenaeum.chunker import chunk_markdown
|
|
13
|
+
from athenaeum.config import AthenaeumConfig
|
|
14
|
+
from athenaeum.document_store import DocumentStore
|
|
15
|
+
from athenaeum.models import ChunkMetadata, ContentSearchHit, Document, Excerpt, SearchHit
|
|
16
|
+
from athenaeum.ocr import OCRProvider, get_ocr_provider
|
|
17
|
+
from athenaeum.search.bm25 import BM25Index
|
|
18
|
+
from athenaeum.search.hybrid import reciprocal_rank_fusion
|
|
19
|
+
from athenaeum.search.vector import VectorIndex
|
|
20
|
+
from athenaeum.storage import StorageManager
|
|
21
|
+
from athenaeum.toc import extract_toc
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Athenaeum:
    """Main entry point for the Athenaeum knowledge base.

    Wires together storage, OCR conversion, chunking, and the two search
    indexes (BM25 keyword and vector), and exposes document-level operations:
    loading, listing, searching, and reading documents.
    """

    def __init__(
        self,
        embeddings: Embeddings,
        config: AthenaeumConfig | None = None,
        ocr_provider: OCRProvider | None = None,
    ) -> None:
        """Build the knowledge base, restoring any previously stored documents.

        Args:
            embeddings: LangChain embeddings used by the vector index.
            config: Optional configuration; defaults to ``AthenaeumConfig()``.
            ocr_provider: Optional OCR backend; defaults to markitdown.
        """
        self._config = config or AthenaeumConfig()
        self._storage = StorageManager(self._config.storage_dir)
        self._doc_store = DocumentStore(self._storage)
        self._ocr = ocr_provider or get_ocr_provider("markitdown")
        self._bm25 = BM25Index()
        # The vector index persists to disk (Chroma directory under storage).
        self._vector = VectorIndex(
            embeddings=embeddings,
            persist_directory=self._storage.ensure_chroma_dir(),
            collection_name="athenaeum",
        )
        # BM25 is in-memory only, so it must be rebuilt from stored markdown
        # on every startup; the vector index is already persisted.
        self._reindex_bm25()

    def _reindex_bm25(self) -> None:
        """Rebuild BM25 index from all stored documents."""
        for doc in self._doc_store.list_all():
            md_path = Path(doc.path_to_md)
            # Silently skip documents whose markdown file has gone missing.
            if md_path.exists():
                text = md_path.read_text()
                chunks = chunk_markdown(
                    text, doc.id, self._config.chunk_size, self._config.chunk_overlap
                )
                self._bm25.add_chunks(chunks)

    def load_doc(self, path: str) -> str:
        """Load a document into the knowledge base.

        Copies the raw file into storage, converts it to markdown via the OCR
        provider, extracts a table of contents, registers the document, and
        indexes its chunks in both BM25 and vector indexes.

        Args:
            path: Path to the document file.

        Returns:
            The document ID.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If the file extension is not supported by the OCR provider.
        """
        file_path = Path(path).resolve()
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        ext = file_path.suffix.lower()
        supported = self._ocr.supported_extensions()
        # ".*" is the wildcard returned by providers that accept any extension.
        if ".*" not in supported and ext not in supported:
            raise ValueError(
                f"Unsupported file type: {ext}. Supported: {sorted(supported)}"
            )

        # Short random ID; 12 hex chars keeps paths readable.
        doc_id = uuid.uuid4().hex[:12]

        # Copy raw file
        raw_dest = self._storage.raw_path(doc_id, ext)
        shutil.copy2(file_path, raw_dest)

        # Convert to markdown
        markdown = self._ocr.convert(file_path)
        md_dest = self._storage.content_md_path(doc_id)
        md_dest.write_text(markdown)

        # Extract metadata
        toc = extract_toc(markdown)
        lines = markdown.split("\n")

        doc = Document(
            id=doc_id,
            name=file_path.name,
            path_to_raw=str(raw_dest),
            path_to_md=str(md_dest),
            num_lines=len(lines),
            table_of_contents=toc,
            file_size=file_path.stat().st_size,
            file_type=ext,
        )
        self._doc_store.add(doc)

        # Index
        chunks = chunk_markdown(
            markdown, doc_id, self._config.chunk_size, self._config.chunk_overlap
        )
        self._bm25.add_chunks(chunks)
        self._vector.add_chunks(chunks)

        return doc_id

    def list_docs(self) -> list[SearchHit]:
        """List all documents in the knowledge base.

        Returns:
            One ``SearchHit`` per stored document (score/snippet left at defaults).
        """
        results = []
        for doc in self._doc_store.list_all():
            results.append(
                SearchHit(
                    id=doc.id,
                    name=doc.name,
                    num_lines=doc.num_lines,
                    table_of_contents=doc.format_toc(),
                )
            )
        return results

    def search_docs(
        self,
        query: str,
        top_k: int = 10,
        scope: Literal["names", "contents"] = "contents",
        strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
    ) -> list[SearchHit]:
        """Search across all documents.

        Args:
            query: Search query text.
            top_k: Maximum number of results.
            scope: ``"contents"`` to search within documents, ``"names"`` to search names only.
            strategy: Search strategy (only for ``scope="contents"``).

        Returns:
            Ranked list of matching documents.
        """
        if scope == "names":
            return self._search_by_name(query, top_k)

        # Over-fetch chunk hits (3x) because multiple chunks from the same
        # document collapse into a single document-level result below.
        chunks = self._search_chunks(query, top_k=top_k * 3, strategy=strategy)

        # Aggregate chunks by document
        # Each document keeps its best-scoring chunk; that chunk's leading
        # 200 characters become the snippet.
        doc_scores: dict[str, float] = {}
        doc_snippets: dict[str, str] = {}
        for chunk, score in chunks:
            if chunk.doc_id not in doc_scores or score > doc_scores[chunk.doc_id]:
                doc_scores[chunk.doc_id] = score
                doc_snippets[chunk.doc_id] = chunk.text[:200]

        results = []
        for doc_id, score in sorted(doc_scores.items(), key=lambda x: x[1], reverse=True):
            doc = self._doc_store.get(doc_id)
            # Index may reference documents removed from the registry; skip them.
            if doc is None:
                continue
            results.append(
                SearchHit(
                    id=doc.id,
                    name=doc.name,
                    num_lines=doc.num_lines,
                    table_of_contents=doc.format_toc(),
                    score=score,
                    snippet=doc_snippets.get(doc_id, ""),
                )
            )

        return results[:top_k]

    def search_doc_contents(
        self,
        doc_id: str,
        query: str,
        top_k: int = 5,
        strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
    ) -> list[ContentSearchHit]:
        """Search within a specific document.

        Args:
            doc_id: Document identifier.
            query: Search query text.
            top_k: Maximum number of results.
            strategy: Search strategy.

        Returns:
            List of matching content fragments.

        Raises:
            ValueError: If ``doc_id`` is not in the registry.
        """
        doc = self._doc_store.get(doc_id)
        if doc is None:
            raise ValueError(f"Document not found: {doc_id}")

        chunks = self._search_chunks(query, top_k=top_k, strategy=strategy, doc_id=doc_id)

        return [
            ContentSearchHit(
                doc_id=chunk.doc_id,
                line_range=(chunk.start_line, chunk.end_line),
                text=chunk.text,
                score=score,
            )
            for chunk, score in chunks
        ]

    def read_doc(
        self,
        doc_id: str,
        start_line: int = 1,
        end_line: int = 100,
    ) -> Excerpt:
        """Read a range of lines from a document.

        Args:
            doc_id: Document identifier.
            start_line: Starting line number (1-indexed).
            end_line: Ending line number (1-indexed, inclusive).

        Returns:
            An ``Excerpt`` with the requested lines.

        Raises:
            ValueError: If ``doc_id`` is not in the registry.
        """
        doc = self._doc_store.get(doc_id)
        if doc is None:
            raise ValueError(f"Document not found: {doc_id}")

        md_path = Path(doc.path_to_md)
        lines = md_path.read_text().split("\n")

        # Clamp the requested range to the document's bounds; the returned
        # line_range reflects what was actually read, not what was asked for.
        start_idx = max(0, start_line - 1)
        end_idx = min(len(lines), end_line)
        selected = lines[start_idx:end_idx]

        return Excerpt(
            doc_id=doc_id,
            line_range=(start_idx + 1, end_idx),
            text="\n".join(selected),
            total_lines=len(lines),
        )

    def _search_by_name(self, query: str, top_k: int) -> list[SearchHit]:
        """Case-insensitive substring search over document names."""
        query_lower = query.lower()
        scored = []
        for doc in self._doc_store.list_all():
            name_lower = doc.name.lower()
            if query_lower in name_lower:
                # Simple relevance: exact match > contains
                score = 1.0 if query_lower == name_lower else 0.5
                scored.append((doc, score))

        scored.sort(key=lambda x: x[1], reverse=True)
        return [
            SearchHit(
                id=doc.id,
                name=doc.name,
                num_lines=doc.num_lines,
                table_of_contents=doc.format_toc(),
                score=score,
            )
            for doc, score in scored[:top_k]
        ]

    def _search_chunks(
        self,
        query: str,
        top_k: int,
        strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
        doc_id: str | None = None,
    ) -> list[tuple[ChunkMetadata, float]]:
        """Internal: run search using the given strategy."""
        if strategy == "bm25":
            return self._bm25.search(query, top_k=top_k, doc_id=doc_id)

        if strategy == "vector":
            return self._vector.search(query, top_k=top_k, doc_id=doc_id)

        # hybrid
        # Run both indexes independently and merge their rankings with
        # reciprocal rank fusion (rank-based, so scores need not be comparable).
        bm25_results = self._bm25.search(query, top_k=top_k, doc_id=doc_id)
        vector_results = self._vector.search(query, top_k=top_k, doc_id=doc_id)
        return reciprocal_rank_fusion(
            [bm25_results, vector_results],
            k=self._config.rrf_k,
            top_k=top_k,
        )
|
athenaeum/chunker.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Line-aware markdown chunking with heading-boundary snapping."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from athenaeum.models import ChunkMetadata
|
|
8
|
+
|
|
9
|
+
_HEADING_RE = re.compile(r"^#{1,6}\s+")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _find_heading_lines(lines: list[str]) -> set[int]:
|
|
13
|
+
"""Return 0-indexed line numbers that are markdown headings."""
|
|
14
|
+
return {i for i, line in enumerate(lines) if _HEADING_RE.match(line.strip())}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def chunk_markdown(
    markdown: str,
    doc_id: str,
    chunk_size: int = 80,
    chunk_overlap: int = 20,
) -> list[ChunkMetadata]:
    """Split markdown into overlapping, line-based chunks.

    Chunks are snapped to heading boundaries when a heading falls within the
    overlap region, so sections start cleanly.

    Args:
        markdown: Full markdown text.
        doc_id: Parent document ID.
        chunk_size: Target number of lines per chunk (must be positive).
        chunk_overlap: Number of overlapping lines between consecutive chunks
            (must be non-negative; values >= ``chunk_size`` are tolerated but
            the window is forced to advance by at least one line).

    Returns:
        List of ``ChunkMetadata`` instances.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            negative.
    """
    # BUG FIX: the original looped forever when chunk_overlap >= chunk_size,
    # because next_start = end - chunk_overlap never advanced past start.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap < 0:
        raise ValueError(f"chunk_overlap must be non-negative, got {chunk_overlap}")
    if not markdown:
        return []

    lines = markdown.split("\n")
    total = len(lines)

    heading_lines = _find_heading_lines(lines)
    chunks: list[ChunkMetadata] = []
    start = 0
    chunk_index = 0

    while start < total:
        end = min(start + chunk_size, total)
        text = "\n".join(lines[start:end])

        chunks.append(
            ChunkMetadata(
                doc_id=doc_id,
                chunk_index=chunk_index,
                start_line=start + 1,  # 1-indexed
                end_line=end,  # 1-indexed inclusive
                text=text,
            )
        )
        chunk_index += 1

        if end >= total:
            break

        # Compute next start with overlap
        next_start = end - chunk_overlap

        # Snap to heading if one exists in the overlap zone [next_start, end)
        for line_idx in range(next_start, end):
            if line_idx in heading_lines:
                next_start = line_idx
                break

        # Guard against a non-advancing window (overlap >= size, or a heading
        # snapped back to the current start): always move forward at least one
        # line so the loop terminates.
        start = max(next_start, start + 1)

    return chunks
|
athenaeum/config.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Configuration for Athenaeum."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class AthenaeumConfig:
    """Configuration for an Athenaeum instance."""

    # Root directory where raw files, markdown, metadata, and the Chroma
    # vector store are persisted. Defaults to ~/.athenaeum.
    storage_dir: Path = field(default_factory=lambda: Path.home() / ".athenaeum")
    # Target number of lines per chunk (see chunker.chunk_markdown).
    chunk_size: int = 80
    # Number of overlapping lines between consecutive chunks.
    chunk_overlap: int = 20
    # Constant used by reciprocal rank fusion when merging BM25/vector results.
    rrf_k: int = 60
    # NOTE(review): not referenced by Athenaeum in this file — the search
    # methods hard-code their own strategy defaults; confirm intended use.
    default_strategy: Literal["hybrid", "bm25", "vector"] = "hybrid"
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""JSON-backed document registry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from athenaeum.models import Document
|
|
6
|
+
from athenaeum.storage import StorageManager
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DocumentStore:
    """Document registry kept in memory and mirrored to ``metadata.json``.

    Every mutation is written through to storage immediately, so the on-disk
    registry always matches the in-memory one.
    """

    def __init__(self, storage: StorageManager) -> None:
        self._storage = storage
        self._docs: dict[str, Document] = {}
        self._load()

    def _load(self) -> None:
        # Hydrate the in-memory registry from whatever metadata is on disk.
        persisted = self._storage.load_metadata()
        for doc_id, payload in persisted.get("documents", {}).items():
            self._docs[doc_id] = Document.model_validate(payload)

    def _save(self) -> None:
        serialized = {
            doc_id: doc.model_dump(mode="json") for doc_id, doc in self._docs.items()
        }
        self._storage.save_metadata({"documents": serialized})

    def add(self, doc: Document) -> None:
        """Add or update a document in the registry."""
        self._docs[doc.id] = doc
        self._save()

    def get(self, doc_id: str) -> Document | None:
        """Get a document by ID, or None if not found."""
        return self._docs.get(doc_id)

    def list_all(self) -> list[Document]:
        """Return all documents."""
        return list(self._docs.values())

    def remove(self, doc_id: str) -> Document | None:
        """Remove a document from the registry. Returns the removed doc or None."""
        removed = self._docs.pop(doc_id, None)
        # Only hit disk when something actually changed.
        if removed is not None:
            self._save()
        return removed

    @property
    def count(self) -> int:
        """Number of documents currently registered."""
        return len(self._docs)
|
athenaeum/models.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Pydantic data models for Athenaeum."""
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Metadata(BaseModel):
    """Basic document metadata."""

    # NOTE(review): not referenced elsewhere in this file; presumably a
    # lightweight projection of Document for external consumers — confirm.
    id: str = Field(..., description="Unique document identifier")
    name: str = Field(..., description="Document display name")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TOCEntry(BaseModel):
    """Table of contents entry with line range."""

    title: str = Field(..., description="Section title")
    level: int = Field(..., description="Header level (1 for h1, 2 for h2)")
    start_line: int = Field(..., description="Starting line number (1-indexed)")
    # When end_line is None, Document.format_toc renders it as "?".
    end_line: int | None = Field(None, description="Ending line number (1-indexed, inclusive)")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Document(BaseModel):
    """Full document record stored in the knowledge base."""

    id: str = Field(..., description="Unique document identifier (UUID)")
    name: str = Field(..., description="Original filename")
    path_to_raw: str = Field(..., description="Path to original file")
    path_to_md: str = Field(..., description="Path to converted markdown")
    num_lines: int = Field(..., description="Total number of lines in markdown")
    table_of_contents: list[TOCEntry] = Field(
        default_factory=list, description="Parsed table of contents"
    )
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
    file_size: int = Field(0, description="Original file size in bytes")
    file_type: str = Field("", description="Original file extension")

    def format_toc(self) -> str:
        """Render the table of contents as an indented plain-text outline."""
        if not self.table_of_contents:
            return "No table of contents available"

        # One bullet per entry, indented by heading level, with its line span.
        rendered = [
            f"{' ' * (entry.level - 1)}- {entry.title} "
            f"[lines {entry.start_line}-{entry.end_line or '?'}]"
            for entry in self.table_of_contents
        ]
        return "\n".join(rendered)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class SearchHit(BaseModel):
    """Search result for document-level search."""

    id: str = Field(..., description="Document identifier")
    name: str = Field(..., description="Document name")
    num_lines: int = Field(..., description="Total lines in document")
    # Pre-formatted string produced by Document.format_toc(), not structured entries.
    table_of_contents: str = Field(..., description="Formatted table of contents")
    # Defaults are used by Athenaeum.list_docs, which has no query to score against.
    score: float = Field(default=0.0, description="Search relevance score")
    snippet: str = Field(default="", description="Relevant text snippet")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class Excerpt(BaseModel):
    """Text excerpt from a document."""

    doc_id: str = Field(..., description="Document identifier")
    # Reflects the clamped range actually read by Athenaeum.read_doc,
    # which may differ from the range the caller requested.
    line_range: tuple[int, int] = Field(..., description="Line range (start, end), 1-indexed")
    text: str = Field(..., description="Extracted text content")
    total_lines: int = Field(0, description="Total lines in document")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ContentSearchHit(BaseModel):
    """Search result for within-document content search."""

    doc_id: str = Field(..., description="Document identifier")
    # (start, end) of the matching chunk, 1-indexed inclusive.
    line_range: tuple[int, int] = Field(..., description="Line range of the match")
    text: str = Field(..., description="Matching text content")
    score: float = Field(0.0, description="Search relevance score")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class ChunkMetadata(BaseModel):
    """Metadata stored with each chunk in the vector store."""

    doc_id: str = Field(..., description="Parent document ID")
    chunk_index: int = Field(..., description="Chunk index within document")
    # Line numbers are 1-indexed inclusive, as produced by chunker.chunk_markdown.
    start_line: int = Field(..., description="Starting line number")
    end_line: int = Field(..., description="Ending line number")
    text: str = Field(..., description="Chunk text content")
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""OCR provider registry and factory."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from athenaeum.ocr.base import OCRProvider
|
|
8
|
+
from athenaeum.ocr.custom import CustomOCR
|
|
9
|
+
from athenaeum.ocr.markitdown import MarkitdownOCR
|
|
10
|
+
|
|
11
|
+
OCRBackend = Literal["markitdown", "docling", "mistral", "lighton"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_ocr_provider(backend: OCRBackend = "markitdown", **kwargs: object) -> OCRProvider:
|
|
15
|
+
"""Factory to create an OCR provider by name.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
backend: One of ``"markitdown"``, ``"docling"``, ``"mistral"``, ``"lighton"``.
|
|
19
|
+
**kwargs: Passed to the provider constructor.
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
An ``OCRProvider`` instance.
|
|
23
|
+
"""
|
|
24
|
+
if backend == "markitdown":
|
|
25
|
+
return MarkitdownOCR()
|
|
26
|
+
if backend == "docling":
|
|
27
|
+
from athenaeum.ocr.docling import DoclingOCR
|
|
28
|
+
|
|
29
|
+
return DoclingOCR()
|
|
30
|
+
if backend == "mistral":
|
|
31
|
+
from athenaeum.ocr.mistral import MistralOCR
|
|
32
|
+
|
|
33
|
+
return MistralOCR(**kwargs) # type: ignore[arg-type]
|
|
34
|
+
if backend == "lighton":
|
|
35
|
+
from athenaeum.ocr.lighton import LightOnOCR
|
|
36
|
+
|
|
37
|
+
return LightOnOCR(**kwargs) # type: ignore[arg-type]
|
|
38
|
+
|
|
39
|
+
raise ValueError(f"Unknown OCR backend: {backend!r}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
__all__ = [
|
|
43
|
+
"CustomOCR",
|
|
44
|
+
"MarkitdownOCR",
|
|
45
|
+
"OCRBackend",
|
|
46
|
+
"OCRProvider",
|
|
47
|
+
"get_ocr_provider",
|
|
48
|
+
]
|
athenaeum/ocr/base.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Abstract base class for OCR providers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class OCRProvider(ABC):
    """Base class for OCR/document-to-markdown converters."""

    @abstractmethod
    def convert(self, file_path: Path) -> str:
        """Convert a file to markdown text.

        Args:
            file_path: Path to the source file.

        Returns:
            Markdown string of the file contents.
        """

    @abstractmethod
    def supported_extensions(self) -> set[str]:
        """Return the set of file extensions this provider supports.

        Extensions should include the leading dot, e.g. ``{".pdf", ".docx"}``.
        A provider may return ``{".*"}`` as a wildcard meaning any extension
        is accepted (``Athenaeum.load_doc`` treats ``".*"`` specially).
        """
|
athenaeum/ocr/custom.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Custom OCR provider wrapping a user-supplied callable."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from athenaeum.ocr.base import OCRProvider
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CustomOCR(OCRProvider):
    """Adapter that turns any ``(Path) -> str`` callable into an OCR provider."""

    def __init__(
        self,
        fn: Callable[[Path], str],
        extensions: set[str] | None = None,
    ) -> None:
        """Wrap *fn* as a provider.

        Args:
            fn: Callable that converts a file path to markdown text.
            extensions: Supported extensions (with leading dot). When omitted
                or empty, the wildcard ``{".*"}`` (accept anything) is used.
        """
        self._fn = fn
        self._extensions = extensions if extensions else {".*"}

    def convert(self, file_path: Path) -> str:
        """Delegate conversion to the wrapped callable."""
        return self._fn(file_path)

    def supported_extensions(self) -> set[str]:
        """Return the configured extension set."""
        return self._extensions
|
athenaeum/ocr/docling.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""OCR provider using Docling."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from athenaeum.ocr.base import OCRProvider
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DoclingOCR(OCRProvider):
    """Convert documents to markdown using Docling."""

    # File types this wrapper routes to Docling.
    _EXTENSIONS = {".pdf", ".pptx", ".docx", ".xlsx", ".html", ".md"}

    def __init__(self) -> None:
        """Instantiate a Docling converter.

        Raises:
            ImportError: If the optional ``docling`` dependency is missing.
        """
        # Imported lazily so `docling` remains an optional extra.
        try:
            from docling.document_converter import DocumentConverter
        except ImportError as e:
            raise ImportError(
                "Docling is not installed. Install with: pip install 'athenaeum-kb[docling]'"
            ) from e
        self._converter = DocumentConverter()

    def convert(self, file_path: Path) -> str:
        """Convert ``file_path`` to markdown via Docling."""
        result = self._converter.convert(str(file_path))
        md: str = result.document.export_to_markdown()
        return md

    def supported_extensions(self) -> set[str]:
        """Return the extensions handled by this provider."""
        return self._EXTENSIONS
|
athenaeum/ocr/lighton.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""OCR provider using LightOnOCR-2-1B."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from athenaeum.ocr.base import OCRProvider
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LightOnOCR(OCRProvider):
    """Convert documents to markdown using LightOnOCR-2-1B (local model)."""

    # NOTE(review): ".pdf" is advertised here, but convert() opens the file
    # with PIL.Image.open, which does not read PDFs — confirm PDF handling
    # (e.g. rasterize pages first) or drop ".pdf" from this set.
    _EXTENSIONS = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}

    def __init__(self, device: str = "cpu") -> None:
        """Load the LightOnOCR transformers pipeline.

        Args:
            device: Device string forwarded to ``transformers.pipeline``.

        Raises:
            ImportError: If transformers/torch are not installed.
        """
        try:
            from transformers import pipeline
        except ImportError as e:
            raise ImportError(
                "transformers/torch not installed. "
                "Install with: pip install 'athenaeum-kb[lighton]'"
            ) from e
        self._pipe = pipeline(
            "image-text-to-text",
            model="lightonai/LightOnOCR-2-1B",
            device=device,
        )

    def convert(self, file_path: Path) -> str:
        """Run OCR on an image file and return the generated text."""
        from PIL import Image

        image = Image.open(file_path)
        result = self._pipe(image)
        # Handle the list-of-dicts shape with a "generated_text" key; any
        # other pipeline output shape is stringified as a best-effort fallback.
        if isinstance(result, list) and len(result) > 0:
            text: str = result[0].get("generated_text", "")
            return text
        return str(result)

    def supported_extensions(self) -> set[str]:
        """Return the extensions handled by this provider."""
        return self._EXTENSIONS
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""OCR provider using Microsoft markitdown."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from markitdown import MarkItDown
|
|
8
|
+
|
|
9
|
+
from athenaeum.ocr.base import OCRProvider
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MarkitdownOCR(OCRProvider):
    """Markdown conversion backed by Microsoft's ``markitdown`` library."""

    # Broad set of text-like and office formats markitdown can ingest.
    _EXTENSIONS = {
        ".pdf", ".pptx", ".docx", ".xlsx",
        ".json", ".csv", ".txt", ".md",
        ".html", ".xml", ".rtf", ".epub",
    }

    def __init__(self) -> None:
        # A single converter instance is reused for every conversion.
        self._converter = MarkItDown()

    def convert(self, file_path: Path) -> str:
        """Convert ``file_path`` and return the resulting markdown text."""
        conversion = self._converter.convert(str(file_path))
        return conversion.text_content

    def supported_extensions(self) -> set[str]:
        """Return the extensions handled by this provider."""
        return self._EXTENSIONS
|
athenaeum/ocr/mistral.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""OCR provider using Mistral OCR API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from athenaeum.ocr.base import OCRProvider
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MistralOCR(OCRProvider):
    """Convert documents to markdown using Mistral's OCR API."""

    _EXTENSIONS = {".pdf"}

    def __init__(self, api_key: str | None = None) -> None:
        """Create the Mistral client.

        Args:
            api_key: Explicit API key; falls back to the ``MISTRAL_API_KEY``
                environment variable.

        Raises:
            ImportError: If the optional ``mistralai`` SDK is missing.
            ValueError: If no API key is available from either source.
        """
        try:
            from mistralai import Mistral
        except ImportError as e:
            raise ImportError(
                "Mistral SDK is not installed. Install with: pip install 'athenaeum-kb[mistral]'"
            ) from e
        key = api_key or os.environ.get("MISTRAL_API_KEY", "")
        if not key:
            raise ValueError("MISTRAL_API_KEY must be set or passed explicitly.")
        self._client = Mistral(api_key=key)

    def convert(self, file_path: Path) -> str:
        """OCR the PDF and return all page markdown joined by blank lines."""
        from mistralai import DocumentURLChunk, OCRRequest

        # The whole file is base64-embedded in a data URI rather than uploaded
        # separately, so the entire PDF is read into memory here.
        encoded = base64.standard_b64encode(file_path.read_bytes()).decode()
        data_uri = f"data:application/pdf;base64,{encoded}"
        response = self._client.ocr.process(
            request=OCRRequest(
                document=DocumentURLChunk(document_url=data_uri),
                model="mistral-ocr-latest",
            )
        )
        pages = [page.markdown for page in response.pages]
        return "\n\n".join(pages)

    def supported_extensions(self) -> set[str]:
        """Return the extensions handled by this provider."""
        return self._EXTENSIONS
|
athenaeum/search/bm25.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""BM25 keyword search index."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from rank_bm25 import BM25Okapi
|
|
8
|
+
|
|
9
|
+
from athenaeum.models import ChunkMetadata
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class _Entry:
    """A chunk paired with its cached token list for BM25 scoring."""

    # Tokens are computed once at add time so rebuilds reuse them.
    chunk: ChunkMetadata
    tokens: list[str]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BM25Index:
    """In-memory BM25 keyword index over document chunks."""

    def __init__(self) -> None:
        self._entries: list[_Entry] = []
        self._bm25: BM25Okapi | None = None

    def _rebuild(self) -> None:
        # An empty corpus is not a valid BM25Okapi input, so drop the model
        # entirely when there are no entries.
        if not self._entries:
            self._bm25 = None
            return
        self._bm25 = BM25Okapi([entry.tokens for entry in self._entries])

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Lowercase whitespace tokenization."""
        return text.lower().split()

    def add_chunks(self, chunks: list[ChunkMetadata]) -> None:
        """Add chunks to the index and rebuild."""
        self._entries.extend(
            _Entry(chunk=chunk, tokens=self._tokenize(chunk.text)) for chunk in chunks
        )
        self._rebuild()

    def remove_document(self, doc_id: str) -> None:
        """Remove all chunks for a document and rebuild."""
        self._entries = [entry for entry in self._entries if entry.chunk.doc_id != doc_id]
        self._rebuild()

    def search(
        self,
        query: str,
        top_k: int = 10,
        doc_id: str | None = None,
    ) -> list[tuple[ChunkMetadata, float]]:
        """Search the index, returning (chunk, score) pairs sorted by score descending."""
        if not self._bm25 or not self._entries:
            return []

        # Scores are computed over the full corpus; the doc_id filter is
        # applied afterwards since BM25Okapi has no built-in filtering.
        scores = self._bm25.get_scores(self._tokenize(query))
        scored = [
            (entry.chunk, float(raw_score))
            for entry, raw_score in zip(self._entries, scores)
            if doc_id is None or entry.chunk.doc_id == doc_id
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

    @property
    def size(self) -> int:
        """Number of chunks currently indexed."""
        return len(self._entries)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Reciprocal Rank Fusion for combining ranked search results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from athenaeum.models import ChunkMetadata
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def reciprocal_rank_fusion(
    ranked_lists: list[list[tuple[ChunkMetadata, float]]],
    k: int = 60,
    top_k: int = 10,
) -> list[tuple[ChunkMetadata, float]]:
    """Merge several ranked result lists with Reciprocal Rank Fusion.

    A chunk's RRF score is the sum, over every list it appears in, of
    ``1 / (k + rank)`` with 1-indexed ranks; a higher score means better
    combined relevance. The individual lists' own scores are ignored.

    Args:
        ranked_lists: Ranked result lists of (chunk, score) pairs.
        k: RRF smoothing constant (default 60).
        top_k: Maximum number of merged results to return.

    Returns:
        Re-ranked list of (chunk, rrf_score) pairs, best first.
    """
    fused: dict[str, float] = {}
    by_key: dict[str, ChunkMetadata] = {}

    for results in ranked_lists:
        for position, (chunk, _ignored) in enumerate(results):
            # Chunks are identified by document id plus chunk index.
            key = f"{chunk.doc_id}:{chunk.chunk_index}"
            by_key[key] = chunk
            fused[key] = fused.get(key, 0.0) + 1.0 / (k + position + 1)

    ordered = sorted(fused.items(), key=lambda item: item[1], reverse=True)
    return [(by_key[key], score) for key, score in ordered[:top_k]]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Vector similarity search index using LangChain + Chroma."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from langchain_chroma import Chroma
|
|
8
|
+
from langchain_core.embeddings import Embeddings
|
|
9
|
+
|
|
10
|
+
from athenaeum.models import ChunkMetadata
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class VectorIndex:
    """Vector similarity search over chunks, backed by a Chroma collection."""

    def __init__(
        self,
        embeddings: Embeddings,
        persist_directory: str | Path | None = None,
        collection_name: str = "athenaeum",
    ) -> None:
        """Create the index.

        Args:
            embeddings: Embedding model used for indexing and querying.
            persist_directory: Optional on-disk location; in-memory when None.
            collection_name: Name of the Chroma collection.
        """
        init_kwargs: dict[str, object] = {
            "embedding_function": embeddings,
            "collection_name": collection_name,
        }
        # Only pass persist_directory when set so Chroma stays in-memory otherwise.
        if persist_directory is not None:
            init_kwargs["persist_directory"] = str(persist_directory)
        self._store = Chroma(**init_kwargs)  # type: ignore[arg-type]
        self._embeddings = embeddings

    def add_chunks(self, chunks: list[ChunkMetadata]) -> None:
        """Add chunks to the vector store (no-op for an empty list)."""
        if not chunks:
            return
        self._store.add_texts(
            texts=[chunk.text for chunk in chunks],
            metadatas=[chunk.model_dump() for chunk in chunks],
            ids=[f"{chunk.doc_id}:{chunk.chunk_index}" for chunk in chunks],
        )

    def remove_document(self, doc_id: str) -> None:
        """Remove all chunks for a document from the vector store."""
        # NOTE(review): reaches into Chroma's private ``_collection`` — the
        # public wrapper exposes no metadata-filtered delete, so this is a
        # deliberate workaround; revisit if langchain-chroma adds one.
        self._store._collection.delete(where={"doc_id": doc_id})

    def search(
        self,
        query: str,
        top_k: int = 10,
        doc_id: str | None = None,
    ) -> list[tuple[ChunkMetadata, float]]:
        """Search for similar chunks, returning (chunk, score) pairs.

        Scores are relevance scores where higher means more similar.
        """
        search_kwargs: dict[str, object] = {"k": top_k}
        if doc_id is not None:
            search_kwargs["filter"] = {"doc_id": doc_id}

        hits = self._store.similarity_search_with_relevance_scores(query, **search_kwargs)  # type: ignore[arg-type]

        # Rebuild typed ChunkMetadata objects from the stored metadata dicts.
        pairs: list[tuple[ChunkMetadata, float]] = []
        for document, relevance in hits:
            meta = document.metadata
            chunk = ChunkMetadata(
                doc_id=meta["doc_id"],
                chunk_index=meta["chunk_index"],
                start_line=meta["start_line"],
                end_line=meta["end_line"],
                text=meta["text"],
            )
            pairs.append((chunk, float(relevance)))
        return pairs
|
athenaeum/storage.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Storage layout manager for Athenaeum."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import shutil
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class StorageManager:
    """Owns the on-disk layout beneath the storage root.

    Layout::

        <root>/
            docs/<doc_id>/raw.*       # original file
            docs/<doc_id>/content.md  # converted markdown
            index/chroma/             # Chroma persistent directory
            metadata.json             # document registry
    """

    def __init__(self, root: Path) -> None:
        # Create the root eagerly so every later path helper can assume it.
        self.root = root
        self.root.mkdir(parents=True, exist_ok=True)

    @property
    def docs_dir(self) -> Path:
        """Directory holding one sub-directory per document."""
        return self.root / "docs"

    @property
    def chroma_dir(self) -> Path:
        """Chroma's persistent storage directory."""
        return self.root / "index" / "chroma"

    @property
    def metadata_path(self) -> Path:
        """Path of the JSON document registry."""
        return self.root / "metadata.json"

    def doc_dir(self, doc_id: str) -> Path:
        """Return a document's directory, creating it on first use."""
        directory = self.docs_dir / doc_id
        directory.mkdir(parents=True, exist_ok=True)
        return directory

    def raw_path(self, doc_id: str, suffix: str) -> Path:
        """Return path for storing the original file."""
        return self.doc_dir(doc_id) / f"raw{suffix}"

    def content_md_path(self, doc_id: str) -> Path:
        """Return path for the converted markdown."""
        return self.doc_dir(doc_id) / "content.md"

    def remove_doc(self, doc_id: str) -> None:
        """Delete a document's directory tree, if present."""
        target = self.docs_dir / doc_id
        if target.exists():
            shutil.rmtree(target)

    def ensure_chroma_dir(self) -> Path:
        """Create the Chroma directory if needed and return it."""
        directory = self.chroma_dir
        directory.mkdir(parents=True, exist_ok=True)
        return directory

    def load_metadata(self) -> dict[str, Any]:
        """Read the document registry; a missing file means an empty registry."""
        try:
            raw = self.metadata_path.read_text()
        except FileNotFoundError:
            return {}
        return json.loads(raw)  # type: ignore[no-any-return]

    def save_metadata(self, data: dict[str, Any]) -> None:
        """Write the document registry as pretty-printed JSON."""
        # default=str keeps non-JSON values (e.g. dates) serializable.
        self.metadata_path.write_text(json.dumps(data, indent=2, default=str))
|
athenaeum/toc.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Table of contents extraction from markdown headings."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from athenaeum.models import TOCEntry
|
|
8
|
+
|
|
9
|
+
# ATX headings: one to six '#' characters, whitespace, then the title.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")


def extract_toc(markdown: str) -> list[TOCEntry]:
    """Extract a table of contents from markdown heading lines.

    Each entry records the heading level, title, and the line range it spans.
    Line numbers are 1-indexed. The ``end_line`` of each entry is the line
    before the next heading at the same or a higher level (numerically lower),
    or the last line of the document when no such heading follows.

    Args:
        markdown: Full markdown text of a document.

    Returns:
        TOC entries in document order, with ``end_line`` filled in.
    """
    lines = markdown.split("\n")
    entries: list[TOCEntry] = []

    for line_no_0, line in enumerate(lines):
        # .strip() deliberately also matches indented headings; note that some
        # markdown dialects treat deep indents as code blocks — confirm this
        # leniency is wanted if rendered output ever disagrees.
        m = _HEADING_RE.match(line.strip())
        if m:
            entries.append(
                TOCEntry(
                    title=m.group(2).strip(),
                    level=len(m.group(1)),
                    start_line=line_no_0 + 1,
                    end_line=None,
                )
            )

    # Fill in end_line: each section runs up to the line before the next
    # heading at the same or higher level, otherwise to the document's end.
    # (The previous for/else fallback only contained assignments that were
    # immediately overwritten with total_lines — dead code, now removed.)
    total_lines = len(lines)
    for i, entry in enumerate(entries):
        end = total_lines
        for later in entries[i + 1 :]:
            if later.level <= entry.level:
                end = later.start_line - 1
                break
        entry.end_line = end

    return entries
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: athenaeum-kb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Tools for intelligent interaction with knowledge bases
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: langchain-chroma>=0.2
|
|
8
|
+
Requires-Dist: langchain-core>=0.3
|
|
9
|
+
Requires-Dist: langchain-openai>=0.3
|
|
10
|
+
Requires-Dist: markitdown>=0.1
|
|
11
|
+
Requires-Dist: pydantic>=2.0
|
|
12
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
13
|
+
Provides-Extra: all-ocr
|
|
14
|
+
Requires-Dist: docling>=2.0; extra == 'all-ocr'
|
|
15
|
+
Requires-Dist: mistralai>=1.0; extra == 'all-ocr'
|
|
16
|
+
Requires-Dist: pillow>=10.0; extra == 'all-ocr'
|
|
17
|
+
Requires-Dist: torch>=2.0; extra == 'all-ocr'
|
|
18
|
+
Requires-Dist: transformers>=4.40; extra == 'all-ocr'
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
21
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
24
|
+
Provides-Extra: docling
|
|
25
|
+
Requires-Dist: docling>=2.0; extra == 'docling'
|
|
26
|
+
Provides-Extra: lighton
|
|
27
|
+
Requires-Dist: pillow>=10.0; extra == 'lighton'
|
|
28
|
+
Requires-Dist: torch>=2.0; extra == 'lighton'
|
|
29
|
+
Requires-Dist: transformers>=4.40; extra == 'lighton'
|
|
30
|
+
Provides-Extra: mistral
|
|
31
|
+
Requires-Dist: mistralai>=1.0; extra == 'mistral'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# Athenaeum
|
|
35
|
+
|
|
36
|
+
## Project Scope and Goals
|
|
37
|
+
|
|
38
|
+
The goal of this project is to build a Python library that equips AI agents with a robust set of tools for intelligent interaction with knowledge bases. The library focuses on document ingestion, semantic search, and structured access to content, making it suitable for agent-based systems, RAG pipelines, and automation workflows.
|
|
39
|
+
|
|
40
|
+
Once the module is fully tested and validated, the intended outcome is to package and publish it as a reusable Python library on PyPI.
|
|
41
|
+
|
|
42
|
+
## Tools
|
|
43
|
+
|
|
44
|
+
### `load_doc`
|
|
45
|
+
Load a document into the knowledge base, automatically extracting content, metadata, and embeddings.
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
load_doc(path: str) -> str
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**Parameters:**
|
|
52
|
+
- `path`: Path to the document file
|
|
53
|
+
|
|
54
|
+
**Supported formats:** PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB
|
|
55
|
+
|
|
56
|
+
**Returns**: A document identifier (doc_id) that can be used for subsequent operations.
|
|
57
|
+
|
|
58
|
+
### `list_docs`
|
|
59
|
+
List all documents currently stored in the knowledge base.
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
list_docs() -> list[DocSearchHit]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Returns:** A list of documents, including metadata (id, name, format, etc.) and, when available, a table of contents.
|
|
66
|
+
|
|
67
|
+
### `search_docs`
|
|
68
|
+
|
|
69
|
+
Search across all documents in the knowledge base.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
search_docs(
|
|
73
|
+
query: str,
|
|
74
|
+
top_k: int = 10,
|
|
75
|
+
scope: Literal["names", "contents"] = "contents",
|
|
76
|
+
strategy: Literal["hybrid", "bm25", "vector"] = "hybrid"
|
|
77
|
+
) -> list[DocSearchHit]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Parameters:**
|
|
81
|
+
- `query`: Search query text
|
|
82
|
+
- `top_k`: Maximum number of results (default: 10)
|
|
83
|
+
- `scope`: Where to search
|
|
84
|
+
- `"contents"`: Search within document contents (default)
|
|
85
|
+
- `"names"`: Search only document names
|
|
86
|
+
- `strategy`: Search strategy (only applies when scope is "contents")
|
|
87
|
+
- `"hybrid"`: Combines vector and BM25 search (default)
|
|
88
|
+
- `"bm25"`: Keyword-based search only
|
|
89
|
+
- `"vector"`: Semantic similarity search only
|
|
90
|
+
|
|
91
|
+
**Returns**: A ranked list of documents matching the query.
|
|
92
|
+
|
|
93
|
+
### `search_doc_contents`
|
|
94
|
+
|
|
95
|
+
Search within a specific document.
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
search_doc_contents(
|
|
99
|
+
doc_id: str,
|
|
100
|
+
query: str,
|
|
101
|
+
top_k: int = 5,
|
|
102
|
+
strategy: Literal["hybrid", "bm25", "vector"] = "hybrid"
|
|
103
|
+
) -> list[ContentSearchHit]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Parameters:**
|
|
107
|
+
- `doc_id`: Document identifier
|
|
108
|
+
- `query`: Search query text
|
|
109
|
+
- `top_k`: Maximum number of results (default: 5)
|
|
110
|
+
- `strategy`: Search strategy
|
|
111
|
+
- `"hybrid"`: Combines vector and BM25 search (default)
|
|
112
|
+
- `"bm25"`: Keyword-based search only
|
|
113
|
+
- `"vector"`: Semantic similarity search only
|
|
114
|
+
|
|
115
|
+
**Returns**: A list of matching content fragments with relevance scores.
|
|
116
|
+
|
|
117
|
+
### `read_doc`
|
|
118
|
+
|
|
119
|
+
Read a specific range of lines from a document.
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
read_doc(
|
|
123
|
+
doc_id: str,
|
|
124
|
+
start_line: int = 1,
|
|
125
|
+
end_line: int = 100
|
|
126
|
+
) -> Excerpt
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**Parameters:**
|
|
130
|
+
- `doc_id`: Document identifier
|
|
131
|
+
- `start_line`: Starting line number (1-indexed, default: 1)
|
|
132
|
+
- `end_line`: Ending line number (1-indexed, inclusive, default: 100)
|
|
133
|
+
|
|
134
|
+
**Returns**: A document excerpt containing the requested lines.
|
|
135
|
+
|
|
136
|
+
## Search Strategies
|
|
137
|
+
|
|
138
|
+
- **Hybrid Search** (Default): Combines vector similarity search with BM25 keyword search using Reciprocal Rank Fusion (RRF).
|
|
139
|
+
- **Vector Search**: Uses embedding models for semantic similarity search.
|
|
140
|
+
- **BM25 Search**: Traditional keyword-based search using the BM25 algorithm.
|
|
141
|
+
|
|
142
|
+
## Document Ingestion Workflow
|
|
143
|
+
|
|
144
|
+
The ingestion pipeline is triggered by the `load_doc(path)` function and follows these steps:
|
|
145
|
+
1. Validation
|
|
146
|
+
- Verify that the file exists.
|
|
147
|
+
- Confirm that the file format is supported.
|
|
148
|
+
2. Content Extraction
|
|
149
|
+
- Run an OCR or parsing pipeline to convert the raw file into Markdown.
|
|
150
|
+
- If the document contains images:
|
|
151
|
+
- Replace them with placeholders in the Markdown.
|
|
152
|
+
- Store image references for later retrieval.
|
|
153
|
+
3. Pre-processing
|
|
154
|
+
- Generate document metadata (e.g., id, name, format).
|
|
155
|
+
- Build a table of contents (TOC) from Markdown headings when possible.
|
|
156
|
+
4. Indexing
|
|
157
|
+
- Generate vector embeddings using the configured embedding model.
|
|
158
|
+
- Store embeddings in the vector database for semantic retrieval.
|
|
159
|
+
|
|
160
|
+
## Data Models
|
|
161
|
+
A preliminary set of domain models is defined in `models.py`. These classes are exploratory and serve as a conceptual starting point for the project.
|
|
162
|
+
|
|
163
|
+
They are not considered final and may be refactored, renamed, or removed as the overall architecture of Athenaeum evolves and solidifies.
|
|
164
|
+
|
|
165
|
+
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
athenaeum/__init__.py,sha256=3oXIneyMISBVERMs4KuJOJ7MrjZj5I7lC19wQaTuzvA,585
|
|
2
|
+
athenaeum/athenaeum.py,sha256=kl-DZnm5qPyY-oVEcJIgZnprobWOumKPziSyz5YlaE8,9330
|
|
3
|
+
athenaeum/chunker.py,sha256=Sk2h5Z3oVhK6BPdXRMWVMONurwIsMgRkRIorbEPl_Po,2050
|
|
4
|
+
athenaeum/config.py,sha256=NcQa5t3sj0XacqmJgWLE88Ci9uHafWy6Z988WE36rqg,441
|
|
5
|
+
athenaeum/document_store.py,sha256=8llflG_k0UFjtr2Xb4YWnDDpFzFvfA_OQZ7WELyP6Qo,1611
|
|
6
|
+
athenaeum/models.py,sha256=ZeVQkPtS5WBPWkdSTYa-Z0ZZ9jtPdqDwnH16803gu54,3647
|
|
7
|
+
athenaeum/storage.py,sha256=4kV-wPtJ-MPS1ApPlr8W66bLYmtYGknnta4-lFHLXuo,2028
|
|
8
|
+
athenaeum/toc.py,sha256=qROrkhUg7JOlLpgHwuM2-JZd8eX2WO4iMsZh0E68zwY,1639
|
|
9
|
+
athenaeum/ocr/__init__.py,sha256=aG2NgzHL8TZXlfw3HDPZI8-t6gNJSTV1iLODIOSLEFE,1304
|
|
10
|
+
athenaeum/ocr/base.py,sha256=qdw--LAUPIloFWuKGaGLD7lPzNttcVL0HOjAJbZBqQk,716
|
|
11
|
+
athenaeum/ocr/custom.py,sha256=pddvXL-3aAR4Fo3nNDxfjVHO0D6xhTmpI6rLPDgazCU,672
|
|
12
|
+
athenaeum/ocr/docling.py,sha256=sPRu6ONiC6L2ZHi7RLYqFHFX6oLjRMsfsSR__9ssJeo,898
|
|
13
|
+
athenaeum/ocr/lighton.py,sha256=oArJ4BvbxByFTYu4HaokKXLngvQdX7_RJIZUlPe6WBc,1197
|
|
14
|
+
athenaeum/ocr/markitdown.py,sha256=M6bCoqsIvm_z_tLQeTq5ie-CaFihPspu0wZUIudrNZ8,727
|
|
15
|
+
athenaeum/ocr/mistral.py,sha256=XMGk1yLhwbdi_glzdsjLdOptbMwmc-86Fate3iSTwUI,1436
|
|
16
|
+
athenaeum/search/__init__.py,sha256=BCmd-4lIZmxVQZTC3GNt5-uz0pI8G9wd35MYmmIWdIA,256
|
|
17
|
+
athenaeum/search/bm25.py,sha256=b-SbtFaXwqt2IzLaytwwSVo8LBm_v0UotJeEeIsEgl8,2069
|
|
18
|
+
athenaeum/search/hybrid.py,sha256=a9kFNq6WBnNZtC6Rocx9ZI5BTe-f53KR8beaVC12cQU,1273
|
|
19
|
+
athenaeum/search/vector.py,sha256=Hu_7gbwiRLYlkpZGUnf0SD9LXWHZ7IxIUEls1w4blFo,2428
|
|
20
|
+
athenaeum_kb-0.1.0.dist-info/METADATA,sha256=5_5bjWHZ3XJLs5Da85lMEgDuNe1IeLB7CmgYFcI1IAo,5549
|
|
21
|
+
athenaeum_kb-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
22
|
+
athenaeum_kb-0.1.0.dist-info/RECORD,,
|