athenaeum-kb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
athenaeum/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """Athenaeum - Tools for intelligent interaction with knowledge bases."""
2
+
3
+ from athenaeum.athenaeum import Athenaeum
4
+ from athenaeum.config import AthenaeumConfig
5
+ from athenaeum.models import (
6
+ ChunkMetadata,
7
+ ContentSearchHit,
8
+ Document,
9
+ Excerpt,
10
+ Metadata,
11
+ SearchHit,
12
+ TOCEntry,
13
+ )
14
+ from athenaeum.ocr import OCRProvider, get_ocr_provider
15
+
16
+ __all__ = [
17
+ "Athenaeum",
18
+ "AthenaeumConfig",
19
+ "ChunkMetadata",
20
+ "ContentSearchHit",
21
+ "Document",
22
+ "Excerpt",
23
+ "Metadata",
24
+ "OCRProvider",
25
+ "SearchHit",
26
+ "TOCEntry",
27
+ "get_ocr_provider",
28
+ ]
athenaeum/athenaeum.py ADDED
@@ -0,0 +1,286 @@
1
+ """Main orchestrator class for Athenaeum."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ import uuid
7
+ from pathlib import Path
8
+ from typing import Literal
9
+
10
+ from langchain_core.embeddings import Embeddings
11
+
12
+ from athenaeum.chunker import chunk_markdown
13
+ from athenaeum.config import AthenaeumConfig
14
+ from athenaeum.document_store import DocumentStore
15
+ from athenaeum.models import ChunkMetadata, ContentSearchHit, Document, Excerpt, SearchHit
16
+ from athenaeum.ocr import OCRProvider, get_ocr_provider
17
+ from athenaeum.search.bm25 import BM25Index
18
+ from athenaeum.search.hybrid import reciprocal_rank_fusion
19
+ from athenaeum.search.vector import VectorIndex
20
+ from athenaeum.storage import StorageManager
21
+ from athenaeum.toc import extract_toc
22
+
23
+
24
class Athenaeum:
    """Main entry point for the Athenaeum knowledge base.

    Wires together storage, OCR conversion, chunking, and the BM25 /
    vector indexes, and exposes load/list/search/read operations.
    """

    def __init__(
        self,
        embeddings: Embeddings,
        config: AthenaeumConfig | None = None,
        ocr_provider: OCRProvider | None = None,
    ) -> None:
        """Initialize the knowledge base.

        Args:
            embeddings: Embedding model used by the vector index.
            config: Optional configuration; defaults to ``AthenaeumConfig()``.
            ocr_provider: Optional converter; defaults to the markitdown backend.
        """
        self._config = config or AthenaeumConfig()
        self._storage = StorageManager(self._config.storage_dir)
        self._doc_store = DocumentStore(self._storage)
        self._ocr = ocr_provider or get_ocr_provider("markitdown")
        self._bm25 = BM25Index()
        self._vector = VectorIndex(
            embeddings=embeddings,
            persist_directory=self._storage.ensure_chroma_dir(),
            collection_name="athenaeum",
        )
        # The vector index persists on disk (Chroma); BM25 lives in memory
        # and must be rebuilt from the stored markdown on every startup.
        self._reindex_bm25()

    def _reindex_bm25(self) -> None:
        """Rebuild the in-memory BM25 index from all stored documents."""
        for doc in self._doc_store.list_all():
            md_path = Path(doc.path_to_md)
            if md_path.exists():
                # Explicit encoding: markdown is always written as UTF-8 (see
                # load_doc); the platform default differs on Windows.
                text = md_path.read_text(encoding="utf-8")
                chunks = chunk_markdown(
                    text, doc.id, self._config.chunk_size, self._config.chunk_overlap
                )
                self._bm25.add_chunks(chunks)

    def load_doc(self, path: str) -> str:
        """Load a document into the knowledge base.

        Args:
            path: Path to the document file.

        Returns:
            The document ID.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If the file extension is not supported by the OCR provider.
        """
        file_path = Path(path).resolve()
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        ext = file_path.suffix.lower()
        supported = self._ocr.supported_extensions()
        # ".*" is the wildcard used by CustomOCR to accept any extension.
        if ".*" not in supported and ext not in supported:
            raise ValueError(
                f"Unsupported file type: {ext}. Supported: {sorted(supported)}"
            )

        doc_id = uuid.uuid4().hex[:12]

        # Keep a copy of the original file alongside the converted markdown.
        raw_dest = self._storage.raw_path(doc_id, ext)
        shutil.copy2(file_path, raw_dest)

        # Convert to markdown; stored as UTF-8 (matching _reindex_bm25 /
        # read_doc, which read it back with the same encoding).
        markdown = self._ocr.convert(file_path)
        md_dest = self._storage.content_md_path(doc_id)
        md_dest.write_text(markdown, encoding="utf-8")

        # Extract structural metadata from the markdown.
        toc = extract_toc(markdown)
        lines = markdown.split("\n")

        doc = Document(
            id=doc_id,
            name=file_path.name,
            path_to_raw=str(raw_dest),
            path_to_md=str(md_dest),
            num_lines=len(lines),
            table_of_contents=toc,
            file_size=file_path.stat().st_size,
            file_type=ext,
        )
        self._doc_store.add(doc)

        # Index the new content in both engines.
        chunks = chunk_markdown(
            markdown, doc_id, self._config.chunk_size, self._config.chunk_overlap
        )
        self._bm25.add_chunks(chunks)
        self._vector.add_chunks(chunks)

        return doc_id

    def list_docs(self) -> list[SearchHit]:
        """List all documents (score/snippet are left at their defaults)."""
        return [
            SearchHit(
                id=doc.id,
                name=doc.name,
                num_lines=doc.num_lines,
                table_of_contents=doc.format_toc(),
            )
            for doc in self._doc_store.list_all()
        ]

    def search_docs(
        self,
        query: str,
        top_k: int = 10,
        scope: Literal["names", "contents"] = "contents",
        strategy: Literal["hybrid", "bm25", "vector"] | None = None,
    ) -> list[SearchHit]:
        """Search across all documents.

        Args:
            query: Search query text.
            top_k: Maximum number of results.
            scope: ``"contents"`` to search within documents, ``"names"`` to search names only.
            strategy: Search strategy (only for ``scope="contents"``). When
                omitted, falls back to ``config.default_strategy``.

        Returns:
            Ranked list of matching documents.
        """
        if scope == "names":
            return self._search_by_name(query, top_k)

        # Fix: config.default_strategy used to be silently ignored; a None
        # argument now means "use the configured default" ("hybrid" unless
        # the caller changed it). Passing an explicit strategy is unchanged.
        strategy = strategy or self._config.default_strategy

        # Over-fetch chunks so that top_k *documents* survive aggregation.
        chunks = self._search_chunks(query, top_k=top_k * 3, strategy=strategy)

        # Aggregate chunks by document, keeping each document's best score
        # and a snippet from its best-scoring chunk.
        doc_scores: dict[str, float] = {}
        doc_snippets: dict[str, str] = {}
        for chunk, score in chunks:
            if chunk.doc_id not in doc_scores or score > doc_scores[chunk.doc_id]:
                doc_scores[chunk.doc_id] = score
                doc_snippets[chunk.doc_id] = chunk.text[:200]

        results = []
        for doc_id, score in sorted(doc_scores.items(), key=lambda x: x[1], reverse=True):
            doc = self._doc_store.get(doc_id)
            if doc is None:
                # The indexes can briefly disagree with the registry; skip.
                continue
            results.append(
                SearchHit(
                    id=doc.id,
                    name=doc.name,
                    num_lines=doc.num_lines,
                    table_of_contents=doc.format_toc(),
                    score=score,
                    snippet=doc_snippets.get(doc_id, ""),
                )
            )

        return results[:top_k]

    def search_doc_contents(
        self,
        doc_id: str,
        query: str,
        top_k: int = 5,
        strategy: Literal["hybrid", "bm25", "vector"] | None = None,
    ) -> list[ContentSearchHit]:
        """Search within a specific document.

        Args:
            doc_id: Document identifier.
            query: Search query text.
            top_k: Maximum number of results.
            strategy: Search strategy; when omitted, falls back to
                ``config.default_strategy``.

        Returns:
            List of matching content fragments.

        Raises:
            ValueError: If ``doc_id`` is unknown.
        """
        doc = self._doc_store.get(doc_id)
        if doc is None:
            raise ValueError(f"Document not found: {doc_id}")

        strategy = strategy or self._config.default_strategy
        chunks = self._search_chunks(query, top_k=top_k, strategy=strategy, doc_id=doc_id)

        return [
            ContentSearchHit(
                doc_id=chunk.doc_id,
                line_range=(chunk.start_line, chunk.end_line),
                text=chunk.text,
                score=score,
            )
            for chunk, score in chunks
        ]

    def read_doc(
        self,
        doc_id: str,
        start_line: int = 1,
        end_line: int = 100,
    ) -> Excerpt:
        """Read a range of lines from a document.

        Args:
            doc_id: Document identifier.
            start_line: Starting line number (1-indexed).
            end_line: Ending line number (1-indexed, inclusive).

        Returns:
            An ``Excerpt`` with the requested lines, clamped to the document.

        Raises:
            ValueError: If ``doc_id`` is unknown.
        """
        doc = self._doc_store.get(doc_id)
        if doc is None:
            raise ValueError(f"Document not found: {doc_id}")

        md_path = Path(doc.path_to_md)
        lines = md_path.read_text(encoding="utf-8").split("\n")

        # Clamp the requested window to the document bounds.
        start_idx = max(0, start_line - 1)
        end_idx = min(len(lines), end_line)
        selected = lines[start_idx:end_idx]

        return Excerpt(
            doc_id=doc_id,
            line_range=(start_idx + 1, end_idx),
            text="\n".join(selected),
            total_lines=len(lines),
        )

    def _search_by_name(self, query: str, top_k: int) -> list[SearchHit]:
        """Case-insensitive substring match over document names."""
        query_lower = query.lower()
        scored = []
        for doc in self._doc_store.list_all():
            name_lower = doc.name.lower()
            if query_lower in name_lower:
                # Simple relevance: exact match > contains.
                score = 1.0 if query_lower == name_lower else 0.5
                scored.append((doc, score))

        scored.sort(key=lambda x: x[1], reverse=True)
        return [
            SearchHit(
                id=doc.id,
                name=doc.name,
                num_lines=doc.num_lines,
                table_of_contents=doc.format_toc(),
                score=score,
            )
            for doc, score in scored[:top_k]
        ]

    def _search_chunks(
        self,
        query: str,
        top_k: int,
        strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
        doc_id: str | None = None,
    ) -> list[tuple[ChunkMetadata, float]]:
        """Internal: run search using the given strategy."""
        if strategy == "bm25":
            return self._bm25.search(query, top_k=top_k, doc_id=doc_id)

        if strategy == "vector":
            return self._vector.search(query, top_k=top_k, doc_id=doc_id)

        # hybrid: merge both engines' rankings with reciprocal rank fusion.
        bm25_results = self._bm25.search(query, top_k=top_k, doc_id=doc_id)
        vector_results = self._vector.search(query, top_k=top_k, doc_id=doc_id)
        return reciprocal_rank_fusion(
            [bm25_results, vector_results],
            k=self._config.rrf_k,
            top_k=top_k,
        )
athenaeum/chunker.py ADDED
@@ -0,0 +1,77 @@
1
+ """Line-aware markdown chunking with heading-boundary snapping."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from athenaeum.models import ChunkMetadata
8
+
9
+ _HEADING_RE = re.compile(r"^#{1,6}\s+")
10
+
11
+
12
+ def _find_heading_lines(lines: list[str]) -> set[int]:
13
+ """Return 0-indexed line numbers that are markdown headings."""
14
+ return {i for i, line in enumerate(lines) if _HEADING_RE.match(line.strip())}
15
+
16
+
17
def chunk_markdown(
    markdown: str,
    doc_id: str,
    chunk_size: int = 80,
    chunk_overlap: int = 20,
) -> list[ChunkMetadata]:
    """Split markdown into overlapping, line-based chunks.

    Chunks are snapped to heading boundaries when a heading falls within the
    overlap region, so sections start cleanly.

    Args:
        markdown: Full markdown text.
        doc_id: Parent document ID.
        chunk_size: Target number of lines per chunk.
        chunk_overlap: Number of overlapping lines between consecutive chunks.

    Returns:
        List of ``ChunkMetadata`` instances.
    """
    if not markdown:
        return []

    lines = markdown.split("\n")
    total = len(lines)

    heading_lines = _find_heading_lines(lines)
    chunks: list[ChunkMetadata] = []
    start = 0
    chunk_index = 0

    while start < total:
        end = min(start + chunk_size, total)

        chunks.append(
            ChunkMetadata(
                doc_id=doc_id,
                chunk_index=chunk_index,
                start_line=start + 1,  # 1-indexed
                end_line=end,  # 1-indexed inclusive
                text="\n".join(lines[start:end]),
            )
        )
        chunk_index += 1

        if end >= total:
            break

        # Next chunk starts `chunk_overlap` lines before the current end...
        next_start = end - chunk_overlap

        # ...unless a heading sits in the overlap zone [next_start, end), in
        # which case snap forward so the new section starts cleanly.
        for line_idx in range(next_start, end):
            if line_idx in heading_lines:
                next_start = line_idx
                break

        # Fix: always advance by at least one line. Without this guard, a
        # configuration with chunk_overlap >= chunk_size makes
        # next_start <= start and the loop never terminates.
        start = max(next_start, start + 1)

    return chunks
athenaeum/config.py ADDED
@@ -0,0 +1,16 @@
1
+ """Configuration for Athenaeum."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Literal
6
+
7
+
8
@dataclass
class AthenaeumConfig:
    """Configuration for an Athenaeum instance.

    Groups the tunables for storage location, chunking, and hybrid search.
    """

    # Root directory for all persisted state (documents, indexes, metadata).
    storage_dir: Path = field(default_factory=lambda: Path.home() / ".athenaeum")
    # Target number of lines per chunk.
    chunk_size: int = 80
    # Lines shared between consecutive chunks.
    chunk_overlap: int = 20
    # Constant used by reciprocal rank fusion when merging ranked lists.
    rrf_k: int = 60
    # Strategy used when a search call does not specify one.
    default_strategy: Literal["hybrid", "bm25", "vector"] = "hybrid"
@@ -0,0 +1,53 @@
1
+ """JSON-backed document registry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from athenaeum.models import Document
6
+ from athenaeum.storage import StorageManager
7
+
8
+
9
class DocumentStore:
    """Manages the document registry backed by ``metadata.json``."""

    def __init__(self, storage: StorageManager) -> None:
        self._storage = storage
        self._docs: dict[str, Document] = {}
        self._load()

    def _load(self) -> None:
        """Populate the in-memory registry from persisted metadata."""
        persisted = self._storage.load_metadata().get("documents", {})
        self._docs.update(
            (doc_id, Document.model_validate(payload))
            for doc_id, payload in persisted.items()
        )

    def _save(self) -> None:
        """Write the full registry back to disk."""
        serialized = {
            doc_id: doc.model_dump(mode="json") for doc_id, doc in self._docs.items()
        }
        self._storage.save_metadata({"documents": serialized})

    def add(self, doc: Document) -> None:
        """Add or update a document in the registry."""
        self._docs[doc.id] = doc
        self._save()

    def get(self, doc_id: str) -> Document | None:
        """Return the document with ``doc_id``, or None when absent."""
        return self._docs.get(doc_id)

    def list_all(self) -> list[Document]:
        """Return every registered document."""
        return [*self._docs.values()]

    def remove(self, doc_id: str) -> Document | None:
        """Drop a document from the registry; return it, or None if unknown."""
        removed = self._docs.pop(doc_id, None)
        if removed is not None:
            self._save()
        return removed

    @property
    def count(self) -> int:
        """Number of registered documents."""
        return len(self._docs)
athenaeum/models.py ADDED
@@ -0,0 +1,88 @@
1
+ """Pydantic data models for Athenaeum."""
2
+
3
+ from datetime import UTC, datetime
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class Metadata(BaseModel):
    """Basic document metadata.

    Lightweight id/name pair; the full persisted record is ``Document``.
    """

    id: str = Field(..., description="Unique document identifier")
    name: str = Field(..., description="Document display name")
13
+
14
+
15
class TOCEntry(BaseModel):
    """Table of contents entry with line range."""

    title: str = Field(..., description="Section title")
    level: int = Field(..., description="Header level (1 for h1, 2 for h2)")
    start_line: int = Field(..., description="Starting line number (1-indexed)")
    # end_line is None until the section's extent is computed (toc.extract_toc
    # fills it in after collecting all headings).
    end_line: int | None = Field(None, description="Ending line number (1-indexed, inclusive)")
22
+
23
+
24
class Document(BaseModel):
    """Full document record stored in the knowledge base."""

    id: str = Field(..., description="Unique document identifier (UUID)")
    name: str = Field(..., description="Original filename")
    path_to_raw: str = Field(..., description="Path to original file")
    path_to_md: str = Field(..., description="Path to converted markdown")
    num_lines: int = Field(..., description="Total number of lines in markdown")
    table_of_contents: list[TOCEntry] = Field(
        default_factory=list, description="Parsed table of contents"
    )
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
    file_size: int = Field(0, description="Original file size in bytes")
    file_type: str = Field("", description="Original file extension")

    def format_toc(self) -> str:
        """Render the table of contents as an indented, human-readable list."""
        if not self.table_of_contents:
            return "No table of contents available"

        # Two spaces of indentation per heading level below h1; unknown
        # section ends render as '?'.
        return "\n".join(
            f"{'  ' * (entry.level - 1)}- {entry.title} "
            f"[lines {entry.start_line}-{entry.end_line or '?'}]"
            for entry in self.table_of_contents
        )
+ return "\n".join(lines)
50
+
51
+
52
class SearchHit(BaseModel):
    """Search result for document-level search."""

    id: str = Field(..., description="Document identifier")
    name: str = Field(..., description="Document name")
    num_lines: int = Field(..., description="Total lines in document")
    table_of_contents: str = Field(..., description="Formatted table of contents")
    # score/snippet keep their defaults when produced by list_docs(), which
    # has no query to rank against.
    score: float = Field(default=0.0, description="Search relevance score")
    snippet: str = Field(default="", description="Relevant text snippet")
61
+
62
+
63
class Excerpt(BaseModel):
    """Text excerpt from a document, as returned by ``Athenaeum.read_doc``."""

    doc_id: str = Field(..., description="Document identifier")
    line_range: tuple[int, int] = Field(..., description="Line range (start, end), 1-indexed")
    text: str = Field(..., description="Extracted text content")
    # Lets callers page through the document without a separate lookup.
    total_lines: int = Field(0, description="Total lines in document")
70
+
71
+
72
class ContentSearchHit(BaseModel):
    """Search result for within-document content search."""

    doc_id: str = Field(..., description="Document identifier")
    line_range: tuple[int, int] = Field(..., description="Line range of the match")
    text: str = Field(..., description="Matching text content")
    score: float = Field(0.0, description="Search relevance score")
79
+
80
+
81
class ChunkMetadata(BaseModel):
    """Metadata stored with each chunk in the vector store.

    Produced by ``chunker.chunk_markdown``; the (doc_id, chunk_index) pair
    is used as the chunk's unique ID in the indexes.
    """

    doc_id: str = Field(..., description="Parent document ID")
    chunk_index: int = Field(..., description="Chunk index within document")
    start_line: int = Field(..., description="Starting line number")
    end_line: int = Field(..., description="Ending line number")
    text: str = Field(..., description="Chunk text content")
@@ -0,0 +1,48 @@
1
+ """OCR provider registry and factory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from athenaeum.ocr.base import OCRProvider
8
+ from athenaeum.ocr.custom import CustomOCR
9
+ from athenaeum.ocr.markitdown import MarkitdownOCR
10
+
11
+ OCRBackend = Literal["markitdown", "docling", "mistral", "lighton"]
12
+
13
+
14
+ def get_ocr_provider(backend: OCRBackend = "markitdown", **kwargs: object) -> OCRProvider:
15
+ """Factory to create an OCR provider by name.
16
+
17
+ Args:
18
+ backend: One of ``"markitdown"``, ``"docling"``, ``"mistral"``, ``"lighton"``.
19
+ **kwargs: Passed to the provider constructor.
20
+
21
+ Returns:
22
+ An ``OCRProvider`` instance.
23
+ """
24
+ if backend == "markitdown":
25
+ return MarkitdownOCR()
26
+ if backend == "docling":
27
+ from athenaeum.ocr.docling import DoclingOCR
28
+
29
+ return DoclingOCR()
30
+ if backend == "mistral":
31
+ from athenaeum.ocr.mistral import MistralOCR
32
+
33
+ return MistralOCR(**kwargs) # type: ignore[arg-type]
34
+ if backend == "lighton":
35
+ from athenaeum.ocr.lighton import LightOnOCR
36
+
37
+ return LightOnOCR(**kwargs) # type: ignore[arg-type]
38
+
39
+ raise ValueError(f"Unknown OCR backend: {backend!r}")
40
+
41
+
42
+ __all__ = [
43
+ "CustomOCR",
44
+ "MarkitdownOCR",
45
+ "OCRBackend",
46
+ "OCRProvider",
47
+ "get_ocr_provider",
48
+ ]
athenaeum/ocr/base.py ADDED
@@ -0,0 +1,28 @@
1
+ """Abstract base class for OCR providers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from pathlib import Path
7
+
8
+
9
class OCRProvider(ABC):
    """Base class for OCR/document-to-markdown converters.

    Implementations must be able to turn a supported file into a single
    markdown string and report which file extensions they accept.
    """

    @abstractmethod
    def convert(self, file_path: Path) -> str:
        """Convert a file to markdown text.

        Args:
            file_path: Path to the source file.

        Returns:
            Markdown string of the file contents.
        """

    @abstractmethod
    def supported_extensions(self) -> set[str]:
        """Return the set of file extensions this provider supports.

        Extensions should include the leading dot, e.g. ``{".pdf", ".docx"}``.
        A ``".*"`` entry is treated as a wildcard by the loader (any
        extension is accepted).
        """
@@ -0,0 +1,26 @@
1
+ """Custom OCR provider wrapping a user-supplied callable."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Callable
6
+ from pathlib import Path
7
+
8
+ from athenaeum.ocr.base import OCRProvider
9
+
10
+
11
class CustomOCR(OCRProvider):
    """Wrap an arbitrary ``(Path) -> str`` callable as an OCR provider."""

    def __init__(
        self,
        fn: Callable[[Path], str],
        extensions: set[str] | None = None,
    ) -> None:
        """Store the callable and its accepted extensions.

        When ``extensions`` is omitted (or empty), the wildcard ``{".*"}``
        is used, meaning any file type is accepted.
        """
        self._fn = fn
        self._extensions = extensions or {".*"}

    def convert(self, file_path: Path) -> str:
        """Delegate conversion to the wrapped callable."""
        return self._fn(file_path)

    def supported_extensions(self) -> set[str]:
        """Return the configured extension set (``{".*"}`` = any)."""
        return self._extensions
@@ -0,0 +1,30 @@
1
+ """OCR provider using Docling."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from athenaeum.ocr.base import OCRProvider
8
+
9
+
10
class DoclingOCR(OCRProvider):
    """Convert documents to markdown using Docling."""

    _EXTENSIONS = {".pdf", ".pptx", ".docx", ".xlsx", ".html", ".md"}

    def __init__(self) -> None:
        # Docling is an optional extra; fail with an actionable message.
        try:
            from docling.document_converter import DocumentConverter
        except ImportError as e:
            raise ImportError(
                "Docling is not installed. Install with: pip install 'athenaeum-kb[docling]'"
            ) from e
        self._converter = DocumentConverter()

    def convert(self, file_path: Path) -> str:
        """Run Docling's converter and export the result as markdown."""
        conversion = self._converter.convert(str(file_path))
        markdown: str = conversion.document.export_to_markdown()
        return markdown

    def supported_extensions(self) -> set[str]:
        """File types handled by this provider."""
        return self._EXTENSIONS
@@ -0,0 +1,40 @@
1
+ """OCR provider using LightOnOCR-2-1B."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from athenaeum.ocr.base import OCRProvider
8
+
9
+
10
class LightOnOCR(OCRProvider):
    """Convert documents to markdown using LightOnOCR-2-1B (local model)."""

    _EXTENSIONS = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}

    def __init__(self, device: str = "cpu") -> None:
        """Load the OCR pipeline onto ``device`` (e.g. ``"cpu"``, ``"cuda"``)."""
        # transformers/torch are optional extras; fail with install guidance.
        try:
            from transformers import pipeline
        except ImportError as e:
            raise ImportError(
                "transformers/torch not installed. "
                "Install with: pip install 'athenaeum-kb[lighton]'"
            ) from e
        self._pipe = pipeline(
            "image-text-to-text",
            model="lightonai/LightOnOCR-2-1B",
            device=device,
        )

    def convert(self, file_path: Path) -> str:
        """Run the OCR pipeline on a single image file."""
        from PIL import Image

        # NOTE(review): ``.pdf`` is advertised in _EXTENSIONS, but
        # ``PIL.Image.open`` cannot read PDFs directly — confirm whether PDF
        # inputs need rasterizing to images first.
        image = Image.open(file_path)
        prediction = self._pipe(image)
        if isinstance(prediction, list) and prediction:
            text: str = prediction[0].get("generated_text", "")
            return text
        return str(prediction)

    def supported_extensions(self) -> set[str]:
        """File types accepted (see the PDF note in ``convert``)."""
        return self._EXTENSIONS
@@ -0,0 +1,29 @@
1
+ """OCR provider using Microsoft markitdown."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from markitdown import MarkItDown
8
+
9
+ from athenaeum.ocr.base import OCRProvider
10
+
11
+
12
class MarkitdownOCR(OCRProvider):
    """Convert documents to markdown using markitdown."""

    _EXTENSIONS = {
        ".pdf", ".pptx", ".docx", ".xlsx",
        ".json", ".csv", ".txt", ".md",
        ".html", ".xml", ".rtf", ".epub",
    }

    def __init__(self) -> None:
        # A single converter instance is reused across calls.
        self._converter = MarkItDown()

    def convert(self, file_path: Path) -> str:
        """Convert ``file_path`` and return the markdown text."""
        return self._converter.convert(str(file_path)).text_content

    def supported_extensions(self) -> set[str]:
        """File types handled by markitdown."""
        return self._EXTENSIONS
@@ -0,0 +1,44 @@
1
+ """OCR provider using Mistral OCR API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ import os
7
+ from pathlib import Path
8
+
9
+ from athenaeum.ocr.base import OCRProvider
10
+
11
+
12
class MistralOCR(OCRProvider):
    """Convert documents to markdown using Mistral's OCR API."""

    _EXTENSIONS = {".pdf"}

    def __init__(self, api_key: str | None = None) -> None:
        """Create the API client.

        The key comes from ``api_key`` or the ``MISTRAL_API_KEY`` environment
        variable; a missing key fails fast rather than at first request.
        """
        try:
            from mistralai import Mistral
        except ImportError as e:
            raise ImportError(
                "Mistral SDK is not installed. Install with: pip install 'athenaeum-kb[mistral]'"
            ) from e
        key = api_key or os.environ.get("MISTRAL_API_KEY", "")
        if not key:
            raise ValueError("MISTRAL_API_KEY must be set or passed explicitly.")
        self._client = Mistral(api_key=key)

    def convert(self, file_path: Path) -> str:
        """Send the PDF as a base64 data URI and join the per-page markdown."""
        from mistralai import DocumentURLChunk, OCRRequest

        payload = base64.standard_b64encode(file_path.read_bytes()).decode()
        response = self._client.ocr.process(
            request=OCRRequest(
                document=DocumentURLChunk(
                    document_url=f"data:application/pdf;base64,{payload}"
                ),
                model="mistral-ocr-latest",
            )
        )
        return "\n\n".join(page.markdown for page in response.pages)

    def supported_extensions(self) -> set[str]:
        """Only PDFs are sent to the OCR API."""
        return self._EXTENSIONS
@@ -0,0 +1,7 @@
1
+ """Search subsystem for Athenaeum."""
2
+
3
+ from athenaeum.search.bm25 import BM25Index
4
+ from athenaeum.search.hybrid import reciprocal_rank_fusion
5
+ from athenaeum.search.vector import VectorIndex
6
+
7
+ __all__ = ["BM25Index", "VectorIndex", "reciprocal_rank_fusion"]
@@ -0,0 +1,72 @@
1
+ """BM25 keyword search index."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from rank_bm25 import BM25Okapi
8
+
9
+ from athenaeum.models import ChunkMetadata
10
+
11
+
12
@dataclass
class _Entry:
    """One indexed chunk plus its pre-tokenized text (reused on rebuilds)."""

    chunk: ChunkMetadata
    tokens: list[str]
16
+
17
+
18
class BM25Index:
    """In-memory BM25 index over document chunks."""

    def __init__(self) -> None:
        self._entries: list[_Entry] = []
        self._bm25: BM25Okapi | None = None

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Naive tokenizer: lowercase, split on whitespace."""
        return text.lower().split()

    def _rebuild(self) -> None:
        """Recompute BM25 statistics over the current entries."""
        if self._entries:
            self._bm25 = BM25Okapi([entry.tokens for entry in self._entries])
        else:
            self._bm25 = None

    def add_chunks(self, chunks: list[ChunkMetadata]) -> None:
        """Add chunks to the index and rebuild."""
        self._entries.extend(
            _Entry(chunk=chunk, tokens=self._tokenize(chunk.text)) for chunk in chunks
        )
        self._rebuild()

    def remove_document(self, doc_id: str) -> None:
        """Remove all chunks for a document and rebuild."""
        self._entries = [e for e in self._entries if e.chunk.doc_id != doc_id]
        self._rebuild()

    def search(
        self,
        query: str,
        top_k: int = 10,
        doc_id: str | None = None,
    ) -> list[tuple[ChunkMetadata, float]]:
        """Search the index, returning (chunk, score) pairs sorted by score descending.

        When ``doc_id`` is given, scoring still runs over the whole corpus
        but only that document's chunks are returned.
        """
        if self._bm25 is None or not self._entries:
            return []

        raw_scores = self._bm25.get_scores(self._tokenize(query))
        hits = [
            (entry.chunk, float(score))
            for entry, score in zip(self._entries, raw_scores)
            if doc_id is None or entry.chunk.doc_id == doc_id
        ]
        hits.sort(key=lambda pair: pair[1], reverse=True)
        return hits[:top_k]

    @property
    def size(self) -> int:
        """Number of indexed chunks."""
        return len(self._entries)
@@ -0,0 +1,37 @@
1
+ """Reciprocal Rank Fusion for combining ranked search results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from athenaeum.models import ChunkMetadata
6
+
7
+
8
def reciprocal_rank_fusion(
    ranked_lists: list[list[tuple[ChunkMetadata, float]]],
    k: int = 60,
    top_k: int = 10,
) -> list[tuple[ChunkMetadata, float]]:
    """Combine multiple ranked lists using Reciprocal Rank Fusion.

    RRF score for a chunk = sum over lists of 1 / (k + rank), where rank is
    1-indexed. Higher scores indicate better combined relevance.

    Args:
        ranked_lists: List of ranked result lists, each containing (chunk, score) pairs.
        k: RRF constant (default 60).
        top_k: Number of results to return.

    Returns:
        Merged and re-ranked list of (chunk, rrf_score) pairs.
    """
    fused: dict[str, float] = {}
    by_key: dict[str, ChunkMetadata] = {}

    for results in ranked_lists:
        # The per-list scores are deliberately ignored: RRF uses only ranks,
        # which makes BM25 and vector scores comparable without normalization.
        for position, (chunk, _) in enumerate(results):
            key = f"{chunk.doc_id}:{chunk.chunk_index}"
            fused[key] = fused.get(key, 0.0) + 1.0 / (k + position + 1)
            by_key[key] = chunk

    ranked = sorted(fused.items(), key=lambda item: item[1], reverse=True)
    return [(by_key[key], score) for key, score in ranked[:top_k]]
@@ -0,0 +1,72 @@
1
+ """Vector similarity search index using LangChain + Chroma."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from langchain_chroma import Chroma
8
+ from langchain_core.embeddings import Embeddings
9
+
10
+ from athenaeum.models import ChunkMetadata
11
+
12
+
13
class VectorIndex:
    """Vector similarity search backed by Chroma."""

    def __init__(
        self,
        embeddings: Embeddings,
        persist_directory: str | Path | None = None,
        collection_name: str = "athenaeum",
    ) -> None:
        """Open (or create) the Chroma collection.

        When ``persist_directory`` is None the store is in-memory only.
        """
        store_kwargs: dict[str, object] = {
            "embedding_function": embeddings,
            "collection_name": collection_name,
        }
        if persist_directory is not None:
            store_kwargs["persist_directory"] = str(persist_directory)
        self._store = Chroma(**store_kwargs)  # type: ignore[arg-type]
        self._embeddings = embeddings

    def add_chunks(self, chunks: list[ChunkMetadata]) -> None:
        """Add chunks to the vector store; IDs are ``doc_id:chunk_index``."""
        if not chunks:
            return
        self._store.add_texts(
            texts=[chunk.text for chunk in chunks],
            metadatas=[chunk.model_dump() for chunk in chunks],
            ids=[f"{chunk.doc_id}:{chunk.chunk_index}" for chunk in chunks],
        )

    def remove_document(self, doc_id: str) -> None:
        """Remove all chunks for a document from the vector store."""
        # NOTE(review): reaches into the wrapper's private `_collection` for
        # a metadata-filtered delete — confirm no public API covers this.
        self._store._collection.delete(where={"doc_id": doc_id})

    def search(
        self,
        query: str,
        top_k: int = 10,
        doc_id: str | None = None,
    ) -> list[tuple[ChunkMetadata, float]]:
        """Search for similar chunks, returning (chunk, score) pairs.

        Scores are similarity scores (higher = more similar). The chunk
        metadata stored alongside each vector is rehydrated into
        ``ChunkMetadata``.
        """
        search_kwargs: dict[str, object] = {"k": top_k}
        if doc_id is not None:
            search_kwargs["filter"] = {"doc_id": doc_id}

        hits = self._store.similarity_search_with_relevance_scores(query, **search_kwargs)  # type: ignore[arg-type]

        return [
            (
                ChunkMetadata(
                    doc_id=document.metadata["doc_id"],
                    chunk_index=document.metadata["chunk_index"],
                    start_line=document.metadata["start_line"],
                    end_line=document.metadata["end_line"],
                    text=document.metadata["text"],
                ),
                float(score),
            )
            for document, score in hits
        ]
athenaeum/storage.py ADDED
@@ -0,0 +1,68 @@
1
+ """Storage layout manager for Athenaeum."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import shutil
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
class StorageManager:
    """Manages the on-disk layout under the storage root.

    Layout::

        <root>/
            docs/<doc_id>/raw.*       # original file
            docs/<doc_id>/content.md  # converted markdown
            index/chroma/             # Chroma persistent directory
            metadata.json             # document registry
    """

    def __init__(self, root: Path) -> None:
        """Remember the root and make sure it exists."""
        self.root = root
        self.root.mkdir(parents=True, exist_ok=True)

    @property
    def docs_dir(self) -> Path:
        """Directory holding one subdirectory per document."""
        return self.root / "docs"

    @property
    def chroma_dir(self) -> Path:
        """Chroma's persistence directory."""
        return self.root / "index" / "chroma"

    @property
    def metadata_path(self) -> Path:
        """Path of the JSON document registry."""
        return self.root / "metadata.json"

    def doc_dir(self, doc_id: str) -> Path:
        """Return (and create if needed) the directory for one document."""
        directory = self.docs_dir / doc_id
        directory.mkdir(parents=True, exist_ok=True)
        return directory

    def raw_path(self, doc_id: str, suffix: str) -> Path:
        """Return path for storing the original file."""
        return self.doc_dir(doc_id) / f"raw{suffix}"

    def content_md_path(self, doc_id: str) -> Path:
        """Return path for the converted markdown."""
        return self.doc_dir(doc_id) / "content.md"

    def remove_doc(self, doc_id: str) -> None:
        """Remove a document's directory (no-op if it does not exist)."""
        directory = self.docs_dir / doc_id
        if directory.exists():
            shutil.rmtree(directory)

    def ensure_chroma_dir(self) -> Path:
        """Create the Chroma directory if needed and return it."""
        self.chroma_dir.mkdir(parents=True, exist_ok=True)
        return self.chroma_dir

    def load_metadata(self) -> dict[str, Any]:
        """Read the registry JSON; an absent file reads as an empty dict."""
        if not self.metadata_path.exists():
            return {}
        return json.loads(self.metadata_path.read_text())  # type: ignore[no-any-return]

    def save_metadata(self, data: dict[str, Any]) -> None:
        """Write the registry JSON (``default=str`` covers datetimes/paths)."""
        self.metadata_path.write_text(json.dumps(data, indent=2, default=str))
athenaeum/toc.py ADDED
@@ -0,0 +1,49 @@
1
+ """Table of contents extraction from markdown headings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from athenaeum.models import TOCEntry
8
+
9
# ATX headings: 1–6 '#' characters, whitespace, then the title text.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")


def extract_toc(markdown: str) -> list[TOCEntry]:
    """Extract a table of contents from markdown heading lines.

    Each entry records the heading level, title, and the line range it spans.
    Line numbers are 1-indexed. The ``end_line`` of each entry is the line
    before the next heading at the same or higher level, or the last line of
    the document when no such heading follows.
    """
    lines = markdown.split("\n")
    entries: list[TOCEntry] = []

    for line_no_0, line in enumerate(lines):
        m = _HEADING_RE.match(line.strip())
        if m:
            entries.append(
                TOCEntry(
                    title=m.group(2).strip(),
                    level=len(m.group(1)),
                    start_line=line_no_0 + 1,  # convert 0-index to 1-index
                    end_line=None,
                )
            )

    # Fill in end_line for each entry. `end` defaults to the document end;
    # the inner scan narrows it to just before the next heading at the same
    # or higher (numerically lower) level, if one exists. (The previous
    # version carried a dead for/else branch that recomputed and then
    # overwrote `end` with this same default.)
    total_lines = len(lines)
    for i, entry in enumerate(entries):
        end = total_lines
        for later in entries[i + 1 :]:
            if later.level <= entry.level:
                end = later.start_line - 1
                break
        entry.end_line = end

    return entries
@@ -0,0 +1,165 @@
1
+ Metadata-Version: 2.4
2
+ Name: athenaeum-kb
3
+ Version: 0.1.0
4
+ Summary: Tools for intelligent interaction with knowledge bases
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: langchain-chroma>=0.2
8
+ Requires-Dist: langchain-core>=0.3
9
+ Requires-Dist: langchain-openai>=0.3
10
+ Requires-Dist: markitdown>=0.1
11
+ Requires-Dist: pydantic>=2.0
12
+ Requires-Dist: rank-bm25>=0.2.2
13
+ Provides-Extra: all-ocr
14
+ Requires-Dist: docling>=2.0; extra == 'all-ocr'
15
+ Requires-Dist: mistralai>=1.0; extra == 'all-ocr'
16
+ Requires-Dist: pillow>=10.0; extra == 'all-ocr'
17
+ Requires-Dist: torch>=2.0; extra == 'all-ocr'
18
+ Requires-Dist: transformers>=4.40; extra == 'all-ocr'
19
+ Provides-Extra: dev
20
+ Requires-Dist: mypy>=1.10; extra == 'dev'
21
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
22
+ Requires-Dist: pytest>=8.0; extra == 'dev'
23
+ Requires-Dist: ruff>=0.4; extra == 'dev'
24
+ Provides-Extra: docling
25
+ Requires-Dist: docling>=2.0; extra == 'docling'
26
+ Provides-Extra: lighton
27
+ Requires-Dist: pillow>=10.0; extra == 'lighton'
28
+ Requires-Dist: torch>=2.0; extra == 'lighton'
29
+ Requires-Dist: transformers>=4.40; extra == 'lighton'
30
+ Provides-Extra: mistral
31
+ Requires-Dist: mistralai>=1.0; extra == 'mistral'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Athenaeum
35
+
36
+ ## Project Scope and Goals
37
+
38
+ The goal of this project is to build a Python library that equips AI agents with a robust set of tools for intelligent interaction with knowledge bases. The library focuses on document ingestion, semantic search, and structured access to content, making it suitable for agent-based systems, RAG pipelines, and automation workflows.
39
+
40
+ Once the module is fully tested and validated, the intended outcome is to package and publish it as a reusable Python library on PyPI.
41
+
42
+ ## Tools
43
+
44
+ ### `load_doc`
45
+ Load a document into the knowledge base, automatically extracting content, metadata, and embeddings.
46
+
47
+ ```python
48
+ load_doc(path: str) -> str
49
+ ```
50
+
51
+ **Parameters:**
52
+ - `path`: Path to the document file
53
+
54
+ **Supported formats:** PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB
55
+
56
+ **Returns**: A document identifier (doc_id) that can be used for subsequent operations.
57
+
58
+ ### `list_docs`
59
+ List all documents currently stored in the knowledge base.
60
+
61
+ ```python
62
+ list_docs() -> list[SearchHit]
63
+ ```
64
+
65
+ **Returns:** A list of documents, including metadata (id, name, format, etc.) and, when available, a table of contents.
66
+
67
+ ### `search_docs`
68
+
69
+ Search across all documents in the knowledge base.
70
+
71
+ ```python
72
+ search_docs(
73
+ query: str,
74
+ top_k: int = 10,
75
+ scope: Literal["names", "contents"] = "contents",
76
+ strategy: Literal["hybrid", "bm25", "vector"] = "hybrid"
77
+ ) -> list[SearchHit]
78
+ ```
79
+
80
+ **Parameters:**
81
+ - `query`: Search query text
82
+ - `top_k`: Maximum number of results (default: 10)
83
+ - `scope`: Where to search
84
+ - `"contents"`: Search within document contents (default)
85
+ - `"names"`: Search only document names
86
+ - `strategy`: Search strategy (only applies when scope is "contents")
87
+ - `"hybrid"`: Combines vector and BM25 search (default)
88
+ - `"bm25"`: Keyword-based search only
89
+ - `"vector"`: Semantic similarity search only
90
+
91
+ **Returns**: A ranked list of documents matching the query.
92
+
93
+ ### `search_doc_contents`
94
+
95
+ Search within a specific document.
96
+
97
+ ```python
98
+ search_doc_contents(
99
+ doc_id: str,
100
+ query: str,
101
+ top_k: int = 5,
102
+ strategy: Literal["hybrid", "bm25", "vector"] = "hybrid"
103
+ ) -> list[ContentSearchHit]
104
+ ```
105
+
106
+ **Parameters:**
107
+ - `doc_id`: Document identifier
108
+ - `query`: Search query text
109
+ - `top_k`: Maximum number of results (default: 5)
110
+ - `strategy`: Search strategy
111
+ - `"hybrid"`: Combines vector and BM25 search (default)
112
+ - `"bm25"`: Keyword-based search only
113
+ - `"vector"`: Semantic similarity search only
114
+
115
+ **Returns**: A list of matching content fragments with relevance scores.
116
+
117
+ ### `read_doc`
118
+
119
+ Read a specific range of lines from a document.
120
+
121
+ ```python
122
+ read_doc(
123
+ doc_id: str,
124
+ start_line: int = 1,
125
+ end_line: int = 100
126
+ ) -> Excerpt
127
+ ```
128
+
129
+ **Parameters:**
130
+ - `doc_id`: Document identifier
131
+ - `start_line`: Starting line number (1-indexed, default: 1)
132
+ - `end_line`: Ending line number (1-indexed, inclusive, default: 100)
133
+
134
+ **Returns**: A document excerpt containing the requested lines.
135
+
136
+ ## Search Strategies
137
+
138
+ - **Hybrid Search** (Default): Combines vector similarity search with BM25 keyword search using Reciprocal Rank Fusion (RRF).
139
+ - **Vector Search**: Uses embedding models for semantic similarity search.
140
+ - **BM25 Search**: Traditional keyword-based search using the BM25 algorithm.
141
+
142
+ ## Document Ingestion Workflow
143
+
144
+ The ingestion pipeline is triggered by the `load_doc(path)` function and follows these steps:
145
+ 1. Validation
146
+ - Verify that the file exists.
147
+ - Confirm that the file format is supported.
148
+ 2. Content Extraction
149
+ - Run an OCR or parsing pipeline to convert the raw file into Markdown.
150
+ - If the document contains images:
151
+ - Replace them with placeholders in the Markdown.
152
+ - Store image references for later retrieval.
153
+ 3. Pre-processing
154
+ - Generate document metadata (e.g., id, name, format).
155
+ - Build a table of contents (TOC) from Markdown headings when possible.
156
+ 4. Indexing
157
+ - Generate vector embeddings using the configured embedding model.
158
+ - Store embeddings in the vector database for semantic retrieval.
159
+
160
+ ## Data Models
161
+ A preliminary set of domain models is defined in `models.py`. These classes are exploratory and serve as a conceptual starting point for the project.
162
+
163
+ They are not considered final and may be refactored, renamed, or removed as the overall architecture of Athenaeum evolves and solidifies.
164
+
165
+
@@ -0,0 +1,22 @@
1
+ athenaeum/__init__.py,sha256=3oXIneyMISBVERMs4KuJOJ7MrjZj5I7lC19wQaTuzvA,585
2
+ athenaeum/athenaeum.py,sha256=kl-DZnm5qPyY-oVEcJIgZnprobWOumKPziSyz5YlaE8,9330
3
+ athenaeum/chunker.py,sha256=Sk2h5Z3oVhK6BPdXRMWVMONurwIsMgRkRIorbEPl_Po,2050
4
+ athenaeum/config.py,sha256=NcQa5t3sj0XacqmJgWLE88Ci9uHafWy6Z988WE36rqg,441
5
+ athenaeum/document_store.py,sha256=8llflG_k0UFjtr2Xb4YWnDDpFzFvfA_OQZ7WELyP6Qo,1611
6
+ athenaeum/models.py,sha256=ZeVQkPtS5WBPWkdSTYa-Z0ZZ9jtPdqDwnH16803gu54,3647
7
+ athenaeum/storage.py,sha256=4kV-wPtJ-MPS1ApPlr8W66bLYmtYGknnta4-lFHLXuo,2028
8
+ athenaeum/toc.py,sha256=qROrkhUg7JOlLpgHwuM2-JZd8eX2WO4iMsZh0E68zwY,1639
9
+ athenaeum/ocr/__init__.py,sha256=aG2NgzHL8TZXlfw3HDPZI8-t6gNJSTV1iLODIOSLEFE,1304
10
+ athenaeum/ocr/base.py,sha256=qdw--LAUPIloFWuKGaGLD7lPzNttcVL0HOjAJbZBqQk,716
11
+ athenaeum/ocr/custom.py,sha256=pddvXL-3aAR4Fo3nNDxfjVHO0D6xhTmpI6rLPDgazCU,672
12
+ athenaeum/ocr/docling.py,sha256=sPRu6ONiC6L2ZHi7RLYqFHFX6oLjRMsfsSR__9ssJeo,898
13
+ athenaeum/ocr/lighton.py,sha256=oArJ4BvbxByFTYu4HaokKXLngvQdX7_RJIZUlPe6WBc,1197
14
+ athenaeum/ocr/markitdown.py,sha256=M6bCoqsIvm_z_tLQeTq5ie-CaFihPspu0wZUIudrNZ8,727
15
+ athenaeum/ocr/mistral.py,sha256=XMGk1yLhwbdi_glzdsjLdOptbMwmc-86Fate3iSTwUI,1436
16
+ athenaeum/search/__init__.py,sha256=BCmd-4lIZmxVQZTC3GNt5-uz0pI8G9wd35MYmmIWdIA,256
17
+ athenaeum/search/bm25.py,sha256=b-SbtFaXwqt2IzLaytwwSVo8LBm_v0UotJeEeIsEgl8,2069
18
+ athenaeum/search/hybrid.py,sha256=a9kFNq6WBnNZtC6Rocx9ZI5BTe-f53KR8beaVC12cQU,1273
19
+ athenaeum/search/vector.py,sha256=Hu_7gbwiRLYlkpZGUnf0SD9LXWHZ7IxIUEls1w4blFo,2428
20
+ athenaeum_kb-0.1.0.dist-info/METADATA,sha256=5_5bjWHZ3XJLs5Da85lMEgDuNe1IeLB7CmgYFcI1IAo,5549
21
+ athenaeum_kb-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
22
+ athenaeum_kb-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any