athenaeum-kb 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- athenaeum_kb-0.2.0/.claude/settings.local.json +18 -0
- athenaeum_kb-0.2.0/PKG-INFO +309 -0
- athenaeum_kb-0.2.0/README.md +276 -0
- athenaeum_kb-0.2.0/pyproject.toml +69 -0
- athenaeum_kb-0.2.0/src/athenaeum/__init__.py +28 -0
- athenaeum_kb-0.2.0/src/athenaeum/athenaeum.py +339 -0
- athenaeum_kb-0.2.0/src/athenaeum/chunker.py +77 -0
- athenaeum_kb-0.2.0/src/athenaeum/config.py +16 -0
- athenaeum_kb-0.2.0/src/athenaeum/document_store.py +64 -0
- athenaeum_kb-0.2.0/src/athenaeum/models.py +90 -0
- athenaeum_kb-0.2.0/src/athenaeum/ocr/__init__.py +48 -0
- athenaeum_kb-0.2.0/src/athenaeum/ocr/base.py +28 -0
- athenaeum_kb-0.2.0/src/athenaeum/ocr/custom.py +26 -0
- athenaeum_kb-0.2.0/src/athenaeum/ocr/docling.py +30 -0
- athenaeum_kb-0.2.0/src/athenaeum/ocr/lighton.py +40 -0
- athenaeum_kb-0.2.0/src/athenaeum/ocr/markitdown.py +29 -0
- athenaeum_kb-0.2.0/src/athenaeum/ocr/mistral.py +44 -0
- athenaeum_kb-0.2.0/src/athenaeum/search/__init__.py +7 -0
- athenaeum_kb-0.2.0/src/athenaeum/search/bm25.py +75 -0
- athenaeum_kb-0.2.0/src/athenaeum/search/hybrid.py +37 -0
- athenaeum_kb-0.2.0/src/athenaeum/search/vector.py +75 -0
- athenaeum_kb-0.2.0/src/athenaeum/storage.py +68 -0
- athenaeum_kb-0.2.0/src/athenaeum/toc.py +49 -0
- athenaeum_kb-0.2.0/tests/__init__.py +0 -0
- athenaeum_kb-0.2.0/tests/conftest.py +24 -0
- athenaeum_kb-0.2.0/tests/fixtures/sample.md +34 -0
- athenaeum_kb-0.2.0/tests/fixtures/sample.txt +5 -0
- athenaeum_kb-0.2.0/tests/test_athenaeum.py +192 -0
- athenaeum_kb-0.2.0/tests/test_bm25.py +59 -0
- athenaeum_kb-0.2.0/tests/test_chunker.py +45 -0
- athenaeum_kb-0.2.0/tests/test_hybrid.py +38 -0
- athenaeum_kb-0.2.0/tests/test_models.py +72 -0
- athenaeum_kb-0.2.0/tests/test_ocr_base.py +61 -0
- athenaeum_kb-0.2.0/tests/test_storage.py +59 -0
- athenaeum_kb-0.2.0/tests/test_toc.py +43 -0
- athenaeum_kb-0.2.0/tests/test_vector.py +87 -0
- athenaeum_kb-0.2.0/uv.lock +4487 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"WebSearch",
|
|
5
|
+
"Bash(uv sync:*)",
|
|
6
|
+
"Bash(uv run pytest:*)",
|
|
7
|
+
"Bash(uv run ruff check:*)",
|
|
8
|
+
"Bash(uv run mypy:*)",
|
|
9
|
+
"Bash(uv run python:*)",
|
|
10
|
+
"Bash(uv build:*)",
|
|
11
|
+
"Bash(tree:*)",
|
|
12
|
+
"Bash(python -m pytest tests/ -v)",
|
|
13
|
+
"Bash(python3 -m pytest:*)",
|
|
14
|
+
"Bash(.venv/bin/python -m pytest:*)",
|
|
15
|
+
"Bash(.venv/bin/python:*)"
|
|
16
|
+
]
|
|
17
|
+
}
|
|
18
|
+
}
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: athenaeum-kb
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Tools for intelligent interaction with knowledge bases
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: langchain-chroma>=0.2
|
|
8
|
+
Requires-Dist: langchain-core>=0.3
|
|
9
|
+
Requires-Dist: langchain-openai>=0.3
|
|
10
|
+
Requires-Dist: markitdown>=0.1
|
|
11
|
+
Requires-Dist: pydantic>=2.0
|
|
12
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
13
|
+
Provides-Extra: all-ocr
|
|
14
|
+
Requires-Dist: docling>=2.0; extra == 'all-ocr'
|
|
15
|
+
Requires-Dist: mistralai>=1.0; extra == 'all-ocr'
|
|
16
|
+
Requires-Dist: pillow>=10.0; extra == 'all-ocr'
|
|
17
|
+
Requires-Dist: torch>=2.0; extra == 'all-ocr'
|
|
18
|
+
Requires-Dist: transformers>=4.40; extra == 'all-ocr'
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
21
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
24
|
+
Provides-Extra: docling
|
|
25
|
+
Requires-Dist: docling>=2.0; extra == 'docling'
|
|
26
|
+
Provides-Extra: lighton
|
|
27
|
+
Requires-Dist: pillow>=10.0; extra == 'lighton'
|
|
28
|
+
Requires-Dist: torch>=2.0; extra == 'lighton'
|
|
29
|
+
Requires-Dist: transformers>=4.40; extra == 'lighton'
|
|
30
|
+
Provides-Extra: mistral
|
|
31
|
+
Requires-Dist: mistralai>=1.0; extra == 'mistral'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# Athenaeum
|
|
35
|
+
|
|
36
|
+
A Python library that equips AI agents with tools for intelligent interaction with knowledge bases. Athenaeum handles document ingestion, semantic search, and structured content access, making it suitable for agent-based systems, RAG pipelines, and automation workflows.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install athenaeum-kb
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Optional OCR backends
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install "athenaeum-kb[docling]" # Docling document converter
|
|
48
|
+
pip install "athenaeum-kb[mistral]" # Mistral cloud OCR (PDF only)
|
|
49
|
+
pip install "athenaeum-kb[lighton]" # LightOn local model (PDF + images)
|
|
50
|
+
pip install "athenaeum-kb[all-ocr]" # All OCR backends
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quick start
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from langchain_openai import OpenAIEmbeddings
|
|
57
|
+
from athenaeum import Athenaeum, AthenaeumConfig
|
|
58
|
+
|
|
59
|
+
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
|
60
|
+
kb = Athenaeum(embeddings=embeddings)
|
|
61
|
+
|
|
62
|
+
# Load a document
|
|
63
|
+
doc_id = kb.load_doc("report.pdf")
|
|
64
|
+
|
|
65
|
+
# Search across all documents
|
|
66
|
+
hits = kb.search_docs("quarterly revenue", top_k=5)
|
|
67
|
+
|
|
68
|
+
# Search within a specific document
|
|
69
|
+
chunks = kb.search_doc_contents(doc_id, "executive summary")
|
|
70
|
+
|
|
71
|
+
# Read specific lines
|
|
72
|
+
excerpt = kb.read_doc(doc_id, start_line=1, end_line=50)
|
|
73
|
+
|
|
74
|
+
# List all loaded documents
|
|
75
|
+
docs = kb.list_docs()
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Tools
|
|
79
|
+
|
|
80
|
+
### `load_doc`
|
|
81
|
+
|
|
82
|
+
Load a document into the knowledge base, automatically extracting content, metadata, and embeddings.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
load_doc(path: str, tags: set[str] | None = None) -> str
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**Parameters:**
|
|
89
|
+
- `path`: Path to the document file
|
|
90
|
+
- `tags`: Optional set of tags to assign to the document
|
|
91
|
+
|
|
92
|
+
**Supported formats:** PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB
|
|
93
|
+
|
|
94
|
+
**Returns:** A document identifier (`doc_id`) for subsequent operations.
|
|
95
|
+
|
|
96
|
+
### `list_docs`
|
|
97
|
+
|
|
98
|
+
List all documents currently stored in the knowledge base.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
list_docs(tags: set[str] | None = None) -> list[SearchHit]
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Parameters:**
|
|
105
|
+
- `tags`: Optional set of tags to filter by (OR semantics)
|
|
106
|
+
|
|
107
|
+
**Returns:** A list of documents with metadata (id, name, line count, table of contents, tags) and relevance scores.
|
|
108
|
+
|
|
109
|
+
### `search_docs`
|
|
110
|
+
|
|
111
|
+
Search across all documents in the knowledge base.
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
search_docs(
|
|
115
|
+
query: str,
|
|
116
|
+
top_k: int = 10,
|
|
117
|
+
scope: Literal["names", "contents"] = "contents",
|
|
118
|
+
strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
|
|
119
|
+
tags: set[str] | None = None,
|
|
120
|
+
) -> list[SearchHit]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
**Parameters:**
|
|
124
|
+
- `query`: Search query text
|
|
125
|
+
- `top_k`: Maximum number of results (default: 10)
|
|
126
|
+
- `tags`: Optional set of tags to filter by (OR semantics)
|
|
127
|
+
- `scope`: Where to search
|
|
128
|
+
- `"contents"`: Search within document contents (default)
|
|
129
|
+
- `"names"`: Search only document names
|
|
130
|
+
- `strategy`: Search strategy (only applies when scope is `"contents"`)
|
|
131
|
+
- `"hybrid"`: Combines vector and BM25 search (default)
|
|
132
|
+
- `"bm25"`: Keyword-based search only
|
|
133
|
+
- `"vector"`: Semantic similarity search only
|
|
134
|
+
|
|
135
|
+
**Returns:** A ranked list of `SearchHit` objects matching the query.
|
|
136
|
+
|
|
137
|
+
### `search_doc_contents`
|
|
138
|
+
|
|
139
|
+
Search within a specific document.
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
search_doc_contents(
|
|
143
|
+
doc_id: str,
|
|
144
|
+
query: str,
|
|
145
|
+
top_k: int = 5,
|
|
146
|
+
strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
|
|
147
|
+
) -> list[ContentSearchHit]
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Parameters:**
|
|
151
|
+
- `doc_id`: Document identifier
|
|
152
|
+
- `query`: Search query text
|
|
153
|
+
- `top_k`: Maximum number of results (default: 5)
|
|
154
|
+
- `strategy`: Search strategy (`"hybrid"`, `"bm25"`, or `"vector"`)
|
|
155
|
+
|
|
156
|
+
**Returns:** A list of matching content fragments with line ranges and relevance scores.
|
|
157
|
+
|
|
158
|
+
### `read_doc`
|
|
159
|
+
|
|
160
|
+
Read a specific range of lines from a document.
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
read_doc(
|
|
164
|
+
doc_id: str,
|
|
165
|
+
start_line: int = 1,
|
|
166
|
+
end_line: int = 100,
|
|
167
|
+
) -> Excerpt
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Parameters:**
|
|
171
|
+
- `doc_id`: Document identifier
|
|
172
|
+
- `start_line`: Starting line number (1-indexed, default: 1)
|
|
173
|
+
- `end_line`: Ending line number (1-indexed, inclusive, default: 100)
|
|
174
|
+
|
|
175
|
+
**Returns:** An `Excerpt` containing the requested lines.
|
|
176
|
+
|
|
177
|
+
## Configuration
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from pathlib import Path
|
|
181
|
+
from athenaeum import Athenaeum, AthenaeumConfig
|
|
182
|
+
|
|
183
|
+
config = AthenaeumConfig(
|
|
184
|
+
storage_dir=Path.home() / ".athenaeum", # Where to store documents and indexes
|
|
185
|
+
chunk_size=80, # Lines per chunk
|
|
186
|
+
chunk_overlap=20, # Overlapping lines between chunks
|
|
187
|
+
rrf_k=60, # RRF constant for hybrid search
|
|
188
|
+
default_strategy="hybrid", # Default search strategy
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
kb = Athenaeum(embeddings=embeddings, config=config)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## OCR backends
|
|
195
|
+
|
|
196
|
+
Athenaeum supports multiple document-to-markdown converters:
|
|
197
|
+
|
|
198
|
+
| Backend | Formats | Notes |
|
|
199
|
+
|---------|---------|-------|
|
|
200
|
+
| **markitdown** (default) | PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB | Included in base install |
|
|
201
|
+
| **docling** | PDF, PPTX, DOCX, XLSX, HTML, MD | `pip install athenaeum-kb[docling]` |
|
|
202
|
+
| **mistral** | PDF | Cloud API, requires `MISTRAL_API_KEY` |
|
|
203
|
+
| **lighton** | PDF, PNG, JPG, JPEG, TIFF, BMP | Local transformer model, supports GPU |
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from athenaeum import Athenaeum, get_ocr_provider
|
|
207
|
+
|
|
208
|
+
ocr = get_ocr_provider("docling")
|
|
209
|
+
kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Custom OCR provider
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from athenaeum.ocr import CustomOCR
|
|
216
|
+
from pathlib import Path
|
|
217
|
+
|
|
218
|
+
def my_converter(file_path: Path) -> str:
|
|
219
|
+
return "markdown content"
|
|
220
|
+
|
|
221
|
+
ocr = CustomOCR(fn=my_converter, extensions={".custom"})
|
|
222
|
+
kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Tags
|
|
226
|
+
|
|
227
|
+
Documents can be tagged with free-form labels for filtering. Tags use OR semantics: filtering by `{"a", "b"}` returns documents tagged with either `a` or `b` (or both).
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
# Load with tags
|
|
231
|
+
doc_id = kb.load_doc("report.pdf", tags={"finance", "Q4"})
|
|
232
|
+
|
|
233
|
+
# Add/remove tags later
|
|
234
|
+
kb.tag_doc(doc_id, {"important"})
|
|
235
|
+
kb.untag_doc(doc_id, {"Q4"})
|
|
236
|
+
|
|
237
|
+
# List all tags in the knowledge base
|
|
238
|
+
all_tags = kb.list_tags() # {"finance", "important"}
|
|
239
|
+
|
|
240
|
+
# Filter list_docs by tags
|
|
241
|
+
finance_docs = kb.list_docs(tags={"finance"})
|
|
242
|
+
|
|
243
|
+
# Filter search_docs by tags
|
|
244
|
+
hits = kb.search_docs("revenue", tags={"finance"})
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### `tag_doc`
|
|
248
|
+
|
|
249
|
+
Add tags to an existing document.
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
tag_doc(doc_id: str, tags: set[str]) -> None
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
### `untag_doc`
|
|
256
|
+
|
|
257
|
+
Remove tags from an existing document.
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
untag_doc(doc_id: str, tags: set[str]) -> None
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### `list_tags`
|
|
264
|
+
|
|
265
|
+
Return all tags across all documents.
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
list_tags() -> set[str]
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## Search strategies
|
|
272
|
+
|
|
273
|
+
- **Hybrid** (default): Combines vector similarity and BM25 keyword search using Reciprocal Rank Fusion (RRF).
|
|
274
|
+
- **Vector**: Semantic similarity search via embeddings (Chroma-backed).
|
|
275
|
+
- **BM25**: Traditional keyword-based ranking using the BM25Okapi algorithm.
|
|
276
|
+
|
|
277
|
+
## Document ingestion workflow
|
|
278
|
+
|
|
279
|
+
When `load_doc(path)` is called:
|
|
280
|
+
|
|
281
|
+
1. **Validation** -- verify the file exists and the format is supported.
|
|
282
|
+
2. **Content extraction** -- convert the file to Markdown using the configured OCR backend.
|
|
283
|
+
3. **Pre-processing** -- generate metadata, extract a table of contents from headings, and chunk the Markdown with heading-aware boundary snapping.
|
|
284
|
+
4. **Indexing** -- generate vector embeddings and store them in Chroma; add chunks to the BM25 index.
|
|
285
|
+
|
|
286
|
+
## Data models
|
|
287
|
+
|
|
288
|
+
| Model | Description |
|
|
289
|
+
|-------|-------------|
|
|
290
|
+
| `Document` | Full document record (id, name, paths, line count, TOC, timestamps) |
|
|
291
|
+
| `SearchHit` | Document-level search result with score and snippet |
|
|
292
|
+
| `ContentSearchHit` | Within-document search result with line range and text |
|
|
293
|
+
| `Excerpt` | Text fragment from `read_doc` |
|
|
294
|
+
| `TOCEntry` | Table of contents entry (title, level, line range) |
|
|
295
|
+
| `ChunkMetadata` | Internal chunk metadata for indexing |
|
|
296
|
+
| `Metadata` | Lightweight id + name pair |
|
|
297
|
+
|
|
298
|
+
## Development
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
pip install -e ".[dev]"
|
|
302
|
+
pytest
|
|
303
|
+
ruff check src/
|
|
304
|
+
mypy src/
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
## License
|
|
308
|
+
|
|
309
|
+
MIT
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# Athenaeum
|
|
2
|
+
|
|
3
|
+
A Python library that equips AI agents with tools for intelligent interaction with knowledge bases. Athenaeum handles document ingestion, semantic search, and structured content access, making it suitable for agent-based systems, RAG pipelines, and automation workflows.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install athenaeum-kb
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
### Optional OCR backends
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install "athenaeum-kb[docling]" # Docling document converter
|
|
15
|
+
pip install "athenaeum-kb[mistral]" # Mistral cloud OCR (PDF only)
|
|
16
|
+
pip install "athenaeum-kb[lighton]" # LightOn local model (PDF + images)
|
|
17
|
+
pip install "athenaeum-kb[all-ocr]" # All OCR backends
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick start
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from langchain_openai import OpenAIEmbeddings
|
|
24
|
+
from athenaeum import Athenaeum, AthenaeumConfig
|
|
25
|
+
|
|
26
|
+
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
|
27
|
+
kb = Athenaeum(embeddings=embeddings)
|
|
28
|
+
|
|
29
|
+
# Load a document
|
|
30
|
+
doc_id = kb.load_doc("report.pdf")
|
|
31
|
+
|
|
32
|
+
# Search across all documents
|
|
33
|
+
hits = kb.search_docs("quarterly revenue", top_k=5)
|
|
34
|
+
|
|
35
|
+
# Search within a specific document
|
|
36
|
+
chunks = kb.search_doc_contents(doc_id, "executive summary")
|
|
37
|
+
|
|
38
|
+
# Read specific lines
|
|
39
|
+
excerpt = kb.read_doc(doc_id, start_line=1, end_line=50)
|
|
40
|
+
|
|
41
|
+
# List all loaded documents
|
|
42
|
+
docs = kb.list_docs()
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Tools
|
|
46
|
+
|
|
47
|
+
### `load_doc`
|
|
48
|
+
|
|
49
|
+
Load a document into the knowledge base, automatically extracting content, metadata, and embeddings.
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
load_doc(path: str, tags: set[str] | None = None) -> str
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**Parameters:**
|
|
56
|
+
- `path`: Path to the document file
|
|
57
|
+
- `tags`: Optional set of tags to assign to the document
|
|
58
|
+
|
|
59
|
+
**Supported formats:** PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB
|
|
60
|
+
|
|
61
|
+
**Returns:** A document identifier (`doc_id`) for subsequent operations.
|
|
62
|
+
|
|
63
|
+
### `list_docs`
|
|
64
|
+
|
|
65
|
+
List all documents currently stored in the knowledge base.
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
list_docs(tags: set[str] | None = None) -> list[SearchHit]
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**Parameters:**
|
|
72
|
+
- `tags`: Optional set of tags to filter by (OR semantics)
|
|
73
|
+
|
|
74
|
+
**Returns:** A list of documents with metadata (id, name, line count, table of contents, tags) and relevance scores.
|
|
75
|
+
|
|
76
|
+
### `search_docs`
|
|
77
|
+
|
|
78
|
+
Search across all documents in the knowledge base.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
search_docs(
|
|
82
|
+
query: str,
|
|
83
|
+
top_k: int = 10,
|
|
84
|
+
scope: Literal["names", "contents"] = "contents",
|
|
85
|
+
strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
|
|
86
|
+
tags: set[str] | None = None,
|
|
87
|
+
) -> list[SearchHit]
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Parameters:**
|
|
91
|
+
- `query`: Search query text
|
|
92
|
+
- `top_k`: Maximum number of results (default: 10)
|
|
93
|
+
- `tags`: Optional set of tags to filter by (OR semantics)
|
|
94
|
+
- `scope`: Where to search
|
|
95
|
+
- `"contents"`: Search within document contents (default)
|
|
96
|
+
- `"names"`: Search only document names
|
|
97
|
+
- `strategy`: Search strategy (only applies when scope is `"contents"`)
|
|
98
|
+
- `"hybrid"`: Combines vector and BM25 search (default)
|
|
99
|
+
- `"bm25"`: Keyword-based search only
|
|
100
|
+
- `"vector"`: Semantic similarity search only
|
|
101
|
+
|
|
102
|
+
**Returns:** A ranked list of `SearchHit` objects matching the query.
|
|
103
|
+
|
|
104
|
+
### `search_doc_contents`
|
|
105
|
+
|
|
106
|
+
Search within a specific document.
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
search_doc_contents(
|
|
110
|
+
doc_id: str,
|
|
111
|
+
query: str,
|
|
112
|
+
top_k: int = 5,
|
|
113
|
+
strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
|
|
114
|
+
) -> list[ContentSearchHit]
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Parameters:**
|
|
118
|
+
- `doc_id`: Document identifier
|
|
119
|
+
- `query`: Search query text
|
|
120
|
+
- `top_k`: Maximum number of results (default: 5)
|
|
121
|
+
- `strategy`: Search strategy (`"hybrid"`, `"bm25"`, or `"vector"`)
|
|
122
|
+
|
|
123
|
+
**Returns:** A list of matching content fragments with line ranges and relevance scores.
|
|
124
|
+
|
|
125
|
+
### `read_doc`
|
|
126
|
+
|
|
127
|
+
Read a specific range of lines from a document.
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
read_doc(
|
|
131
|
+
doc_id: str,
|
|
132
|
+
start_line: int = 1,
|
|
133
|
+
end_line: int = 100,
|
|
134
|
+
) -> Excerpt
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**Parameters:**
|
|
138
|
+
- `doc_id`: Document identifier
|
|
139
|
+
- `start_line`: Starting line number (1-indexed, default: 1)
|
|
140
|
+
- `end_line`: Ending line number (1-indexed, inclusive, default: 100)
|
|
141
|
+
|
|
142
|
+
**Returns:** An `Excerpt` containing the requested lines.
|
|
143
|
+
|
|
144
|
+
## Configuration
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from pathlib import Path
|
|
148
|
+
from athenaeum import Athenaeum, AthenaeumConfig
|
|
149
|
+
|
|
150
|
+
config = AthenaeumConfig(
|
|
151
|
+
storage_dir=Path.home() / ".athenaeum", # Where to store documents and indexes
|
|
152
|
+
chunk_size=80, # Lines per chunk
|
|
153
|
+
chunk_overlap=20, # Overlapping lines between chunks
|
|
154
|
+
rrf_k=60, # RRF constant for hybrid search
|
|
155
|
+
default_strategy="hybrid", # Default search strategy
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
kb = Athenaeum(embeddings=embeddings, config=config)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## OCR backends
|
|
162
|
+
|
|
163
|
+
Athenaeum supports multiple document-to-markdown converters:
|
|
164
|
+
|
|
165
|
+
| Backend | Formats | Notes |
|
|
166
|
+
|---------|---------|-------|
|
|
167
|
+
| **markitdown** (default) | PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB | Included in base install |
|
|
168
|
+
| **docling** | PDF, PPTX, DOCX, XLSX, HTML, MD | `pip install athenaeum-kb[docling]` |
|
|
169
|
+
| **mistral** | PDF | Cloud API, requires `MISTRAL_API_KEY` |
|
|
170
|
+
| **lighton** | PDF, PNG, JPG, JPEG, TIFF, BMP | Local transformer model, supports GPU |
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
from athenaeum import Athenaeum, get_ocr_provider
|
|
174
|
+
|
|
175
|
+
ocr = get_ocr_provider("docling")
|
|
176
|
+
kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Custom OCR provider
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from athenaeum.ocr import CustomOCR
|
|
183
|
+
from pathlib import Path
|
|
184
|
+
|
|
185
|
+
def my_converter(file_path: Path) -> str:
|
|
186
|
+
return "markdown content"
|
|
187
|
+
|
|
188
|
+
ocr = CustomOCR(fn=my_converter, extensions={".custom"})
|
|
189
|
+
kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Tags
|
|
193
|
+
|
|
194
|
+
Documents can be tagged with free-form labels for filtering. Tags use OR semantics: filtering by `{"a", "b"}` returns documents tagged with either `a` or `b` (or both).
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
# Load with tags
|
|
198
|
+
doc_id = kb.load_doc("report.pdf", tags={"finance", "Q4"})
|
|
199
|
+
|
|
200
|
+
# Add/remove tags later
|
|
201
|
+
kb.tag_doc(doc_id, {"important"})
|
|
202
|
+
kb.untag_doc(doc_id, {"Q4"})
|
|
203
|
+
|
|
204
|
+
# List all tags in the knowledge base
|
|
205
|
+
all_tags = kb.list_tags() # {"finance", "important"}
|
|
206
|
+
|
|
207
|
+
# Filter list_docs by tags
|
|
208
|
+
finance_docs = kb.list_docs(tags={"finance"})
|
|
209
|
+
|
|
210
|
+
# Filter search_docs by tags
|
|
211
|
+
hits = kb.search_docs("revenue", tags={"finance"})
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### `tag_doc`
|
|
215
|
+
|
|
216
|
+
Add tags to an existing document.
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
tag_doc(doc_id: str, tags: set[str]) -> None
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### `untag_doc`
|
|
223
|
+
|
|
224
|
+
Remove tags from an existing document.
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
untag_doc(doc_id: str, tags: set[str]) -> None
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### `list_tags`
|
|
231
|
+
|
|
232
|
+
Return all tags across all documents.
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
list_tags() -> set[str]
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Search strategies
|
|
239
|
+
|
|
240
|
+
- **Hybrid** (default): Combines vector similarity and BM25 keyword search using Reciprocal Rank Fusion (RRF).
|
|
241
|
+
- **Vector**: Semantic similarity search via embeddings (Chroma-backed).
|
|
242
|
+
- **BM25**: Traditional keyword-based ranking using the BM25Okapi algorithm.
|
|
243
|
+
|
|
244
|
+
## Document ingestion workflow
|
|
245
|
+
|
|
246
|
+
When `load_doc(path)` is called:
|
|
247
|
+
|
|
248
|
+
1. **Validation** -- verify the file exists and the format is supported.
|
|
249
|
+
2. **Content extraction** -- convert the file to Markdown using the configured OCR backend.
|
|
250
|
+
3. **Pre-processing** -- generate metadata, extract a table of contents from headings, and chunk the Markdown with heading-aware boundary snapping.
|
|
251
|
+
4. **Indexing** -- generate vector embeddings and store them in Chroma; add chunks to the BM25 index.
|
|
252
|
+
|
|
253
|
+
## Data models
|
|
254
|
+
|
|
255
|
+
| Model | Description |
|
|
256
|
+
|-------|-------------|
|
|
257
|
+
| `Document` | Full document record (id, name, paths, line count, TOC, timestamps) |
|
|
258
|
+
| `SearchHit` | Document-level search result with score and snippet |
|
|
259
|
+
| `ContentSearchHit` | Within-document search result with line range and text |
|
|
260
|
+
| `Excerpt` | Text fragment from `read_doc` |
|
|
261
|
+
| `TOCEntry` | Table of contents entry (title, level, line range) |
|
|
262
|
+
| `ChunkMetadata` | Internal chunk metadata for indexing |
|
|
263
|
+
| `Metadata` | Lightweight id + name pair |
|
|
264
|
+
|
|
265
|
+
## Development
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
pip install -e ".[dev]"
|
|
269
|
+
pytest
|
|
270
|
+
ruff check src/
|
|
271
|
+
mypy src/
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
## License
|
|
275
|
+
|
|
276
|
+
MIT
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "athenaeum-kb"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Tools for intelligent interaction with knowledge bases"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"pydantic>=2.0",
|
|
14
|
+
"langchain-core>=0.3",
|
|
15
|
+
"langchain-chroma>=0.2",
|
|
16
|
+
"langchain-openai>=0.3",
|
|
17
|
+
"rank-bm25>=0.2.2",
|
|
18
|
+
"markitdown>=0.1",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
docling = ["docling>=2.0"]
|
|
23
|
+
mistral = ["mistralai>=1.0"]
|
|
24
|
+
lighton = ["transformers>=4.40", "torch>=2.0", "Pillow>=10.0"]
|
|
25
|
+
all-ocr = [
|
|
26
|
+
"athenaeum-kb[docling]",
|
|
27
|
+
"athenaeum-kb[mistral]",
|
|
28
|
+
"athenaeum-kb[lighton]",
|
|
29
|
+
]
|
|
30
|
+
dev = [
|
|
31
|
+
"pytest>=8.0",
|
|
32
|
+
"pytest-asyncio>=0.23",
|
|
33
|
+
"ruff>=0.4",
|
|
34
|
+
"mypy>=1.10",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[tool.hatch.build.targets.wheel]
|
|
38
|
+
packages = ["src/athenaeum"]
|
|
39
|
+
|
|
40
|
+
[tool.ruff]
|
|
41
|
+
target-version = "py311"
|
|
42
|
+
line-length = 99
|
|
43
|
+
|
|
44
|
+
[tool.ruff.lint]
|
|
45
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
46
|
+
|
|
47
|
+
[tool.mypy]
|
|
48
|
+
python_version = "3.11"
|
|
49
|
+
strict = true
|
|
50
|
+
warn_return_any = true
|
|
51
|
+
warn_unused_configs = true
|
|
52
|
+
|
|
53
|
+
[[tool.mypy.overrides]]
|
|
54
|
+
module = [
|
|
55
|
+
"rank_bm25",
|
|
56
|
+
"rank_bm25.*",
|
|
57
|
+
"mistralai",
|
|
58
|
+
"mistralai.*",
|
|
59
|
+
"transformers",
|
|
60
|
+
"transformers.*",
|
|
61
|
+
"PIL",
|
|
62
|
+
"PIL.*",
|
|
63
|
+
"docling",
|
|
64
|
+
"docling.*",
|
|
65
|
+
]
|
|
66
|
+
ignore_missing_imports = true
|
|
67
|
+
|
|
68
|
+
[tool.pytest.ini_options]
|
|
69
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Athenaeum - Tools for intelligent interaction with knowledge bases."""
|
|
2
|
+
|
|
3
|
+
from athenaeum.athenaeum import Athenaeum
|
|
4
|
+
from athenaeum.config import AthenaeumConfig
|
|
5
|
+
from athenaeum.models import (
|
|
6
|
+
ChunkMetadata,
|
|
7
|
+
ContentSearchHit,
|
|
8
|
+
Document,
|
|
9
|
+
Excerpt,
|
|
10
|
+
Metadata,
|
|
11
|
+
SearchHit,
|
|
12
|
+
TOCEntry,
|
|
13
|
+
)
|
|
14
|
+
from athenaeum.ocr import OCRProvider, get_ocr_provider
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"Athenaeum",
|
|
18
|
+
"AthenaeumConfig",
|
|
19
|
+
"ChunkMetadata",
|
|
20
|
+
"ContentSearchHit",
|
|
21
|
+
"Document",
|
|
22
|
+
"Excerpt",
|
|
23
|
+
"Metadata",
|
|
24
|
+
"OCRProvider",
|
|
25
|
+
"SearchHit",
|
|
26
|
+
"TOCEntry",
|
|
27
|
+
"get_ocr_provider",
|
|
28
|
+
]
|