athenaeum-kb 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. athenaeum_kb-0.2.0/.claude/settings.local.json +18 -0
  2. athenaeum_kb-0.2.0/PKG-INFO +309 -0
  3. athenaeum_kb-0.2.0/README.md +276 -0
  4. athenaeum_kb-0.2.0/pyproject.toml +69 -0
  5. athenaeum_kb-0.2.0/src/athenaeum/__init__.py +28 -0
  6. athenaeum_kb-0.2.0/src/athenaeum/athenaeum.py +339 -0
  7. athenaeum_kb-0.2.0/src/athenaeum/chunker.py +77 -0
  8. athenaeum_kb-0.2.0/src/athenaeum/config.py +16 -0
  9. athenaeum_kb-0.2.0/src/athenaeum/document_store.py +64 -0
  10. athenaeum_kb-0.2.0/src/athenaeum/models.py +90 -0
  11. athenaeum_kb-0.2.0/src/athenaeum/ocr/__init__.py +48 -0
  12. athenaeum_kb-0.2.0/src/athenaeum/ocr/base.py +28 -0
  13. athenaeum_kb-0.2.0/src/athenaeum/ocr/custom.py +26 -0
  14. athenaeum_kb-0.2.0/src/athenaeum/ocr/docling.py +30 -0
  15. athenaeum_kb-0.2.0/src/athenaeum/ocr/lighton.py +40 -0
  16. athenaeum_kb-0.2.0/src/athenaeum/ocr/markitdown.py +29 -0
  17. athenaeum_kb-0.2.0/src/athenaeum/ocr/mistral.py +44 -0
  18. athenaeum_kb-0.2.0/src/athenaeum/search/__init__.py +7 -0
  19. athenaeum_kb-0.2.0/src/athenaeum/search/bm25.py +75 -0
  20. athenaeum_kb-0.2.0/src/athenaeum/search/hybrid.py +37 -0
  21. athenaeum_kb-0.2.0/src/athenaeum/search/vector.py +75 -0
  22. athenaeum_kb-0.2.0/src/athenaeum/storage.py +68 -0
  23. athenaeum_kb-0.2.0/src/athenaeum/toc.py +49 -0
  24. athenaeum_kb-0.2.0/tests/__init__.py +0 -0
  25. athenaeum_kb-0.2.0/tests/conftest.py +24 -0
  26. athenaeum_kb-0.2.0/tests/fixtures/sample.md +34 -0
  27. athenaeum_kb-0.2.0/tests/fixtures/sample.txt +5 -0
  28. athenaeum_kb-0.2.0/tests/test_athenaeum.py +192 -0
  29. athenaeum_kb-0.2.0/tests/test_bm25.py +59 -0
  30. athenaeum_kb-0.2.0/tests/test_chunker.py +45 -0
  31. athenaeum_kb-0.2.0/tests/test_hybrid.py +38 -0
  32. athenaeum_kb-0.2.0/tests/test_models.py +72 -0
  33. athenaeum_kb-0.2.0/tests/test_ocr_base.py +61 -0
  34. athenaeum_kb-0.2.0/tests/test_storage.py +59 -0
  35. athenaeum_kb-0.2.0/tests/test_toc.py +43 -0
  36. athenaeum_kb-0.2.0/tests/test_vector.py +87 -0
  37. athenaeum_kb-0.2.0/uv.lock +4487 -0
@@ -0,0 +1,18 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "WebSearch",
5
+ "Bash(uv sync:*)",
6
+ "Bash(uv run pytest:*)",
7
+ "Bash(uv run ruff check:*)",
8
+ "Bash(uv run mypy:*)",
9
+ "Bash(uv run python:*)",
10
+ "Bash(uv build:*)",
11
+ "Bash(tree:*)",
12
+ "Bash(python -m pytest tests/ -v)",
13
+ "Bash(python3 -m pytest:*)",
14
+ "Bash(.venv/bin/python -m pytest:*)",
15
+ "Bash(.venv/bin/python:*)"
16
+ ]
17
+ }
18
+ }
@@ -0,0 +1,309 @@
1
+ Metadata-Version: 2.4
2
+ Name: athenaeum-kb
3
+ Version: 0.2.0
4
+ Summary: Tools for intelligent interaction with knowledge bases
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: langchain-chroma>=0.2
8
+ Requires-Dist: langchain-core>=0.3
9
+ Requires-Dist: langchain-openai>=0.3
10
+ Requires-Dist: markitdown>=0.1
11
+ Requires-Dist: pydantic>=2.0
12
+ Requires-Dist: rank-bm25>=0.2.2
13
+ Provides-Extra: all-ocr
14
+ Requires-Dist: docling>=2.0; extra == 'all-ocr'
15
+ Requires-Dist: mistralai>=1.0; extra == 'all-ocr'
16
+ Requires-Dist: pillow>=10.0; extra == 'all-ocr'
17
+ Requires-Dist: torch>=2.0; extra == 'all-ocr'
18
+ Requires-Dist: transformers>=4.40; extra == 'all-ocr'
19
+ Provides-Extra: dev
20
+ Requires-Dist: mypy>=1.10; extra == 'dev'
21
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
22
+ Requires-Dist: pytest>=8.0; extra == 'dev'
23
+ Requires-Dist: ruff>=0.4; extra == 'dev'
24
+ Provides-Extra: docling
25
+ Requires-Dist: docling>=2.0; extra == 'docling'
26
+ Provides-Extra: lighton
27
+ Requires-Dist: pillow>=10.0; extra == 'lighton'
28
+ Requires-Dist: torch>=2.0; extra == 'lighton'
29
+ Requires-Dist: transformers>=4.40; extra == 'lighton'
30
+ Provides-Extra: mistral
31
+ Requires-Dist: mistralai>=1.0; extra == 'mistral'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Athenaeum
35
+
36
+ A Python library that equips AI agents with tools for intelligent interaction with knowledge bases. Athenaeum handles document ingestion, semantic search, and structured content access, making it suitable for agent-based systems, RAG pipelines, and automation workflows.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install athenaeum-kb
42
+ ```
43
+
44
+ ### Optional OCR backends
45
+
46
+ ```bash
47
+ pip install athenaeum-kb[docling] # Docling document converter
48
+ pip install athenaeum-kb[mistral] # Mistral cloud OCR (PDF only)
49
+ pip install athenaeum-kb[lighton] # LightOn local model (PDF + images)
50
+ pip install athenaeum-kb[all-ocr] # All OCR backends
51
+ ```
52
+
53
+ ## Quick start
54
+
55
+ ```python
56
+ from langchain_openai import OpenAIEmbeddings
57
+ from athenaeum import Athenaeum
58
+
59
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
60
+ kb = Athenaeum(embeddings=embeddings)
61
+
62
+ # Load a document
63
+ doc_id = kb.load_doc("report.pdf")
64
+
65
+ # Search across all documents
66
+ hits = kb.search_docs("quarterly revenue", top_k=5)
67
+
68
+ # Search within a specific document
69
+ chunks = kb.search_doc_contents(doc_id, "executive summary")
70
+
71
+ # Read specific lines
72
+ excerpt = kb.read_doc(doc_id, start_line=1, end_line=50)
73
+
74
+ # List all loaded documents
75
+ docs = kb.list_docs()
76
+ ```
77
+
78
+ ## Tools
79
+
80
+ ### `load_doc`
81
+
82
+ Load a document into the knowledge base, automatically extracting its content and metadata and generating embeddings for search.
83
+
84
+ ```python
85
+ load_doc(path: str, tags: set[str] | None = None) -> str
86
+ ```
87
+
88
+ **Parameters:**
89
+ - `path`: Path to the document file
90
+ - `tags`: Optional set of tags to assign to the document
91
+
92
+ **Supported formats:** PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB
93
+
94
+ **Returns:** A document identifier (`doc_id`) for subsequent operations.
95
+
96
+ ### `list_docs`
97
+
98
+ List all documents currently stored in the knowledge base.
99
+
100
+ ```python
101
+ list_docs(tags: set[str] | None = None) -> list[SearchHit]
102
+ ```
103
+
104
+ **Parameters:**
105
+ - `tags`: Optional set of tags to filter by (OR semantics)
106
+
107
+ **Returns:** A list of documents with metadata (id, name, line count, table of contents, tags) and relevance scores.
108
+
109
+ ### `search_docs`
110
+
111
+ Search across all documents in the knowledge base.
112
+
113
+ ```python
114
+ search_docs(
115
+ query: str,
116
+ top_k: int = 10,
117
+ scope: Literal["names", "contents"] = "contents",
118
+ strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
119
+ tags: set[str] | None = None,
120
+ ) -> list[SearchHit]
121
+ ```
122
+
123
+ **Parameters:**
124
+ - `query`: Search query text
125
+ - `top_k`: Maximum number of results (default: 10)
126
+ - `tags`: Optional set of tags to filter by (OR semantics)
127
+ - `scope`: Where to search
128
+ - `"contents"`: Search within document contents (default)
129
+ - `"names"`: Search only document names
130
+ - `strategy`: Search strategy (only applies when scope is `"contents"`)
131
+ - `"hybrid"`: Combines vector and BM25 search (default)
132
+ - `"bm25"`: Keyword-based search only
133
+ - `"vector"`: Semantic similarity search only
134
+
135
+ **Returns:** A ranked list of `SearchHit` objects matching the query.
136
+
137
+ ### `search_doc_contents`
138
+
139
+ Search within a specific document.
140
+
141
+ ```python
142
+ search_doc_contents(
143
+ doc_id: str,
144
+ query: str,
145
+ top_k: int = 5,
146
+ strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
147
+ ) -> list[ContentSearchHit]
148
+ ```
149
+
150
+ **Parameters:**
151
+ - `doc_id`: Document identifier
152
+ - `query`: Search query text
153
+ - `top_k`: Maximum number of results (default: 5)
154
+ - `strategy`: Search strategy (`"hybrid"`, `"bm25"`, or `"vector"`; default: `"hybrid"`)
155
+
156
+ **Returns:** A list of matching content fragments with line ranges and relevance scores.
157
+
158
+ ### `read_doc`
159
+
160
+ Read a specific range of lines from a document.
161
+
162
+ ```python
163
+ read_doc(
164
+ doc_id: str,
165
+ start_line: int = 1,
166
+ end_line: int = 100,
167
+ ) -> Excerpt
168
+ ```
169
+
170
+ **Parameters:**
171
+ - `doc_id`: Document identifier
172
+ - `start_line`: Starting line number (1-indexed, default: 1)
173
+ - `end_line`: Ending line number (1-indexed, inclusive, default: 100)
174
+
175
+ **Returns:** An `Excerpt` containing the requested lines.
176
+
177
+ ## Configuration
178
+
179
+ ```python
180
+ from pathlib import Path
181
+ from athenaeum import AthenaeumConfig
182
+
183
+ config = AthenaeumConfig(
184
+ storage_dir=Path.home() / ".athenaeum", # Where to store documents and indexes
185
+ chunk_size=80, # Lines per chunk
186
+ chunk_overlap=20, # Overlapping lines between chunks
187
+ rrf_k=60, # RRF constant for hybrid search
188
+ default_strategy="hybrid", # Default search strategy
189
+ )
190
+
191
+ kb = Athenaeum(embeddings=embeddings, config=config)
192
+ ```
193
+
194
+ ## OCR backends
195
+
196
+ Athenaeum supports multiple document-to-markdown converters:
197
+
198
+ | Backend | Formats | Notes |
199
+ |---------|---------|-------|
200
+ | **markitdown** (default) | PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB | Included in base install |
201
+ | **docling** | PDF, PPTX, DOCX, XLSX, HTML, MD | `pip install athenaeum-kb[docling]` |
202
+ | **mistral** | PDF | Cloud API, requires `MISTRAL_API_KEY`; `pip install athenaeum-kb[mistral]` |
203
+ | **lighton** | PDF, PNG, JPG, JPEG, TIFF, BMP | Local transformer model, supports GPU; `pip install athenaeum-kb[lighton]` |
204
+
205
+ ```python
206
+ from athenaeum import Athenaeum, get_ocr_provider
207
+
208
+ ocr = get_ocr_provider("docling")
209
+ kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
210
+ ```
211
+
212
+ ### Custom OCR provider
213
+
214
+ ```python
215
+ from athenaeum.ocr import CustomOCR
216
+ from pathlib import Path
217
+
218
+ def my_converter(file_path: Path) -> str:
219
+ return "markdown content"
220
+
221
+ ocr = CustomOCR(fn=my_converter, extensions={".custom"})
222
+ kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
223
+ ```
224
+
225
+ ## Tags
226
+
227
+ Documents can be tagged with free-form labels for filtering. Tags use OR semantics: filtering by `{"a", "b"}` returns documents tagged with either `a` or `b` (or both).
228
+
229
+ ```python
230
+ # Load with tags
231
+ doc_id = kb.load_doc("report.pdf", tags={"finance", "Q4"})
232
+
233
+ # Add/remove tags later
234
+ kb.tag_doc(doc_id, {"important"})
235
+ kb.untag_doc(doc_id, {"Q4"})
236
+
237
+ # List all tags in the knowledge base
238
+ all_tags = kb.list_tags() # {"finance", "important"}
239
+
240
+ # Filter list_docs by tags
241
+ finance_docs = kb.list_docs(tags={"finance"})
242
+
243
+ # Filter search_docs by tags
244
+ hits = kb.search_docs("revenue", tags={"finance"})
245
+ ```
246
+
247
+ ### `tag_doc`
248
+
249
+ Add tags to an existing document.
250
+
251
+ ```python
252
+ tag_doc(doc_id: str, tags: set[str]) -> None
253
+ ```
254
+
255
+ ### `untag_doc`
256
+
257
+ Remove tags from an existing document.
258
+
259
+ ```python
260
+ untag_doc(doc_id: str, tags: set[str]) -> None
261
+ ```
262
+
263
+ ### `list_tags`
264
+
265
+ Return all tags across all documents.
266
+
267
+ ```python
268
+ list_tags() -> set[str]
269
+ ```
270
+
271
+ ## Search strategies
272
+
273
+ - **Hybrid** (default): Combines vector similarity and BM25 keyword search using Reciprocal Rank Fusion (RRF).
274
+ - **Vector**: Semantic similarity search via embeddings (Chroma-backed).
275
+ - **BM25**: Traditional keyword-based ranking using the BM25Okapi algorithm.
276
+
277
+ ## Document ingestion workflow
278
+
279
+ When `load_doc(path)` is called:
280
+
281
+ 1. **Validation** -- verify the file exists and the format is supported.
282
+ 2. **Content extraction** -- convert the file to Markdown using the configured OCR backend.
283
+ 3. **Pre-processing** -- generate metadata, extract a table of contents from headings, and chunk the Markdown with heading-aware boundary snapping.
284
+ 4. **Indexing** -- generate vector embeddings and store them in Chroma; add chunks to the BM25 index.
285
+
286
+ ## Data models
287
+
288
+ | Model | Description |
289
+ |-------|-------------|
290
+ | `Document` | Full document record (id, name, paths, line count, TOC, timestamps) |
291
+ | `SearchHit` | Document-level search result with score and snippet |
292
+ | `ContentSearchHit` | Within-document search result with line range and text |
293
+ | `Excerpt` | Text fragment from `read_doc` |
294
+ | `TOCEntry` | Table of contents entry (title, level, line range) |
295
+ | `ChunkMetadata` | Internal chunk metadata for indexing |
296
+ | `Metadata` | Lightweight id + name pair |
297
+
298
+ ## Development
299
+
300
+ ```bash
301
+ pip install -e ".[dev]"
302
+ pytest
303
+ ruff check src/
304
+ mypy src/
305
+ ```
306
+
307
+ ## License
308
+
309
+ MIT
@@ -0,0 +1,276 @@
1
+ # Athenaeum
2
+
3
+ A Python library that equips AI agents with tools for intelligent interaction with knowledge bases. Athenaeum handles document ingestion, semantic search, and structured content access, making it suitable for agent-based systems, RAG pipelines, and automation workflows.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install athenaeum-kb
9
+ ```
10
+
11
+ ### Optional OCR backends
12
+
13
+ ```bash
14
+ pip install athenaeum-kb[docling] # Docling document converter
15
+ pip install athenaeum-kb[mistral] # Mistral cloud OCR (PDF only)
16
+ pip install athenaeum-kb[lighton] # LightOn local model (PDF + images)
17
+ pip install athenaeum-kb[all-ocr] # All OCR backends
18
+ ```
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from langchain_openai import OpenAIEmbeddings
24
+ from athenaeum import Athenaeum
25
+
26
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
27
+ kb = Athenaeum(embeddings=embeddings)
28
+
29
+ # Load a document
30
+ doc_id = kb.load_doc("report.pdf")
31
+
32
+ # Search across all documents
33
+ hits = kb.search_docs("quarterly revenue", top_k=5)
34
+
35
+ # Search within a specific document
36
+ chunks = kb.search_doc_contents(doc_id, "executive summary")
37
+
38
+ # Read specific lines
39
+ excerpt = kb.read_doc(doc_id, start_line=1, end_line=50)
40
+
41
+ # List all loaded documents
42
+ docs = kb.list_docs()
43
+ ```
44
+
45
+ ## Tools
46
+
47
+ ### `load_doc`
48
+
49
+ Load a document into the knowledge base, automatically extracting its content and metadata and generating embeddings for search.
50
+
51
+ ```python
52
+ load_doc(path: str, tags: set[str] | None = None) -> str
53
+ ```
54
+
55
+ **Parameters:**
56
+ - `path`: Path to the document file
57
+ - `tags`: Optional set of tags to assign to the document
58
+
59
+ **Supported formats:** PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB
60
+
61
+ **Returns:** A document identifier (`doc_id`) for subsequent operations.
62
+
63
+ ### `list_docs`
64
+
65
+ List all documents currently stored in the knowledge base.
66
+
67
+ ```python
68
+ list_docs(tags: set[str] | None = None) -> list[SearchHit]
69
+ ```
70
+
71
+ **Parameters:**
72
+ - `tags`: Optional set of tags to filter by (OR semantics)
73
+
74
+ **Returns:** A list of documents with metadata (id, name, line count, table of contents, tags) and relevance scores.
75
+
76
+ ### `search_docs`
77
+
78
+ Search across all documents in the knowledge base.
79
+
80
+ ```python
81
+ search_docs(
82
+ query: str,
83
+ top_k: int = 10,
84
+ scope: Literal["names", "contents"] = "contents",
85
+ strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
86
+ tags: set[str] | None = None,
87
+ ) -> list[SearchHit]
88
+ ```
89
+
90
+ **Parameters:**
91
+ - `query`: Search query text
92
+ - `top_k`: Maximum number of results (default: 10)
93
+ - `tags`: Optional set of tags to filter by (OR semantics)
94
+ - `scope`: Where to search
95
+ - `"contents"`: Search within document contents (default)
96
+ - `"names"`: Search only document names
97
+ - `strategy`: Search strategy (only applies when scope is `"contents"`)
98
+ - `"hybrid"`: Combines vector and BM25 search (default)
99
+ - `"bm25"`: Keyword-based search only
100
+ - `"vector"`: Semantic similarity search only
101
+
102
+ **Returns:** A ranked list of `SearchHit` objects matching the query.
103
+
104
+ ### `search_doc_contents`
105
+
106
+ Search within a specific document.
107
+
108
+ ```python
109
+ search_doc_contents(
110
+ doc_id: str,
111
+ query: str,
112
+ top_k: int = 5,
113
+ strategy: Literal["hybrid", "bm25", "vector"] = "hybrid",
114
+ ) -> list[ContentSearchHit]
115
+ ```
116
+
117
+ **Parameters:**
118
+ - `doc_id`: Document identifier
119
+ - `query`: Search query text
120
+ - `top_k`: Maximum number of results (default: 5)
121
+ - `strategy`: Search strategy (`"hybrid"`, `"bm25"`, or `"vector"`; default: `"hybrid"`)
122
+
123
+ **Returns:** A list of matching content fragments with line ranges and relevance scores.
124
+
125
+ ### `read_doc`
126
+
127
+ Read a specific range of lines from a document.
128
+
129
+ ```python
130
+ read_doc(
131
+ doc_id: str,
132
+ start_line: int = 1,
133
+ end_line: int = 100,
134
+ ) -> Excerpt
135
+ ```
136
+
137
+ **Parameters:**
138
+ - `doc_id`: Document identifier
139
+ - `start_line`: Starting line number (1-indexed, default: 1)
140
+ - `end_line`: Ending line number (1-indexed, inclusive, default: 100)
141
+
142
+ **Returns:** An `Excerpt` containing the requested lines.
143
+
144
+ ## Configuration
145
+
146
+ ```python
147
+ from pathlib import Path
148
+ from athenaeum import AthenaeumConfig
149
+
150
+ config = AthenaeumConfig(
151
+ storage_dir=Path.home() / ".athenaeum", # Where to store documents and indexes
152
+ chunk_size=80, # Lines per chunk
153
+ chunk_overlap=20, # Overlapping lines between chunks
154
+ rrf_k=60, # RRF constant for hybrid search
155
+ default_strategy="hybrid", # Default search strategy
156
+ )
157
+
158
+ kb = Athenaeum(embeddings=embeddings, config=config)
159
+ ```
160
+
161
+ ## OCR backends
162
+
163
+ Athenaeum supports multiple document-to-markdown converters:
164
+
165
+ | Backend | Formats | Notes |
166
+ |---------|---------|-------|
167
+ | **markitdown** (default) | PDF, PPTX, DOCX, XLSX, JSON, CSV, TXT, MD, HTML, XML, RTF, EPUB | Included in base install |
168
+ | **docling** | PDF, PPTX, DOCX, XLSX, HTML, MD | `pip install athenaeum-kb[docling]` |
169
+ | **mistral** | PDF | Cloud API, requires `MISTRAL_API_KEY`; `pip install athenaeum-kb[mistral]` |
170
+ | **lighton** | PDF, PNG, JPG, JPEG, TIFF, BMP | Local transformer model, supports GPU; `pip install athenaeum-kb[lighton]` |
171
+
172
+ ```python
173
+ from athenaeum import Athenaeum, get_ocr_provider
174
+
175
+ ocr = get_ocr_provider("docling")
176
+ kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
177
+ ```
178
+
179
+ ### Custom OCR provider
180
+
181
+ ```python
182
+ from athenaeum.ocr import CustomOCR
183
+ from pathlib import Path
184
+
185
+ def my_converter(file_path: Path) -> str:
186
+ return "markdown content"
187
+
188
+ ocr = CustomOCR(fn=my_converter, extensions={".custom"})
189
+ kb = Athenaeum(embeddings=embeddings, ocr_provider=ocr)
190
+ ```
191
+
192
+ ## Tags
193
+
194
+ Documents can be tagged with free-form labels for filtering. Tags use OR semantics: filtering by `{"a", "b"}` returns documents tagged with either `a` or `b` (or both).
195
+
196
+ ```python
197
+ # Load with tags
198
+ doc_id = kb.load_doc("report.pdf", tags={"finance", "Q4"})
199
+
200
+ # Add/remove tags later
201
+ kb.tag_doc(doc_id, {"important"})
202
+ kb.untag_doc(doc_id, {"Q4"})
203
+
204
+ # List all tags in the knowledge base
205
+ all_tags = kb.list_tags() # {"finance", "important"}
206
+
207
+ # Filter list_docs by tags
208
+ finance_docs = kb.list_docs(tags={"finance"})
209
+
210
+ # Filter search_docs by tags
211
+ hits = kb.search_docs("revenue", tags={"finance"})
212
+ ```
213
+
214
+ ### `tag_doc`
215
+
216
+ Add tags to an existing document.
217
+
218
+ ```python
219
+ tag_doc(doc_id: str, tags: set[str]) -> None
220
+ ```
221
+
222
+ ### `untag_doc`
223
+
224
+ Remove tags from an existing document.
225
+
226
+ ```python
227
+ untag_doc(doc_id: str, tags: set[str]) -> None
228
+ ```
229
+
230
+ ### `list_tags`
231
+
232
+ Return all tags across all documents.
233
+
234
+ ```python
235
+ list_tags() -> set[str]
236
+ ```
237
+
238
+ ## Search strategies
239
+
240
+ - **Hybrid** (default): Combines vector similarity and BM25 keyword search using Reciprocal Rank Fusion (RRF).
241
+ - **Vector**: Semantic similarity search via embeddings (Chroma-backed).
242
+ - **BM25**: Traditional keyword-based ranking using the BM25Okapi algorithm.
243
+
244
+ ## Document ingestion workflow
245
+
246
+ When `load_doc(path)` is called:
247
+
248
+ 1. **Validation** -- verify the file exists and the format is supported.
249
+ 2. **Content extraction** -- convert the file to Markdown using the configured OCR backend.
250
+ 3. **Pre-processing** -- generate metadata, extract a table of contents from headings, and chunk the Markdown with heading-aware boundary snapping.
251
+ 4. **Indexing** -- generate vector embeddings and store them in Chroma; add chunks to the BM25 index.
252
+
253
+ ## Data models
254
+
255
+ | Model | Description |
256
+ |-------|-------------|
257
+ | `Document` | Full document record (id, name, paths, line count, TOC, timestamps) |
258
+ | `SearchHit` | Document-level search result with score and snippet |
259
+ | `ContentSearchHit` | Within-document search result with line range and text |
260
+ | `Excerpt` | Text fragment from `read_doc` |
261
+ | `TOCEntry` | Table of contents entry (title, level, line range) |
262
+ | `ChunkMetadata` | Internal chunk metadata for indexing |
263
+ | `Metadata` | Lightweight id + name pair |
264
+
265
+ ## Development
266
+
267
+ ```bash
268
+ pip install athenaeum-kb[dev]
269
+ pytest
270
+ ruff check src/
271
+ mypy src/
272
+ ```
273
+
274
+ ## License
275
+
276
+ MIT
@@ -0,0 +1,69 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "athenaeum-kb"
7
+ version = "0.2.0"
8
+ description = "Tools for intelligent interaction with knowledge bases"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
+ dependencies = [
13
+ "pydantic>=2.0",
14
+ "langchain-core>=0.3",
15
+ "langchain-chroma>=0.2",
16
+ "langchain-openai>=0.3",
17
+ "rank-bm25>=0.2.2",
18
+ "markitdown>=0.1",
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ docling = ["docling>=2.0"]
23
+ mistral = ["mistralai>=1.0"]
24
+ lighton = ["transformers>=4.40", "torch>=2.0", "Pillow>=10.0"]
25
+ all-ocr = [
26
+ "athenaeum-kb[docling]",
27
+ "athenaeum-kb[mistral]",
28
+ "athenaeum-kb[lighton]",
29
+ ]
30
+ dev = [
31
+ "pytest>=8.0",
32
+ "pytest-asyncio>=0.23",
33
+ "ruff>=0.4",
34
+ "mypy>=1.10",
35
+ ]
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["src/athenaeum"]
39
+
40
+ [tool.ruff]
41
+ target-version = "py311"
42
+ line-length = 99
43
+
44
+ [tool.ruff.lint]
45
+ select = ["E", "F", "I", "UP", "B", "SIM"]
46
+
47
+ [tool.mypy]
48
+ python_version = "3.11"
49
+ strict = true
50
+ warn_return_any = true
51
+ warn_unused_configs = true
52
+
53
+ [[tool.mypy.overrides]]
54
+ module = [
55
+ "rank_bm25",
56
+ "rank_bm25.*",
57
+ "mistralai",
58
+ "mistralai.*",
59
+ "transformers",
60
+ "transformers.*",
61
+ "PIL",
62
+ "PIL.*",
63
+ "docling",
64
+ "docling.*",
65
+ ]
66
+ ignore_missing_imports = true
67
+
68
+ [tool.pytest.ini_options]
69
+ testpaths = ["tests"]
@@ -0,0 +1,28 @@
1
+ """Athenaeum - Tools for intelligent interaction with knowledge bases."""
2
+
3
+ from athenaeum.athenaeum import Athenaeum
4
+ from athenaeum.config import AthenaeumConfig
5
+ from athenaeum.models import (
6
+ ChunkMetadata,
7
+ ContentSearchHit,
8
+ Document,
9
+ Excerpt,
10
+ Metadata,
11
+ SearchHit,
12
+ TOCEntry,
13
+ )
14
+ from athenaeum.ocr import OCRProvider, get_ocr_provider
15
+
16
+ __all__ = [
17
+ "Athenaeum",
18
+ "AthenaeumConfig",
19
+ "ChunkMetadata",
20
+ "ContentSearchHit",
21
+ "Document",
22
+ "Excerpt",
23
+ "Metadata",
24
+ "OCRProvider",
25
+ "SearchHit",
26
+ "TOCEntry",
27
+ "get_ocr_provider",
28
+ ]