haiku.rag 0.8.0__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/PKG-INFO +1 -1
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/benchmarks.md +2 -2
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/cli.md +4 -2
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/configuration.md +32 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/python.md +8 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/pyproject.toml +2 -1
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/config.py +4 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/engine.py +15 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/chunk.py +24 -1
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/document.py +48 -28
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/utils.py +54 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/generate_benchmark_db.py +3 -1
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_client.py +91 -95
- haiku_rag-0.8.1/tests/test_preprocessor.py +71 -0
- haiku_rag-0.8.1/tests/test_versioning.py +94 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/uv.lock +175 -1
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.github/FUNDING.yml +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.github/workflows/build-docs.yml +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.github/workflows/build-publish.yml +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.gitignore +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.pre-commit-config.yaml +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.python-version +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/LICENSE +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/README.md +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/index.md +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/installation.md +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/mcp.md +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/server.md +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/mkdocs.yml +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/app.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/chunker.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/cli.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/client.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/base.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/vllm.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/logging.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/mcp.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/migration.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/monitor.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/qa/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/qa/agent.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/qa/prompts.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reader.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/base.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/cohere.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/mxbai.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/vllm.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/models/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/models/chunk.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/models/document.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/settings.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/upgrades/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/__init__.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/conftest.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/llm_judge.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_app.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_chunk.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_chunker.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_cli.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_document.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_embedder.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_lancedb_connection.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_monitor.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_qa.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_reader.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_rebuild.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_reranker.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_search.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_settings.py +0 -0
- {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_utils.py +0 -0
|
@@ -16,8 +16,8 @@ The recall obtained is ~0.79 for matching in the top result, raising to ~0.91 fo
|
|
|
16
16
|
|---------------------------------------|-------------------|-------------------|------------------------|
|
|
17
17
|
| Ollama / `mxbai-embed-large` | 0.79 | 0.91 | None |
|
|
18
18
|
| Ollama / `mxbai-embed-large` | 0.90 | 0.95 | `mxbai-rerank-base-v2` |
|
|
19
|
-
|
|
20
|
-
| OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
|
|
19
|
+
| Ollama / `nomic-embed-text-v1.5` | 0.74 | 0.90 | None |
|
|
20
|
+
<!-- | OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
|
|
21
21
|
| OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
|
|
22
22
|
| OpenAI / `text-embeddings-3-small` | 0.83 | 0.90 | Cohere / `rerank-v3.5` | -->
|
|
23
23
|
|
|
@@ -36,8 +36,10 @@ haiku-rag add-src https://example.com/article.html
|
|
|
36
36
|
```
|
|
37
37
|
|
|
38
38
|
!!! note
|
|
39
|
-
As you add documents to `haiku.rag` the database keeps growing. By default,
|
|
40
|
-
of your data.
|
|
39
|
+
As you add documents to `haiku.rag` the database keeps growing. By default, LanceDB supports versioning
|
|
40
|
+
of your data. Create/update operations are atomic‑feeling: if anything fails during chunking or embedding,
|
|
41
|
+
the database rolls back to the pre‑operation snapshot using LanceDB table versioning. You can optimize and
|
|
42
|
+
compact the database by running the [vacuum](#vacuum-optimize-and-cleanup) command.
|
|
41
43
|
|
|
42
44
|
### Get Document
|
|
43
45
|
|
|
@@ -223,3 +223,35 @@ CHUNK_SIZE=256
|
|
|
223
223
|
# into single chunks with continuous content to eliminate duplication
|
|
224
224
|
CONTEXT_CHUNK_RADIUS=0
|
|
225
225
|
```
|
|
226
|
+
|
|
227
|
+
#### Markdown Preprocessor
|
|
228
|
+
|
|
229
|
+
Optionally preprocess Markdown before chunking by pointing to a callable that receives and returns Markdown text. This is useful for normalizing content, stripping boilerplate, or applying custom transformations before chunk boundaries are computed.
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
# A callable path in one of these formats:
|
|
233
|
+
# - package.module:func
|
|
234
|
+
# - package.module.func
|
|
235
|
+
# - /abs/or/relative/path/to/file.py:func
|
|
236
|
+
MARKDOWN_PREPROCESSOR="my_pkg.preprocess:clean_md"
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
!!! note
|
|
240
|
+
- The function signature should be `def clean_md(text: str) -> str` or `async def clean_md(text: str) -> str`.
|
|
241
|
+
- If the function raises or returns a non-string, haiku.rag logs a warning and proceeds without preprocessing.
|
|
242
|
+
- The preprocessor affects only the chunking pipeline. The stored document content remains unchanged.
|
|
243
|
+
|
|
244
|
+
Example implementation:
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
# my_pkg/preprocess.py
|
|
248
|
+
def clean_md(text: str) -> str:
|
|
249
|
+
# strip HTML comments and collapse multiple blank lines
|
|
250
|
+
lines = [line for line in text.splitlines() if not line.strip().startswith("<!--")]
|
|
251
|
+
out = []
|
|
252
|
+
for line in lines:
|
|
253
|
+
if line.strip() == "" and (out and out[-1] == ""):
|
|
254
|
+
continue
|
|
255
|
+
out.append(line)
|
|
256
|
+
return "\n".join(out)
|
|
257
|
+
```
|
|
@@ -109,6 +109,14 @@ await client.vacuum()
|
|
|
109
109
|
|
|
110
110
|
This compacts tables and removes historical versions to keep disk usage in check. It’s safe to run anytime, for example after bulk imports or periodically in long‑running apps.
|
|
111
111
|
|
|
112
|
+
### Atomic Writes and Rollback
|
|
113
|
+
|
|
114
|
+
Document create and update operations take a snapshot of table versions before any write and automatically roll back to that snapshot if something fails (for example, during chunking or embedding). This restores both the `documents` and `chunks` tables to their pre‑operation state using LanceDB’s table versioning.
|
|
115
|
+
|
|
116
|
+
- Applies to: `create_document(...)`, `create_document_from_source(...)`, `update_document(...)`, and internal rebuild/update flows.
|
|
117
|
+
- Scope: Both document rows and all associated chunks are rolled back together.
|
|
118
|
+
- Vacuum: Running `vacuum()` later prunes old versions for disk efficiency; rollbacks occur immediately during the failing operation and are not impacted.
|
|
119
|
+
|
|
112
120
|
## Searching Documents
|
|
113
121
|
|
|
114
122
|
The search method performs native hybrid search (vector + full-text) using LanceDB with optional reranking for improved relevance:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "haiku.rag"
|
|
3
|
-
version = "0.8.0"
|
|
3
|
+
version = "0.8.1"
|
|
4
4
|
description = "Retrieval Augmented Generation (RAG) with LanceDB"
|
|
5
5
|
authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
|
|
6
6
|
license = { text = "MIT" }
|
|
@@ -53,6 +53,7 @@ packages = ["src/haiku"]
|
|
|
53
53
|
[dependency-groups]
|
|
54
54
|
dev = [
|
|
55
55
|
"datasets>=3.6.0",
|
|
56
|
+
"logfire>=4.6.0",
|
|
56
57
|
"mkdocs>=1.6.1",
|
|
57
58
|
"mkdocs-material>=9.6.14",
|
|
58
59
|
"pre-commit>=4.2.0",
|
|
@@ -32,6 +32,10 @@ class AppConfig(BaseModel):
|
|
|
32
32
|
CHUNK_SIZE: int = 256
|
|
33
33
|
CONTEXT_CHUNK_RADIUS: int = 0
|
|
34
34
|
|
|
35
|
+
# Optional dotted path or file path to a callable that preprocesses
|
|
36
|
+
# markdown content before chunking. Examples:
|
|
37
|
+
MARKDOWN_PREPROCESSOR: str = ""
|
|
38
|
+
|
|
35
39
|
OLLAMA_BASE_URL: str = "http://localhost:11434"
|
|
36
40
|
VLLM_EMBEDDINGS_BASE_URL: str = ""
|
|
37
41
|
VLLM_RERANK_BASE_URL: str = ""
|
|
@@ -209,6 +209,21 @@ class Store:
|
|
|
209
209
|
# LanceDB connections are automatically managed
|
|
210
210
|
pass
|
|
211
211
|
|
|
212
|
+
def current_table_versions(self) -> dict[str, int]:
|
|
213
|
+
"""Capture current versions of key tables for rollback using LanceDB's API."""
|
|
214
|
+
return {
|
|
215
|
+
"documents": int(self.documents_table.version),
|
|
216
|
+
"chunks": int(self.chunks_table.version),
|
|
217
|
+
"settings": int(self.settings_table.version),
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
def restore_table_versions(self, versions: dict[str, int]) -> bool:
|
|
221
|
+
"""Restore tables to the provided versions using LanceDB's API."""
|
|
222
|
+
self.documents_table.restore(int(versions["documents"]))
|
|
223
|
+
self.chunks_table.restore(int(versions["chunks"]))
|
|
224
|
+
self.settings_table.restore(int(versions["settings"]))
|
|
225
|
+
return True
|
|
226
|
+
|
|
212
227
|
@property
|
|
213
228
|
def _connection(self):
|
|
214
229
|
"""Compatibility property for repositories expecting _connection."""
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import inspect
|
|
2
3
|
import json
|
|
3
4
|
import logging
|
|
4
5
|
from uuid import uuid4
|
|
@@ -11,6 +12,7 @@ from haiku.rag.config import Config
|
|
|
11
12
|
from haiku.rag.embeddings import get_embedder
|
|
12
13
|
from haiku.rag.store.engine import DocumentRecord, Store
|
|
13
14
|
from haiku.rag.store.models.chunk import Chunk
|
|
15
|
+
from haiku.rag.utils import load_callable, text_to_docling_document
|
|
14
16
|
|
|
15
17
|
logger = logging.getLogger(__name__)
|
|
16
18
|
|
|
@@ -152,7 +154,28 @@ class ChunkRepository:
|
|
|
152
154
|
self, document_id: str, document: DoclingDocument
|
|
153
155
|
) -> list[Chunk]:
|
|
154
156
|
"""Create chunks and embeddings for a document from DoclingDocument."""
|
|
155
|
-
|
|
157
|
+
# Optionally preprocess markdown before chunking
|
|
158
|
+
processed_document = document
|
|
159
|
+
preprocessor_path = Config.MARKDOWN_PREPROCESSOR
|
|
160
|
+
if preprocessor_path:
|
|
161
|
+
try:
|
|
162
|
+
pre_fn = load_callable(preprocessor_path)
|
|
163
|
+
markdown = document.export_to_markdown()
|
|
164
|
+
result = pre_fn(markdown)
|
|
165
|
+
if inspect.isawaitable(result):
|
|
166
|
+
result = await result # type: ignore[assignment]
|
|
167
|
+
processed_markdown = result
|
|
168
|
+
if not isinstance(processed_markdown, str):
|
|
169
|
+
raise ValueError("Preprocessor must return a markdown string")
|
|
170
|
+
processed_document = text_to_docling_document(
|
|
171
|
+
processed_markdown, name="content.md"
|
|
172
|
+
)
|
|
173
|
+
except Exception as e:
|
|
174
|
+
logger.warning(
|
|
175
|
+
f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
chunk_texts = await chunker.chunk(processed_document)
|
|
156
179
|
|
|
157
180
|
embeddings = await self.embedder.embed(chunk_texts)
|
|
158
181
|
|
|
@@ -171,44 +171,64 @@ class DocumentRepository:
|
|
|
171
171
|
chunks: list["Chunk"] | None = None,
|
|
172
172
|
) -> Document:
|
|
173
173
|
"""Create a document with its chunks and embeddings."""
|
|
174
|
+
# Snapshot table versions for versioned rollback (if supported)
|
|
175
|
+
versions = self.store.current_table_versions()
|
|
176
|
+
|
|
174
177
|
# Create the document
|
|
175
178
|
created_doc = await self.create(entity)
|
|
176
179
|
|
|
177
|
-
#
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
180
|
+
# Attempt to create chunks; on failure, prefer version rollback
|
|
181
|
+
try:
|
|
182
|
+
# Create chunks if not provided
|
|
183
|
+
if chunks is None:
|
|
184
|
+
assert created_doc.id is not None, (
|
|
185
|
+
"Document ID should not be None after creation"
|
|
186
|
+
)
|
|
187
|
+
await self.chunk_repository.create_chunks_for_document(
|
|
188
|
+
created_doc.id, docling_document
|
|
189
|
+
)
|
|
190
|
+
else:
|
|
191
|
+
# Use provided chunks, set order from list position
|
|
192
|
+
assert created_doc.id is not None, (
|
|
193
|
+
"Document ID should not be None after creation"
|
|
194
|
+
)
|
|
195
|
+
for order, chunk in enumerate(chunks):
|
|
196
|
+
chunk.document_id = created_doc.id
|
|
197
|
+
chunk.metadata["order"] = order
|
|
198
|
+
await self.chunk_repository.create(chunk)
|
|
199
|
+
|
|
200
|
+
return created_doc
|
|
201
|
+
except Exception:
|
|
202
|
+
# Roll back to the captured versions and re-raise
|
|
203
|
+
self.store.restore_table_versions(versions)
|
|
204
|
+
raise
|
|
196
205
|
|
|
197
206
|
async def _update_with_docling(
|
|
198
207
|
self, entity: Document, docling_document: DoclingDocument
|
|
199
208
|
) -> Document:
|
|
200
209
|
"""Update a document and regenerate its chunks."""
|
|
201
|
-
# Delete existing chunks
|
|
202
210
|
assert entity.id is not None, "Document ID is required for update"
|
|
211
|
+
|
|
212
|
+
# Snapshot table versions for versioned rollback
|
|
213
|
+
versions = self.store.current_table_versions()
|
|
214
|
+
|
|
215
|
+
# Delete existing chunks before writing new ones
|
|
203
216
|
await self.chunk_repository.delete_by_document_id(entity.id)
|
|
204
217
|
|
|
205
|
-
|
|
206
|
-
|
|
218
|
+
try:
|
|
219
|
+
# Update the document
|
|
220
|
+
updated_doc = await self.update(entity)
|
|
207
221
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
222
|
+
# Create new chunks
|
|
223
|
+
assert updated_doc.id is not None, (
|
|
224
|
+
"Document ID should not be None after update"
|
|
225
|
+
)
|
|
226
|
+
await self.chunk_repository.create_chunks_for_document(
|
|
227
|
+
updated_doc.id, docling_document
|
|
228
|
+
)
|
|
213
229
|
|
|
214
|
-
|
|
230
|
+
return updated_doc
|
|
231
|
+
except Exception:
|
|
232
|
+
# Roll back to the captured versions and re-raise
|
|
233
|
+
self.store.restore_table_versions(versions)
|
|
234
|
+
raise
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import importlib
|
|
3
|
+
import importlib.util
|
|
2
4
|
import sys
|
|
3
5
|
from collections.abc import Callable
|
|
4
6
|
from functools import wraps
|
|
5
7
|
from importlib import metadata
|
|
6
8
|
from io import BytesIO
|
|
7
9
|
from pathlib import Path
|
|
10
|
+
from types import ModuleType
|
|
8
11
|
|
|
9
12
|
import httpx
|
|
10
13
|
from docling.document_converter import DocumentConverter
|
|
@@ -106,3 +109,54 @@ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocu
|
|
|
106
109
|
converter = DocumentConverter()
|
|
107
110
|
result = converter.convert(doc_stream)
|
|
108
111
|
return result.document
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def load_callable(path: str):
|
|
115
|
+
"""Load a callable from a dotted path or file path.
|
|
116
|
+
|
|
117
|
+
Supported formats:
|
|
118
|
+
- "package.module:func" or "package.module.func"
|
|
119
|
+
- "path/to/file.py:func"
|
|
120
|
+
|
|
121
|
+
Returns the loaded callable. Raises ValueError on failure.
|
|
122
|
+
"""
|
|
123
|
+
if not path:
|
|
124
|
+
raise ValueError("Empty callable path provided")
|
|
125
|
+
|
|
126
|
+
module_part = None
|
|
127
|
+
func_name = None
|
|
128
|
+
|
|
129
|
+
if ":" in path:
|
|
130
|
+
module_part, func_name = path.split(":", 1)
|
|
131
|
+
else:
|
|
132
|
+
# split by last dot for module.attr
|
|
133
|
+
if "." in path:
|
|
134
|
+
module_part, func_name = path.rsplit(".", 1)
|
|
135
|
+
else:
|
|
136
|
+
raise ValueError(
|
|
137
|
+
"Invalid callable path format. Use 'module:func' or 'module.func' or 'file.py:func'."
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
# Try file path first
|
|
141
|
+
mod: ModuleType | None = None
|
|
142
|
+
module_path = Path(module_part)
|
|
143
|
+
if module_path.suffix == ".py" and module_path.exists():
|
|
144
|
+
spec = importlib.util.spec_from_file_location(module_path.stem, module_path)
|
|
145
|
+
if spec and spec.loader:
|
|
146
|
+
mod = importlib.util.module_from_spec(spec)
|
|
147
|
+
spec.loader.exec_module(mod)
|
|
148
|
+
else:
|
|
149
|
+
# Import as a module path
|
|
150
|
+
try:
|
|
151
|
+
mod = importlib.import_module(module_part)
|
|
152
|
+
except Exception as e:
|
|
153
|
+
raise ValueError(f"Failed to import module '{module_part}': {e}")
|
|
154
|
+
|
|
155
|
+
if not hasattr(mod, func_name):
|
|
156
|
+
raise ValueError(f"Callable '{func_name}' not found in module '{module_part}'")
|
|
157
|
+
func = getattr(mod, func_name)
|
|
158
|
+
if not callable(func):
|
|
159
|
+
raise ValueError(
|
|
160
|
+
f"Attribute '{func_name}' in module '{module_part}' is not callable"
|
|
161
|
+
)
|
|
162
|
+
return func
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
|
+
import logfire
|
|
4
5
|
from datasets import Dataset, load_dataset
|
|
5
6
|
from llm_judge import LLMJudge
|
|
6
7
|
from rich.console import Console
|
|
@@ -11,6 +12,8 @@ from haiku.rag.client import HaikuRAG
|
|
|
11
12
|
from haiku.rag.logging import configure_cli_logging
|
|
12
13
|
from haiku.rag.qa import get_qa_agent
|
|
13
14
|
|
|
15
|
+
logfire.configure()
|
|
16
|
+
logfire.instrument_pydantic_ai()
|
|
14
17
|
configure_cli_logging()
|
|
15
18
|
console = Console()
|
|
16
19
|
|
|
@@ -119,7 +122,6 @@ async def run_qa_benchmark(k: int | None = None):
|
|
|
119
122
|
|
|
120
123
|
async with HaikuRAG(db_path) as rag:
|
|
121
124
|
qa = get_qa_agent(rag)
|
|
122
|
-
|
|
123
125
|
for doc in corpus:
|
|
124
126
|
question = doc["question"] # type: ignore
|
|
125
127
|
expected_answer = doc["answer"] # type: ignore
|
|
@@ -526,121 +526,117 @@ async def test_client_ask_with_cite(temp_db_path):
|
|
|
526
526
|
@pytest.mark.asyncio
|
|
527
527
|
async def test_client_expand_context(temp_db_path):
|
|
528
528
|
"""Test expanding search results with adjacent chunks."""
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
Chunk(content="Chunk 4 content", metadata={"order": 4}),
|
|
539
|
-
]
|
|
540
|
-
|
|
541
|
-
doc = await client.create_document(
|
|
542
|
-
content="Full document content",
|
|
543
|
-
uri="test_doc.txt",
|
|
544
|
-
chunks=manual_chunks,
|
|
545
|
-
)
|
|
529
|
+
async with HaikuRAG(temp_db_path) as client:
|
|
530
|
+
# Create chunks manually
|
|
531
|
+
manual_chunks = [
|
|
532
|
+
Chunk(content="Chunk 0 content", metadata={"order": 0}),
|
|
533
|
+
Chunk(content="Chunk 1 content", metadata={"order": 1}),
|
|
534
|
+
Chunk(content="Chunk 2 content", metadata={"order": 2}),
|
|
535
|
+
Chunk(content="Chunk 3 content", metadata={"order": 3}),
|
|
536
|
+
Chunk(content="Chunk 4 content", metadata={"order": 4}),
|
|
537
|
+
]
|
|
546
538
|
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
539
|
+
doc = await client.create_document(
|
|
540
|
+
content="Full document content",
|
|
541
|
+
uri="test_doc.txt",
|
|
542
|
+
chunks=manual_chunks,
|
|
543
|
+
)
|
|
551
544
|
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
545
|
+
# Get all chunks for the document
|
|
546
|
+
assert doc.id is not None
|
|
547
|
+
chunks = await client.chunk_repository.get_by_document_id(doc.id)
|
|
548
|
+
assert len(chunks) == 5
|
|
555
549
|
|
|
556
|
-
|
|
557
|
-
|
|
550
|
+
# Find the middle chunk (order=2)
|
|
551
|
+
middle_chunk = next(c for c in chunks if c.metadata.get("order") == 2)
|
|
552
|
+
search_results = [(middle_chunk, 0.8)]
|
|
558
553
|
|
|
559
|
-
|
|
560
|
-
|
|
554
|
+
# Test expand_context with radius=2
|
|
555
|
+
expanded_results = await client.expand_context(search_results, radius=2)
|
|
561
556
|
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
assert score == 0.8
|
|
565
|
-
assert "Chunk 2 content" in expanded_chunk.content
|
|
557
|
+
assert len(expanded_results) == 1
|
|
558
|
+
expanded_chunk, score = expanded_results[0]
|
|
566
559
|
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
560
|
+
# Check that the expanded chunk has combined content
|
|
561
|
+
assert expanded_chunk.id == middle_chunk.id
|
|
562
|
+
assert score == 0.8
|
|
563
|
+
assert "Chunk 2 content" in expanded_chunk.content
|
|
564
|
+
|
|
565
|
+
# Should include all chunks (radius=2 from chunk 2 = chunks 0,1,2,3,4)
|
|
566
|
+
assert "Chunk 0 content" in expanded_chunk.content
|
|
567
|
+
assert "Chunk 1 content" in expanded_chunk.content
|
|
568
|
+
assert "Chunk 2 content" in expanded_chunk.content
|
|
569
|
+
assert "Chunk 3 content" in expanded_chunk.content
|
|
570
|
+
assert "Chunk 4 content" in expanded_chunk.content
|
|
573
571
|
|
|
574
572
|
|
|
575
573
|
@pytest.mark.asyncio
|
|
576
574
|
async def test_client_expand_context_radius_zero(temp_db_path):
|
|
577
575
|
"""Test expand_context with radius 0 returns original results."""
|
|
578
|
-
with
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
chunks = await client.chunk_repository.get_by_document_id(doc.id)
|
|
576
|
+
async with HaikuRAG(temp_db_path) as client:
|
|
577
|
+
# Create a simple document
|
|
578
|
+
doc = await client.create_document(content="Simple test content")
|
|
579
|
+
assert doc.id is not None
|
|
580
|
+
chunks = await client.chunk_repository.get_by_document_id(doc.id)
|
|
584
581
|
|
|
585
|
-
|
|
586
|
-
|
|
582
|
+
search_results = [(chunks[0], 0.9)]
|
|
583
|
+
expanded_results = await client.expand_context(search_results, radius=0)
|
|
587
584
|
|
|
588
|
-
|
|
589
|
-
|
|
585
|
+
# Should return exactly the same results
|
|
586
|
+
assert expanded_results == search_results
|
|
590
587
|
|
|
591
588
|
|
|
592
589
|
@pytest.mark.asyncio
|
|
593
590
|
async def test_client_expand_context_multiple_chunks(temp_db_path):
|
|
594
591
|
"""Test expand_context with multiple search results."""
|
|
595
|
-
with
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
)
|
|
592
|
+
async with HaikuRAG(temp_db_path) as client:
|
|
593
|
+
# Create first document with manual chunks
|
|
594
|
+
doc1_chunks = [
|
|
595
|
+
Chunk(content="Doc1 Part A", metadata={"order": 0}),
|
|
596
|
+
Chunk(content="Doc1 Part B", metadata={"order": 1}),
|
|
597
|
+
Chunk(content="Doc1 Part C", metadata={"order": 2}),
|
|
598
|
+
]
|
|
599
|
+
doc1 = await client.create_document(
|
|
600
|
+
content="Doc1 content", uri="doc1.txt", chunks=doc1_chunks
|
|
601
|
+
)
|
|
606
602
|
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
603
|
+
# Create second document with manual chunks
|
|
604
|
+
doc2_chunks = [
|
|
605
|
+
Chunk(content="Doc2 Section X", metadata={"order": 0}),
|
|
606
|
+
Chunk(content="Doc2 Section Y", metadata={"order": 1}),
|
|
607
|
+
]
|
|
608
|
+
doc2 = await client.create_document(
|
|
609
|
+
content="Doc2 content", uri="doc2.txt", chunks=doc2_chunks
|
|
610
|
+
)
|
|
615
611
|
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
612
|
+
assert doc1.id is not None
|
|
613
|
+
assert doc2.id is not None
|
|
614
|
+
chunks1 = await client.chunk_repository.get_by_document_id(doc1.id)
|
|
615
|
+
chunks2 = await client.chunk_repository.get_by_document_id(doc2.id)
|
|
616
|
+
|
|
617
|
+
# Get middle chunk from doc1 (order=1) and first chunk from doc2 (order=0)
|
|
618
|
+
chunk1 = next(c for c in chunks1 if c.metadata.get("order") == 1)
|
|
619
|
+
chunk2 = next(c for c in chunks2 if c.metadata.get("order") == 0)
|
|
620
|
+
|
|
621
|
+
search_results = [(chunk1, 0.8), (chunk2, 0.7)]
|
|
622
|
+
expanded_results = await client.expand_context(search_results, radius=1)
|
|
623
|
+
|
|
624
|
+
assert len(expanded_results) == 2
|
|
625
|
+
|
|
626
|
+
# Check first expanded result (should include chunks 0,1,2 from doc1)
|
|
627
|
+
expanded1, score1 = expanded_results[0]
|
|
628
|
+
assert expanded1.id == chunk1.id
|
|
629
|
+
assert score1 == 0.8
|
|
630
|
+
assert "Doc1 Part A" in expanded1.content
|
|
631
|
+
assert "Doc1 Part B" in expanded1.content
|
|
632
|
+
assert "Doc1 Part C" in expanded1.content
|
|
633
|
+
|
|
634
|
+
# Check second expanded result (should include chunks 0,1 from doc2)
|
|
635
|
+
expanded2, score2 = expanded_results[1]
|
|
636
|
+
assert expanded2.id == chunk2.id
|
|
637
|
+
assert score2 == 0.7
|
|
638
|
+
assert "Doc2 Section X" in expanded2.content
|
|
639
|
+
assert "Doc2 Section Y" in expanded2.content
|
|
644
640
|
|
|
645
641
|
|
|
646
642
|
@pytest.mark.asyncio
|