haiku.rag 0.7.7__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/PKG-INFO +1 -1
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/benchmarks.md +2 -2
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/cli.md +70 -53
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/configuration.md +32 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/index.md +0 -1
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/python.md +18 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/pyproject.toml +2 -1
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/app.py +9 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/cli.py +12 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/client.py +10 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/config.py +4 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/logging.py +3 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/migration.py +3 -3
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/engine.py +33 -6
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/chunk.py +24 -1
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/document.py +48 -28
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/settings.py +8 -3
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/utils.py +54 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/generate_benchmark_db.py +6 -1
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_client.py +91 -95
- haiku_rag-0.8.1/tests/test_preprocessor.py +71 -0
- haiku_rag-0.8.1/tests/test_versioning.py +94 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/uv.lock +175 -1
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.github/FUNDING.yml +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.github/workflows/build-docs.yml +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.github/workflows/build-publish.yml +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.gitignore +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.pre-commit-config.yaml +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.python-version +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/LICENSE +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/README.md +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/installation.md +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/mcp.md +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/server.md +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/mkdocs.yml +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/chunker.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/base.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/vllm.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/mcp.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/monitor.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/qa/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/qa/agent.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/qa/prompts.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reader.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/base.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/cohere.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/mxbai.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/vllm.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/models/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/models/chunk.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/models/document.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/upgrades/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/__init__.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/conftest.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/llm_judge.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_app.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_chunk.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_chunker.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_cli.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_document.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_embedder.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_lancedb_connection.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_monitor.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_qa.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_reader.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_rebuild.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_reranker.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_search.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_settings.py +0 -0
- {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_utils.py +0 -0
|
@@ -16,8 +16,8 @@ The recall obtained is ~0.79 for matching in the top result, raising to ~0.91 fo
|
|
|
16
16
|
|---------------------------------------|-------------------|-------------------|------------------------|
|
|
17
17
|
| Ollama / `mxbai-embed-large` | 0.79 | 0.91 | None |
|
|
18
18
|
| Ollama / `mxbai-embed-large` | 0.90 | 0.95 | `mxbai-rerank-base-v2` |
|
|
19
|
-
|
|
20
|
-
| OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
|
|
19
|
+
| Ollama / `nomic-embed-text-v1.5` | 0.74 | 0.90 | None |
|
|
20
|
+
<!-- | OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
|
|
21
21
|
| OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
|
|
22
22
|
| OpenAI / `text-embeddings-3-small` | 0.83 | 0.90 | Cohere / `rerank-v3.5` | -->
|
|
23
23
|
|
|
@@ -2,22 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
The `haiku-rag` CLI provides complete document management functionality.
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
Enable shell autocompletion for faster, error‑free usage.
|
|
5
|
+
!!! note
|
|
6
|
+
All commands support:
|
|
8
7
|
|
|
9
|
-
-
|
|
10
|
-
|
|
11
|
-
eval "$(haiku-rag --show-completion)"
|
|
12
|
-
```
|
|
13
|
-
- Permanent installation:
|
|
14
|
-
```bash
|
|
15
|
-
haiku-rag --install-completion
|
|
16
|
-
```
|
|
8
|
+
- `--db` - Specify custom database path
|
|
9
|
+
- `-h` - Show help for specific command
|
|
17
10
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
-
|
|
11
|
+
Example:
|
|
12
|
+
```bash
|
|
13
|
+
haiku-rag list --db /path/to/custom.db
|
|
14
|
+
haiku-rag add -h
|
|
15
|
+
```
|
|
21
16
|
|
|
22
17
|
## Document Management
|
|
23
18
|
|
|
@@ -40,6 +35,12 @@ haiku-rag add-src /path/to/document.pdf
|
|
|
40
35
|
haiku-rag add-src https://example.com/article.html
|
|
41
36
|
```
|
|
42
37
|
|
|
38
|
+
!!! note
|
|
39
|
+
As you add documents to `haiku.rag` the database keeps growing. By default, LanceDB supports versioning
|
|
40
|
+
of your data. Create/update operations are effectively atomic: if anything fails during chunking or embedding,
|
|
41
|
+
the database rolls back to the pre‑operation snapshot using LanceDB table versioning. You can optimize and
|
|
42
|
+
compact the database by running the [vacuum](#vacuum-optimize-and-cleanup) command.
|
|
43
|
+
|
|
43
44
|
### Get Document
|
|
44
45
|
|
|
45
46
|
```bash
|
|
@@ -55,33 +56,8 @@ haiku-rag delete <TAB>
|
|
|
55
56
|
haiku-rag rm <TAB> # alias
|
|
56
57
|
```
|
|
57
58
|
|
|
58
|
-
### Rebuild Database
|
|
59
|
-
|
|
60
|
-
Rebuild the database by deleting all chunks & embeddings and re-indexing all documents:
|
|
61
|
-
|
|
62
|
-
```bash
|
|
63
|
-
haiku-rag rebuild
|
|
64
|
-
```
|
|
65
|
-
|
|
66
59
|
Use this when you want to change things like the embedding model or chunk size for example.
|
|
67
60
|
|
|
68
|
-
## Migration
|
|
69
|
-
|
|
70
|
-
### Migrate from SQLite to LanceDB
|
|
71
|
-
|
|
72
|
-
Migrate an existing SQLite database to LanceDB:
|
|
73
|
-
|
|
74
|
-
```bash
|
|
75
|
-
haiku-rag migrate /path/to/old_database.sqlite
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
This will:
|
|
79
|
-
- Read all documents, chunks, embeddings, and settings from the SQLite database
|
|
80
|
-
- Create a new LanceDB database with the same data in the same directory
|
|
81
|
-
- Optimize the new database for best performance
|
|
82
|
-
|
|
83
|
-
The original SQLite database remains unchanged, so you can safely migrate without risk of data loss.
|
|
84
|
-
|
|
85
61
|
## Search
|
|
86
62
|
|
|
87
63
|
Basic search:
|
|
@@ -108,13 +84,6 @@ haiku-rag ask "Who is the author of haiku.rag?" --cite
|
|
|
108
84
|
|
|
109
85
|
The QA agent will search your documents for relevant information and provide a comprehensive answer. With `--cite`, responses include citations showing which documents were used.
|
|
110
86
|
|
|
111
|
-
## Configuration
|
|
112
|
-
|
|
113
|
-
View current configuration settings:
|
|
114
|
-
```bash
|
|
115
|
-
haiku-rag settings
|
|
116
|
-
```
|
|
117
|
-
|
|
118
87
|
## Server
|
|
119
88
|
|
|
120
89
|
Start the MCP server:
|
|
@@ -129,14 +98,62 @@ haiku-rag serve --stdio
|
|
|
129
98
|
haiku-rag serve --sse
|
|
130
99
|
```
|
|
131
100
|
|
|
132
|
-
##
|
|
101
|
+
## Settings
|
|
133
102
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
-
|
|
103
|
+
View current configuration settings:
|
|
104
|
+
```bash
|
|
105
|
+
haiku-rag settings
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Maintenance
|
|
109
|
+
|
|
110
|
+
### Vacuum (Optimize and Cleanup)
|
|
111
|
+
|
|
112
|
+
Reduce disk usage by optimizing and pruning old table versions across all tables:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
haiku-rag vacuum
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Rebuild Database
|
|
119
|
+
|
|
120
|
+
Rebuild the database by deleting all chunks & embeddings and re-indexing all documents. This is useful
|
|
121
|
+
when you want to switch embeddings provider or model:
|
|
137
122
|
|
|
138
|
-
Example:
|
|
139
123
|
```bash
|
|
140
|
-
haiku-rag
|
|
141
|
-
haiku-rag add -h
|
|
124
|
+
haiku-rag rebuild
|
|
142
125
|
```
|
|
126
|
+
|
|
127
|
+
## Migration
|
|
128
|
+
|
|
129
|
+
### Migrate from SQLite to LanceDB
|
|
130
|
+
|
|
131
|
+
Migrate an existing SQLite database to LanceDB:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
haiku-rag migrate /path/to/old_database.sqlite
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
This will:
|
|
138
|
+
- Read all documents, chunks, embeddings, and settings from the SQLite database
|
|
139
|
+
- Create a new LanceDB database with the same data in the same directory
|
|
140
|
+
- Optimize the new database for best performance
|
|
141
|
+
|
|
142
|
+
The original SQLite database remains unchanged, so you can safely migrate without risk of data loss.
|
|
143
|
+
|
|
144
|
+
## Shell Autocompletion
|
|
145
|
+
|
|
146
|
+
Enable shell autocompletion for faster, error‑free usage.
|
|
147
|
+
|
|
148
|
+
- Temporary (current shell only):
|
|
149
|
+
```bash
|
|
150
|
+
eval "$(haiku-rag --show-completion)"
|
|
151
|
+
```
|
|
152
|
+
- Permanent installation:
|
|
153
|
+
```bash
|
|
154
|
+
haiku-rag --install-completion
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
What’s completed:
|
|
158
|
+
- `get` and `delete`/`rm`: Document IDs from the selected database (respects `--db`).
|
|
159
|
+
- `add-src`: Local filesystem paths (URLs can still be typed manually).
|
|
@@ -223,3 +223,35 @@ CHUNK_SIZE=256
|
|
|
223
223
|
# into single chunks with continuous content to eliminate duplication
|
|
224
224
|
CONTEXT_CHUNK_RADIUS=0
|
|
225
225
|
```
|
|
226
|
+
|
|
227
|
+
#### Markdown Preprocessor
|
|
228
|
+
|
|
229
|
+
Optionally preprocess Markdown before chunking by pointing to a callable that receives and returns Markdown text. This is useful for normalizing content, stripping boilerplate, or applying custom transformations before chunk boundaries are computed.
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
# A callable path in one of these formats:
|
|
233
|
+
# - package.module:func
|
|
234
|
+
# - package.module.func
|
|
235
|
+
# - /abs/or/relative/path/to/file.py:func
|
|
236
|
+
MARKDOWN_PREPROCESSOR="my_pkg.preprocess:clean_md"
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
!!! note
|
|
240
|
+
- The function signature should be `def clean_md(text: str) -> str` or `async def clean_md(text: str) -> str`.
|
|
241
|
+
- If the function raises or returns a non-string, haiku.rag logs a warning and proceeds without preprocessing.
|
|
242
|
+
- The preprocessor affects only the chunking pipeline. The stored document content remains unchanged.
|
|
243
|
+
|
|
244
|
+
Example implementation:
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
# my_pkg/preprocess.py
|
|
248
|
+
def clean_md(text: str) -> str:
    """Strip HTML-comment lines and collapse runs of blank lines.

    Args:
        text: Markdown text to clean.

    Returns:
        The text with every line that starts with ``<!--`` (ignoring leading
        whitespace) removed, and each run of consecutive blank lines reduced
        to its first line. Lines are re-joined with ``"\n"``.
    """
    out: list[str] = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith("<!--"):
            continue
        # Treat whitespace-only lines as blank on BOTH sides of the comparison,
        # so "  " followed by "\t" collapses just like "" followed by "".
        if not stripped and out and not out[-1].strip():
            continue
        out.append(line)
    return "\n".join(out)
|
|
257
|
+
```
|
|
@@ -52,7 +52,6 @@ haiku-rag migrate old_database.sqlite # Migrate from SQLite
|
|
|
52
52
|
- [Installation](installation.md) - Install haiku.rag with different providers
|
|
53
53
|
- [Configuration](configuration.md) - Environment variables and settings
|
|
54
54
|
- [CLI](cli.md) - Command line interface usage
|
|
55
|
-
- [Question Answering](qa.md) - QA agents and natural language queries
|
|
56
55
|
- [Server](server.md) - File monitoring and server mode
|
|
57
56
|
- [MCP](mcp.md) - Model Context Protocol integration
|
|
58
57
|
- [Python](python.md) - Python API reference
|
|
@@ -99,6 +99,24 @@ async for doc_id in client.rebuild_database():
|
|
|
99
99
|
print(f"Processed document {doc_id}")
|
|
100
100
|
```
|
|
101
101
|
|
|
102
|
+
## Maintenance
|
|
103
|
+
|
|
104
|
+
Run maintenance to optimize storage and prune old table versions:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
await client.vacuum()
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
This compacts tables and removes historical versions to keep disk usage in check. It’s safe to run anytime, for example after bulk imports or periodically in long‑running apps.
|
|
111
|
+
|
|
112
|
+
### Atomic Writes and Rollback
|
|
113
|
+
|
|
114
|
+
Document create and update operations take a snapshot of table versions before any write and automatically roll back to that snapshot if something fails (for example, during chunking or embedding). This restores both the `documents` and `chunks` tables to their pre‑operation state using LanceDB’s table versioning.
|
|
115
|
+
|
|
116
|
+
- Applies to: `create_document(...)`, `create_document_from_source(...)`, `update_document(...)`, and internal rebuild/update flows.
|
|
117
|
+
- Scope: Both document rows and all associated chunks are rolled back together.
|
|
118
|
+
- Vacuum: Running `vacuum()` later prunes old versions for disk efficiency; rollbacks occur immediately during the failing operation and are not impacted.
|
|
119
|
+
|
|
102
120
|
## Searching Documents
|
|
103
121
|
|
|
104
122
|
The search method performs native hybrid search (vector + full-text) using LanceDB with optional reranking for improved relevance:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "haiku.rag"
|
|
3
|
-
version = "0.7.7"
|
|
3
|
+
version = "0.8.1"
|
|
4
4
|
description = "Retrieval Augmented Generation (RAG) with LanceDB"
|
|
5
5
|
authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
|
|
6
6
|
license = { text = "MIT" }
|
|
@@ -53,6 +53,7 @@ packages = ["src/haiku"]
|
|
|
53
53
|
[dependency-groups]
|
|
54
54
|
dev = [
|
|
55
55
|
"datasets>=3.6.0",
|
|
56
|
+
"logfire>=4.6.0",
|
|
56
57
|
"mkdocs>=1.6.1",
|
|
57
58
|
"mkdocs-material>=9.6.14",
|
|
58
59
|
"pre-commit>=4.2.0",
|
|
@@ -102,6 +102,15 @@ class HaikuRAGApp:
|
|
|
102
102
|
except Exception as e:
|
|
103
103
|
self.console.print(f"[red]Error rebuilding database: {e}[/red]")
|
|
104
104
|
|
|
105
|
+
async def vacuum(self):
    """Run vacuum on the configured database and report the outcome.

    Opens a HaikuRAG client on ``self.db_path`` with ``skip_validation=True``,
    awaits its ``vacuum()`` and prints a success line to ``self.console``.
    Any exception is caught and printed as a red error line instead of
    propagating.
    """
    try:
        rag = HaikuRAG(db_path=self.db_path, skip_validation=True)
        async with rag as client:
            await client.vacuum()
            # Reported while the client context is still open.
            self.console.print("[b]Vacuum completed successfully.[/b]")
    except Exception as exc:
        self.console.print(f"[red]Error during vacuum: {exc}[/red]")
|
|
113
|
+
|
|
105
114
|
def show_settings(self):
|
|
106
115
|
"""Display current configuration settings."""
|
|
107
116
|
self.console.print("[bold]haiku.rag configuration[/bold]")
|
|
@@ -256,6 +256,18 @@ def rebuild(
|
|
|
256
256
|
asyncio.run(app.rebuild())
|
|
257
257
|
|
|
258
258
|
|
|
259
|
+
# CLI entry point for `haiku-rag vacuum`: builds the app for the selected
# database path and drives its async vacuum() to completion.
@cli.command("vacuum", help="Optimize and clean up all tables to reduce disk usage")
def vacuum(
    db: Path = typer.Option(
        Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
        "--db",
        help="Path to the LanceDB database file",
    ),
):
    rag_app = HaikuRAGApp(db_path=db)
    maintenance = rag_app.vacuum()
    asyncio.run(maintenance)
|
|
269
|
+
|
|
270
|
+
|
|
259
271
|
@cli.command(
|
|
260
272
|
"serve", help="Start the haiku.rag MCP server (by default in streamable HTTP mode)"
|
|
261
273
|
)
|
|
@@ -550,6 +550,16 @@ class HaikuRAG:
|
|
|
550
550
|
)
|
|
551
551
|
yield doc.id
|
|
552
552
|
|
|
553
|
+
# Final maintenance: centralized vacuum to curb disk usage
|
|
554
|
+
try:
|
|
555
|
+
self.store.vacuum()
|
|
556
|
+
except Exception:
|
|
557
|
+
pass
|
|
558
|
+
|
|
559
|
+
async def vacuum(self) -> None:
    """Run the store's vacuum to optimize tables and clean up old versions.

    Delegates directly to ``self.store.vacuum()``; the call is made
    synchronously inside this coroutine.
    """
    store = self.store
    store.vacuum()
|
|
562
|
+
|
|
553
563
|
def close(self):
|
|
554
564
|
"""Close the underlying store connection."""
|
|
555
565
|
self.store.close()
|
|
@@ -32,6 +32,10 @@ class AppConfig(BaseModel):
|
|
|
32
32
|
CHUNK_SIZE: int = 256
|
|
33
33
|
CONTEXT_CHUNK_RADIUS: int = 0
|
|
34
34
|
|
|
35
|
+
# Optional dotted path or file path to a callable that preprocesses
|
|
36
|
+
# markdown content before chunking, e.g. "my_pkg.preprocess:clean_md".
|
|
37
|
+
MARKDOWN_PREPROCESSOR: str = ""
|
|
38
|
+
|
|
35
39
|
OLLAMA_BASE_URL: str = "http://localhost:11434"
|
|
36
40
|
VLLM_EMBEDDINGS_BASE_URL: str = ""
|
|
37
41
|
VLLM_RERANK_BASE_URL: str = ""
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
2
3
|
|
|
3
4
|
from rich.console import Console
|
|
4
5
|
from rich.logging import RichHandler
|
|
@@ -50,4 +51,6 @@ def configure_cli_logging(level: int = logging.INFO) -> logging.Logger:
|
|
|
50
51
|
logger = get_logger()
|
|
51
52
|
logger.setLevel(level)
|
|
52
53
|
logger.propagate = False
|
|
54
|
+
|
|
55
|
+
warnings.filterwarnings("ignore")
|
|
53
56
|
return logger
|
|
@@ -47,7 +47,7 @@ class SQLiteToLanceDBMigrator:
|
|
|
47
47
|
|
|
48
48
|
# Load the sqlite-vec extension
|
|
49
49
|
try:
|
|
50
|
-
import sqlite_vec
|
|
50
|
+
import sqlite_vec # type: ignore
|
|
51
51
|
|
|
52
52
|
sqlite_conn.enable_load_extension(True)
|
|
53
53
|
sqlite_vec.load(sqlite_conn)
|
|
@@ -91,10 +91,10 @@ class SQLiteToLanceDBMigrator:
|
|
|
91
91
|
|
|
92
92
|
sqlite_conn.close()
|
|
93
93
|
|
|
94
|
-
# Optimize
|
|
94
|
+
# Optimize and cleanup using centralized vacuum
|
|
95
95
|
self.console.print("[blue]Optimizing LanceDB...[/blue]")
|
|
96
96
|
try:
|
|
97
|
-
lance_store.
|
|
97
|
+
lance_store.vacuum()
|
|
98
98
|
self.console.print("[green]✅ Optimization completed[/green]")
|
|
99
99
|
except Exception as e:
|
|
100
100
|
self.console.print(
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
from datetime import timedelta
|
|
3
4
|
from importlib import metadata
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from uuid import uuid4
|
|
@@ -62,6 +63,15 @@ class Store:
|
|
|
62
63
|
if not skip_validation:
|
|
63
64
|
self._validate_configuration()
|
|
64
65
|
|
|
66
|
+
def vacuum(self) -> None:
    """Optimize every table and clean up old versions to reduce disk usage.

    Does nothing when a cloud configuration is present and the configured
    LanceDB URI starts with ``db://``.
    # NOTE(review): presumably the hosted service handles its own maintenance — confirm.
    """
    uses_cloud_uri = self._has_cloud_config() and str(Config.LANCEDB_URI).startswith(
        "db://"
    )
    if not uses_cloud_uri:
        tables = [self.documents_table, self.chunks_table, self.settings_table]
        for tbl in tables:
            # A zero timedelta is passed as the cleanup age threshold.
            tbl.optimize(cleanup_older_than=timedelta(0))
|
|
74
|
+
|
|
65
75
|
def _connect_to_lancedb(self, db_path: Path):
|
|
66
76
|
"""Establish connection to LanceDB (local, cloud, or object storage)."""
|
|
67
77
|
# Check if we have cloud configuration
|
|
@@ -159,16 +169,18 @@ class Store:
|
|
|
159
169
|
self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
|
|
160
170
|
)
|
|
161
171
|
if settings_records:
|
|
162
|
-
|
|
172
|
+
# Only write if version actually changes to avoid creating new table versions
|
|
173
|
+
current = (
|
|
163
174
|
json.loads(settings_records[0].settings)
|
|
164
175
|
if settings_records[0].settings
|
|
165
176
|
else {}
|
|
166
177
|
)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
178
|
+
if current.get("version") != version:
|
|
179
|
+
current["version"] = version
|
|
180
|
+
self.settings_table.update(
|
|
181
|
+
where="id = 'settings'",
|
|
182
|
+
values={"settings": json.dumps(current)},
|
|
183
|
+
)
|
|
172
184
|
else:
|
|
173
185
|
# Create new settings record
|
|
174
186
|
settings_data = Config.model_dump(mode="json")
|
|
@@ -197,6 +209,21 @@ class Store:
|
|
|
197
209
|
# LanceDB connections are automatically managed
|
|
198
210
|
pass
|
|
199
211
|
|
|
212
|
+
def current_table_versions(self) -> dict[str, int]:
    """Return the current version number of each key table.

    The result maps "documents", "chunks" and "settings" to the integer
    value of the corresponding table's ``version`` attribute, in the shape
    that ``restore_table_versions`` accepts for rollback.
    """
    snapshot: dict[str, int] = {}
    for key in ("documents", "chunks", "settings"):
        table = getattr(self, f"{key}_table")
        snapshot[key] = int(table.version)
    return snapshot
|
|
219
|
+
|
|
220
|
+
def restore_table_versions(self, versions: dict[str, int]) -> bool:
    """Restore the key tables to the given versions.

    Args:
        versions: Mapping with "documents", "chunks" and "settings" keys,
            each holding the version to restore that table to.

    Returns:
        Always True once all three ``restore`` calls have returned.

    Raises:
        KeyError: If one of the three keys is missing from ``versions``.
    """
    # Tables are restored in a fixed order: documents, chunks, settings.
    for key in ("documents", "chunks", "settings"):
        table = getattr(self, f"{key}_table")
        table.restore(int(versions[key]))
    return True
|
|
226
|
+
|
|
200
227
|
@property
|
|
201
228
|
def _connection(self):
|
|
202
229
|
"""Compatibility property for repositories expecting _connection."""
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import inspect
|
|
2
3
|
import json
|
|
3
4
|
import logging
|
|
4
5
|
from uuid import uuid4
|
|
@@ -11,6 +12,7 @@ from haiku.rag.config import Config
|
|
|
11
12
|
from haiku.rag.embeddings import get_embedder
|
|
12
13
|
from haiku.rag.store.engine import DocumentRecord, Store
|
|
13
14
|
from haiku.rag.store.models.chunk import Chunk
|
|
15
|
+
from haiku.rag.utils import load_callable, text_to_docling_document
|
|
14
16
|
|
|
15
17
|
logger = logging.getLogger(__name__)
|
|
16
18
|
|
|
@@ -152,7 +154,28 @@ class ChunkRepository:
|
|
|
152
154
|
self, document_id: str, document: DoclingDocument
|
|
153
155
|
) -> list[Chunk]:
|
|
154
156
|
"""Create chunks and embeddings for a document from DoclingDocument."""
|
|
155
|
-
|
|
157
|
+
# Optionally preprocess markdown before chunking
|
|
158
|
+
processed_document = document
|
|
159
|
+
preprocessor_path = Config.MARKDOWN_PREPROCESSOR
|
|
160
|
+
if preprocessor_path:
|
|
161
|
+
try:
|
|
162
|
+
pre_fn = load_callable(preprocessor_path)
|
|
163
|
+
markdown = document.export_to_markdown()
|
|
164
|
+
result = pre_fn(markdown)
|
|
165
|
+
if inspect.isawaitable(result):
|
|
166
|
+
result = await result # type: ignore[assignment]
|
|
167
|
+
processed_markdown = result
|
|
168
|
+
if not isinstance(processed_markdown, str):
|
|
169
|
+
raise ValueError("Preprocessor must return a markdown string")
|
|
170
|
+
processed_document = text_to_docling_document(
|
|
171
|
+
processed_markdown, name="content.md"
|
|
172
|
+
)
|
|
173
|
+
except Exception as e:
|
|
174
|
+
logger.warning(
|
|
175
|
+
f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
chunk_texts = await chunker.chunk(processed_document)
|
|
156
179
|
|
|
157
180
|
embeddings = await self.embedder.embed(chunk_texts)
|
|
158
181
|
|
|
@@ -171,44 +171,64 @@ class DocumentRepository:
|
|
|
171
171
|
chunks: list["Chunk"] | None = None,
|
|
172
172
|
) -> Document:
|
|
173
173
|
"""Create a document with its chunks and embeddings."""
|
|
174
|
+
# Snapshot table versions for versioned rollback (if supported)
|
|
175
|
+
versions = self.store.current_table_versions()
|
|
176
|
+
|
|
174
177
|
# Create the document
|
|
175
178
|
created_doc = await self.create(entity)
|
|
176
179
|
|
|
177
|
-
#
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
180
|
+
# Attempt to create chunks; on failure, prefer version rollback
|
|
181
|
+
try:
|
|
182
|
+
# Create chunks if not provided
|
|
183
|
+
if chunks is None:
|
|
184
|
+
assert created_doc.id is not None, (
|
|
185
|
+
"Document ID should not be None after creation"
|
|
186
|
+
)
|
|
187
|
+
await self.chunk_repository.create_chunks_for_document(
|
|
188
|
+
created_doc.id, docling_document
|
|
189
|
+
)
|
|
190
|
+
else:
|
|
191
|
+
# Use provided chunks, set order from list position
|
|
192
|
+
assert created_doc.id is not None, (
|
|
193
|
+
"Document ID should not be None after creation"
|
|
194
|
+
)
|
|
195
|
+
for order, chunk in enumerate(chunks):
|
|
196
|
+
chunk.document_id = created_doc.id
|
|
197
|
+
chunk.metadata["order"] = order
|
|
198
|
+
await self.chunk_repository.create(chunk)
|
|
199
|
+
|
|
200
|
+
return created_doc
|
|
201
|
+
except Exception:
|
|
202
|
+
# Roll back to the captured versions and re-raise
|
|
203
|
+
self.store.restore_table_versions(versions)
|
|
204
|
+
raise
|
|
196
205
|
|
|
197
206
|
async def _update_with_docling(
|
|
198
207
|
self, entity: Document, docling_document: DoclingDocument
|
|
199
208
|
) -> Document:
|
|
200
209
|
"""Update a document and regenerate its chunks."""
|
|
201
|
-
# Delete existing chunks
|
|
202
210
|
assert entity.id is not None, "Document ID is required for update"
|
|
211
|
+
|
|
212
|
+
# Snapshot table versions for versioned rollback
|
|
213
|
+
versions = self.store.current_table_versions()
|
|
214
|
+
|
|
215
|
+
# Delete existing chunks before writing new ones
|
|
203
216
|
await self.chunk_repository.delete_by_document_id(entity.id)
|
|
204
217
|
|
|
205
|
-
|
|
206
|
-
|
|
218
|
+
try:
|
|
219
|
+
# Update the document
|
|
220
|
+
updated_doc = await self.update(entity)
|
|
207
221
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
222
|
+
# Create new chunks
|
|
223
|
+
assert updated_doc.id is not None, (
|
|
224
|
+
"Document ID should not be None after update"
|
|
225
|
+
)
|
|
226
|
+
await self.chunk_repository.create_chunks_for_document(
|
|
227
|
+
updated_doc.id, docling_document
|
|
228
|
+
)
|
|
213
229
|
|
|
214
|
-
|
|
230
|
+
return updated_doc
|
|
231
|
+
except Exception:
|
|
232
|
+
# Roll back to the captured versions and re-raise
|
|
233
|
+
self.store.restore_table_versions(versions)
|
|
234
|
+
raise
|
|
@@ -84,10 +84,15 @@ class SettingsRepository:
|
|
|
84
84
|
)
|
|
85
85
|
|
|
86
86
|
if existing:
|
|
87
|
-
#
|
|
88
|
-
|
|
89
|
-
|
|
87
|
+
# Only update when configuration actually changed to avoid needless new versions
|
|
88
|
+
existing_payload = (
|
|
89
|
+
json.loads(existing[0].settings) if existing[0].settings else {}
|
|
90
90
|
)
|
|
91
|
+
if existing_payload != current_config:
|
|
92
|
+
self.store.settings_table.update(
|
|
93
|
+
where="id = 'settings'",
|
|
94
|
+
values={"settings": json.dumps(current_config)},
|
|
95
|
+
)
|
|
91
96
|
else:
|
|
92
97
|
# Create new settings
|
|
93
98
|
settings_record = SettingsRecord(
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import importlib
|
|
3
|
+
import importlib.util
|
|
2
4
|
import sys
|
|
3
5
|
from collections.abc import Callable
|
|
4
6
|
from functools import wraps
|
|
5
7
|
from importlib import metadata
|
|
6
8
|
from io import BytesIO
|
|
7
9
|
from pathlib import Path
|
|
10
|
+
from types import ModuleType
|
|
8
11
|
|
|
9
12
|
import httpx
|
|
10
13
|
from docling.document_converter import DocumentConverter
|
|
@@ -106,3 +109,54 @@ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocu
|
|
|
106
109
|
converter = DocumentConverter()
|
|
107
110
|
result = converter.convert(doc_stream)
|
|
108
111
|
return result.document
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def load_callable(path: str):
    """Load a callable from a dotted path or file path.

    Supported formats:
    - "package.module:func" or "package.module.func"
    - "path/to/file.py:func"

    Args:
        path: Location of the callable in one of the formats above.

    Returns:
        The loaded callable.

    Raises:
        ValueError: If the path is empty or malformed, the module or file
            cannot be loaded, the attribute does not exist, or the attribute
            is not callable.
    """
    if not path:
        raise ValueError("Empty callable path provided")

    if ":" in path:
        # Split on the LAST colon: the file part may itself contain one
        # (e.g. a Windows drive letter in "C:\\dir\\file.py:func").
        module_part, func_name = path.rsplit(":", 1)
    elif "." in path:
        # No colon: the attribute is whatever follows the last dot.
        module_part, func_name = path.rsplit(".", 1)
    else:
        raise ValueError(
            "Invalid callable path format. Use 'module:func' or 'module.func' or 'file.py:func'."
        )

    if not module_part or not func_name:
        raise ValueError(
            "Invalid callable path format. Use 'module:func' or 'module.func' or 'file.py:func'."
        )

    # Try file path first
    mod: ModuleType | None = None
    module_path = Path(module_part)
    if module_path.suffix == ".py" and module_path.exists():
        spec = importlib.util.spec_from_file_location(module_path.stem, module_path)
        if spec is None or spec.loader is None:
            # Fail here with an accurate message instead of falling through
            # with mod=None and reporting a missing attribute.
            raise ValueError(f"Failed to load module from file '{module_part}'")
        mod = importlib.util.module_from_spec(spec)
        try:
            spec.loader.exec_module(mod)
        except Exception as e:
            # Keep the documented contract: every failure surfaces as ValueError.
            raise ValueError(
                f"Failed to load module from file '{module_part}': {e}"
            ) from e
    else:
        # Import as a module path
        try:
            mod = importlib.import_module(module_part)
        except Exception as e:
            raise ValueError(f"Failed to import module '{module_part}': {e}") from e

    if not hasattr(mod, func_name):
        raise ValueError(f"Callable '{func_name}' not found in module '{module_part}'")
    func = getattr(mod, func_name)
    if not callable(func):
        raise ValueError(
            f"Attribute '{func_name}' in module '{module_part}' is not callable"
        )
    return func
|