haiku.rag 0.7.7__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)
  1. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/PKG-INFO +1 -1
  2. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/benchmarks.md +2 -2
  3. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/cli.md +70 -53
  4. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/configuration.md +32 -0
  5. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/index.md +0 -1
  6. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/python.md +18 -0
  7. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/pyproject.toml +2 -1
  8. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/app.py +9 -0
  9. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/cli.py +12 -0
  10. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/client.py +10 -0
  11. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/config.py +4 -0
  12. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/logging.py +3 -0
  13. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/migration.py +3 -3
  14. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/engine.py +33 -6
  15. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/chunk.py +24 -1
  16. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/document.py +48 -28
  17. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/settings.py +8 -3
  18. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/utils.py +54 -0
  19. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/generate_benchmark_db.py +6 -1
  20. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_client.py +91 -95
  21. haiku_rag-0.8.1/tests/test_preprocessor.py +71 -0
  22. haiku_rag-0.8.1/tests/test_versioning.py +94 -0
  23. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/uv.lock +175 -1
  24. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.github/FUNDING.yml +0 -0
  25. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.github/workflows/build-docs.yml +0 -0
  26. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.github/workflows/build-publish.yml +0 -0
  27. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.gitignore +0 -0
  28. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.pre-commit-config.yaml +0 -0
  29. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/.python-version +0 -0
  30. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/LICENSE +0 -0
  31. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/README.md +0 -0
  32. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/installation.md +0 -0
  33. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/mcp.md +0 -0
  34. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/server.md +0 -0
  35. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/mkdocs.yml +0 -0
  36. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/__init__.py +0 -0
  37. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/chunker.py +0 -0
  38. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/__init__.py +0 -0
  39. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/base.py +0 -0
  40. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/ollama.py +0 -0
  41. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/openai.py +0 -0
  42. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/vllm.py +0 -0
  43. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/voyageai.py +0 -0
  44. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/mcp.py +0 -0
  45. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/monitor.py +0 -0
  46. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/qa/__init__.py +0 -0
  47. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/qa/agent.py +0 -0
  48. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/qa/prompts.py +0 -0
  49. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reader.py +0 -0
  50. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/__init__.py +0 -0
  51. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/base.py +0 -0
  52. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/cohere.py +0 -0
  53. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/mxbai.py +0 -0
  54. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/reranking/vllm.py +0 -0
  55. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/__init__.py +0 -0
  56. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/models/__init__.py +0 -0
  57. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/models/chunk.py +0 -0
  58. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/models/document.py +0 -0
  59. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/__init__.py +0 -0
  60. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/upgrades/__init__.py +0 -0
  61. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/__init__.py +0 -0
  62. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/conftest.py +0 -0
  63. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/llm_judge.py +0 -0
  64. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_app.py +0 -0
  65. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_chunk.py +0 -0
  66. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_chunker.py +0 -0
  67. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_cli.py +0 -0
  68. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_document.py +0 -0
  69. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_embedder.py +0 -0
  70. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_lancedb_connection.py +0 -0
  71. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_monitor.py +0 -0
  72. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_qa.py +0 -0
  73. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_reader.py +0 -0
  74. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_rebuild.py +0 -0
  75. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_reranker.py +0 -0
  76. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_search.py +0 -0
  77. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_settings.py +0 -0
  78. {haiku_rag-0.7.7 → haiku_rag-0.8.1}/tests/test_utils.py +0 -0

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haiku.rag
-Version: 0.7.7
+Version: 0.8.1
 Summary: Retrieval Augmented Generation (RAG) with LanceDB
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/benchmarks.md

@@ -16,8 +16,8 @@ The recall obtained is ~0.79 for matching in the top result, raising to ~0.91 fo
 |---------------------------------------|-------------------|-------------------|------------------------|
 | Ollama / `mxbai-embed-large` | 0.79 | 0.91 | None |
 | Ollama / `mxbai-embed-large` | 0.90 | 0.95 | `mxbai-rerank-base-v2` |
-<!-- | Ollama / `nomic-embed-text` | 0.74 | 0.88 | None |
-| OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
+| Ollama / `nomic-embed-text-v1.5` | 0.74 | 0.90 | None |
+<!-- | OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
 | OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
 | OpenAI / `text-embeddings-3-small` | 0.83 | 0.90 | Cohere / `rerank-v3.5` | -->
 

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/cli.md

@@ -2,22 +2,17 @@
 
 The `haiku-rag` CLI provides complete document management functionality.
 
-## Shell Autocompletion
-
-Enable shell autocompletion for faster, error‑free usage.
+!!! note
+    All commands support:
 
-- Temporary (current shell only):
-```bash
-eval "$(haiku-rag --show-completion)"
-```
-- Permanent installation:
-```bash
-haiku-rag --install-completion
-```
+    - `--db` - Specify custom database path
+    - `-h` - Show help for specific command
 
-What’s completed:
-- `get` and `delete`/`rm`: Document IDs from the selected database (respects `--db`).
-- `add-src`: Local filesystem paths (URLs can still be typed manually).
+Example:
+```bash
+haiku-rag list --db /path/to/custom.db
+haiku-rag add -h
+```
 
 ## Document Management
 
@@ -40,6 +35,12 @@ haiku-rag add-src /path/to/document.pdf
 haiku-rag add-src https://example.com/article.html
 ```
 
+!!! note
+    As you add documents to `haiku.rag` the database keeps growing. By default, LanceDB supports versioning
+    of your data. Create/update operations are atomic‑feeling: if anything fails during chunking or embedding,
+    the database rolls back to the pre‑operation snapshot using LanceDB table versioning. You can optimize and
+    compact the database by running the [vacuum](#vacuum-optimize-and-cleanup) command.
+
 ### Get Document
 
 ```bash
@@ -55,33 +56,8 @@ haiku-rag delete <TAB>
 haiku-rag rm <TAB> # alias
 ```
 
-### Rebuild Database
-
-Rebuild the database by deleting all chunks & embeddings and re-indexing all documents:
-
-```bash
-haiku-rag rebuild
-```
-
 Use this when you want to change things like the embedding model or chunk size for example.
 
-## Migration
-
-### Migrate from SQLite to LanceDB
-
-Migrate an existing SQLite database to LanceDB:
-
-```bash
-haiku-rag migrate /path/to/old_database.sqlite
-```
-
-This will:
-- Read all documents, chunks, embeddings, and settings from the SQLite database
-- Create a new LanceDB database with the same data in the same directory
-- Optimize the new database for best performance
-
-The original SQLite database remains unchanged, so you can safely migrate without risk of data loss.
-
 ## Search
 
 Basic search:
@@ -108,13 +84,6 @@ haiku-rag ask "Who is the author of haiku.rag?" --cite
 
 The QA agent will search your documents for relevant information and provide a comprehensive answer. With `--cite`, responses include citations showing which documents were used.
 
-## Configuration
-
-View current configuration settings:
-```bash
-haiku-rag settings
-```
-
 ## Server
 
 Start the MCP server:
@@ -129,14 +98,62 @@ haiku-rag serve --stdio
 haiku-rag serve --sse
 ```
 
-## Options
+## Settings
 
-All commands support:
-- `--db` - Specify custom database path
-- `-h` - Show help for specific command
+View current configuration settings:
+```bash
+haiku-rag settings
+```
+
+## Maintenance
+
+### Vacuum (Optimize and Cleanup)
+
+Reduce disk usage by optimizing and pruning old table versions across all tables:
+
+```bash
+haiku-rag vacuum
+```
+
+### Rebuild Database
+
+Rebuild the database by deleting all chunks & embeddings and re-indexing all documents. This is useful
+when want to switch embeddings provider or model:
 
-Example:
 ```bash
-haiku-rag list --db /path/to/custom.db
-haiku-rag add -h
+haiku-rag rebuild
 ```
+
+## Migration
+
+### Migrate from SQLite to LanceDB
+
+Migrate an existing SQLite database to LanceDB:
+
+```bash
+haiku-rag migrate /path/to/old_database.sqlite
+```
+
+This will:
+- Read all documents, chunks, embeddings, and settings from the SQLite database
+- Create a new LanceDB database with the same data in the same directory
+- Optimize the new database for best performance
+
+The original SQLite database remains unchanged, so you can safely migrate without risk of data loss.
+
+## Shell Autocompletion
+
+Enable shell autocompletion for faster, error‑free usage.
+
+- Temporary (current shell only):
+```bash
+eval "$(haiku-rag --show-completion)"
+```
+- Permanent installation:
+```bash
+haiku-rag --install-completion
+```
+
+What’s completed:
+- `get` and `delete`/`rm`: Document IDs from the selected database (respects `--db`).
+- `add-src`: Local filesystem paths (URLs can still be typed manually).

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/configuration.md

@@ -223,3 +223,35 @@ CHUNK_SIZE=256
 # into single chunks with continuous content to eliminate duplication
 CONTEXT_CHUNK_RADIUS=0
 ```
+
+#### Markdown Preprocessor
+
+Optionally preprocess Markdown before chunking by pointing to a callable that receives and returns Markdown text. This is useful for normalizing content, stripping boilerplate, or applying custom transformations before chunk boundaries are computed.
+
+```bash
+# A callable path in one of these formats:
+# - package.module:func
+# - package.module.func
+# - /abs/or/relative/path/to/file.py:func
+MARKDOWN_PREPROCESSOR="my_pkg.preprocess:clean_md"
+```
+
+!!! note
+    - The function signature should be `def clean_md(text: str) -> str` or `async def clean_md(text: str) -> str`.
+    - If the function raises or returns a non-string, haiku.rag logs a warning and proceeds without preprocessing.
+    - The preprocessor affects only the chunking pipeline. The stored document content remains unchanged.
+
+Example implementation:
+
+```python
+# my_pkg/preprocess.py
+def clean_md(text: str) -> str:
+    # strip HTML comments and collapse multiple blank lines
+    lines = [line for line in text.splitlines() if not line.strip().startswith("<!--")]
+    out = []
+    for line in lines:
+        if line.strip() == "" and (out and out[-1] == ""):
+            continue
+        out.append(line)
+    return "\n".join(out)
+```
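
The note in this new configuration section also allows an `async def` preprocessor. As a hedged illustration (not part of the release, and `my_pkg/preprocess.py` is the same hypothetical module the docs use), an async variant could look like:

```python
# my_pkg/preprocess.py (hypothetical async variant)
import re


async def clean_md(text: str) -> str:
    # Drop HTML comments, then collapse runs of blank lines. haiku.rag awaits
    # the result when the configured callable returns an awaitable.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    return re.sub(r"\n{3,}", "\n\n", text).strip() + "\n"
```

It would be wired up the same way, e.g. `MARKDOWN_PREPROCESSOR="my_pkg.preprocess:clean_md"`.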

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/index.md

@@ -52,7 +52,6 @@ haiku-rag migrate old_database.sqlite # Migrate from SQLite
 - [Installation](installation.md) - Install haiku.rag with different providers
 - [Configuration](configuration.md) - Environment variables and settings
 - [CLI](cli.md) - Command line interface usage
-- [Question Answering](qa.md) - QA agents and natural language queries
 - [Server](server.md) - File monitoring and server mode
 - [MCP](mcp.md) - Model Context Protocol integration
 - [Python](python.md) - Python API reference

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/docs/python.md

@@ -99,6 +99,24 @@ async for doc_id in client.rebuild_database():
     print(f"Processed document {doc_id}")
 ```
 
+## Maintenance
+
+Run maintenance to optimize storage and prune old table versions:
+
+```python
+await client.vacuum()
+```
+
+This compacts tables and removes historical versions to keep disk usage in check. It’s safe to run anytime, for example after bulk imports or periodically in long‑running apps.
+
+### Atomic Writes and Rollback
+
+Document create and update operations take a snapshot of table versions before any write and automatically roll back to that snapshot if something fails (for example, during chunking or embedding). This restores both the `documents` and `chunks` tables to their pre‑operation state using LanceDB's table versioning.
+
+- Applies to: `create_document(...)`, `create_document_from_source(...)`, `update_document(...)`, and internal rebuild/update flows.
+- Scope: Both document rows and all associated chunks are rolled back together.
+- Vacuum: Running `vacuum()` later prunes old versions for disk efficiency; rollbacks occur immediately during the failing operation and are not impacted.
+
 
 ## Searching Documents
 The search method performs native hybrid search (vector + full-text) using LanceDB with optional reranking for improved relevance:
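
Read together, the documented maintenance and rollback behavior could be exercised roughly as follows. This is a sketch, not part of the release: the `from haiku.rag.client import HaikuRAG` import path is assumed from the package layout, `create_document_from_source` and `vacuum` are the methods documented above, and `reports/q3.pdf` is a made-up source.

```python
import asyncio

from haiku.rag.client import HaikuRAG  # import path assumed


async def main() -> None:
    async with HaikuRAG(db_path="./haiku.rag.lancedb") as client:
        try:
            # If chunking or embedding fails, the documents and chunks tables
            # are rolled back to their pre-operation versions automatically.
            await client.create_document_from_source("reports/q3.pdf")
        except Exception as exc:
            print(f"Ingestion failed, store left at its previous snapshot: {exc}")

        # Prune the table versions that accumulate as documents are added.
        await client.vacuum()


asyncio.run(main())
```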

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "haiku.rag"
-version = "0.7.7"
+version = "0.8.1"
 description = "Retrieval Augmented Generation (RAG) with LanceDB"
 authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
 license = { text = "MIT" }
@@ -53,6 +53,7 @@ packages = ["src/haiku"]
 [dependency-groups]
 dev = [
     "datasets>=3.6.0",
+    "logfire>=4.6.0",
     "mkdocs>=1.6.1",
     "mkdocs-material>=9.6.14",
     "pre-commit>=4.2.0",

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/app.py

@@ -102,6 +102,15 @@ class HaikuRAGApp:
         except Exception as e:
             self.console.print(f"[red]Error rebuilding database: {e}[/red]")
 
+    async def vacuum(self):
+        """Run database maintenance: optimize and cleanup table history."""
+        try:
+            async with HaikuRAG(db_path=self.db_path, skip_validation=True) as client:
+                await client.vacuum()
+            self.console.print("[b]Vacuum completed successfully.[/b]")
+        except Exception as e:
+            self.console.print(f"[red]Error during vacuum: {e}[/red]")
+
     def show_settings(self):
         """Display current configuration settings."""
         self.console.print("[bold]haiku.rag configuration[/bold]")

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/cli.py

@@ -256,6 +256,18 @@ def rebuild(
     asyncio.run(app.rebuild())
 
 
+@cli.command("vacuum", help="Optimize and clean up all tables to reduce disk usage")
+def vacuum(
+    db: Path = typer.Option(
+        Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
+        "--db",
+        help="Path to the LanceDB database file",
+    ),
+):
+    app = HaikuRAGApp(db_path=db)
+    asyncio.run(app.vacuum())
+
+
 @cli.command(
     "serve", help="Start the haiku.rag MCP server (by default in streamable HTTP mode)"
 )

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/client.py

@@ -550,6 +550,16 @@ class HaikuRAG:
             )
             yield doc.id
 
+        # Final maintenance: centralized vacuum to curb disk usage
+        try:
+            self.store.vacuum()
+        except Exception:
+            pass
+
+    async def vacuum(self) -> None:
+        """Optimize and clean up old versions across all tables."""
+        self.store.vacuum()
+
     def close(self):
         """Close the underlying store connection."""
         self.store.close()

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/config.py

@@ -32,6 +32,10 @@ class AppConfig(BaseModel):
     CHUNK_SIZE: int = 256
     CONTEXT_CHUNK_RADIUS: int = 0
 
+    # Optional dotted path or file path to a callable that preprocesses
+    # markdown content before chunking. Examples:
+    MARKDOWN_PREPROCESSOR: str = ""
+
     OLLAMA_BASE_URL: str = "http://localhost:11434"
     VLLM_EMBEDDINGS_BASE_URL: str = ""
     VLLM_RERANK_BASE_URL: str = ""

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/logging.py

@@ -1,4 +1,5 @@
 import logging
+import warnings
 
 from rich.console import Console
 from rich.logging import RichHandler
@@ -50,4 +51,6 @@ def configure_cli_logging(level: int = logging.INFO) -> logging.Logger:
     logger = get_logger()
     logger.setLevel(level)
     logger.propagate = False
+
+    warnings.filterwarnings("ignore")
     return logger

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/migration.py

@@ -47,7 +47,7 @@ class SQLiteToLanceDBMigrator:
 
         # Load the sqlite-vec extension
         try:
-            import sqlite_vec
+            import sqlite_vec  # type: ignore
 
             sqlite_conn.enable_load_extension(True)
             sqlite_vec.load(sqlite_conn)
@@ -91,10 +91,10 @@
 
         sqlite_conn.close()
 
-        # Optimize the chunks table after migration
+        # Optimize and cleanup using centralized vacuum
         self.console.print("[blue]Optimizing LanceDB...[/blue]")
         try:
-            lance_store.chunks_table.optimize()
+            lance_store.vacuum()
             self.console.print("[green]✅ Optimization completed[/green]")
         except Exception as e:
             self.console.print(

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/engine.py

@@ -1,5 +1,6 @@
 import json
 import logging
+from datetime import timedelta
 from importlib import metadata
 from pathlib import Path
 from uuid import uuid4
@@ -62,6 +63,15 @@ class Store:
         if not skip_validation:
             self._validate_configuration()
 
+    def vacuum(self) -> None:
+        """Optimize and clean up old versions across all tables to reduce disk usage."""
+        if self._has_cloud_config() and str(Config.LANCEDB_URI).startswith("db://"):
+            return
+
+        # Perform maintenance per table using optimize() with cleanup_older_than 0
+        for table in [self.documents_table, self.chunks_table, self.settings_table]:
+            table.optimize(cleanup_older_than=timedelta(0))
+
     def _connect_to_lancedb(self, db_path: Path):
         """Establish connection to LanceDB (local, cloud, or object storage)."""
         # Check if we have cloud configuration
@@ -159,16 +169,18 @@ class Store:
             self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
         )
         if settings_records:
-            settings = (
+            # Only write if version actually changes to avoid creating new table versions
+            current = (
                 json.loads(settings_records[0].settings)
                 if settings_records[0].settings
                 else {}
             )
-            settings["version"] = version
-            # Update the record
-            self.settings_table.update(
-                where="id = 'settings'", values={"settings": json.dumps(settings)}
-            )
+            if current.get("version") != version:
+                current["version"] = version
+                self.settings_table.update(
+                    where="id = 'settings'",
+                    values={"settings": json.dumps(current)},
+                )
         else:
             # Create new settings record
             settings_data = Config.model_dump(mode="json")
@@ -197,6 +209,21 @@ class Store:
         # LanceDB connections are automatically managed
         pass
 
+    def current_table_versions(self) -> dict[str, int]:
+        """Capture current versions of key tables for rollback using LanceDB's API."""
+        return {
+            "documents": int(self.documents_table.version),
+            "chunks": int(self.chunks_table.version),
+            "settings": int(self.settings_table.version),
+        }
+
+    def restore_table_versions(self, versions: dict[str, int]) -> bool:
+        """Restore tables to the provided versions using LanceDB's API."""
+        self.documents_table.restore(int(versions["documents"]))
+        self.chunks_table.restore(int(versions["chunks"]))
+        self.settings_table.restore(int(versions["settings"]))
+        return True
+
     @property
     def _connection(self):
         """Compatibility property for repositories expecting _connection."""

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/chunk.py

@@ -1,4 +1,5 @@
 import asyncio
+import inspect
 import json
 import logging
 from uuid import uuid4
@@ -11,6 +12,7 @@ from haiku.rag.config import Config
 from haiku.rag.embeddings import get_embedder
 from haiku.rag.store.engine import DocumentRecord, Store
 from haiku.rag.store.models.chunk import Chunk
+from haiku.rag.utils import load_callable, text_to_docling_document
 
 logger = logging.getLogger(__name__)
 
@@ -152,7 +154,28 @@ class ChunkRepository:
         self, document_id: str, document: DoclingDocument
     ) -> list[Chunk]:
         """Create chunks and embeddings for a document from DoclingDocument."""
-        chunk_texts = await chunker.chunk(document)
+        # Optionally preprocess markdown before chunking
+        processed_document = document
+        preprocessor_path = Config.MARKDOWN_PREPROCESSOR
+        if preprocessor_path:
+            try:
+                pre_fn = load_callable(preprocessor_path)
+                markdown = document.export_to_markdown()
+                result = pre_fn(markdown)
+                if inspect.isawaitable(result):
+                    result = await result  # type: ignore[assignment]
+                processed_markdown = result
+                if not isinstance(processed_markdown, str):
+                    raise ValueError("Preprocessor must return a markdown string")
+                processed_document = text_to_docling_document(
+                    processed_markdown, name="content.md"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
+                )
+
+        chunk_texts = await chunker.chunk(processed_document)
 
         embeddings = await self.embedder.embed(chunk_texts)
 

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/document.py

@@ -171,44 +171,64 @@ class DocumentRepository:
         chunks: list["Chunk"] | None = None,
     ) -> Document:
         """Create a document with its chunks and embeddings."""
+        # Snapshot table versions for versioned rollback (if supported)
+        versions = self.store.current_table_versions()
+
         # Create the document
         created_doc = await self.create(entity)
 
-        # Create chunks if not provided
-        if chunks is None:
-            assert created_doc.id is not None, (
-                "Document ID should not be None after creation"
-            )
-            await self.chunk_repository.create_chunks_for_document(
-                created_doc.id, docling_document
-            )
-        else:
-            # Use provided chunks, set order from list position
-            assert created_doc.id is not None, (
-                "Document ID should not be None after creation"
-            )
-            for order, chunk in enumerate(chunks):
-                chunk.document_id = created_doc.id
-                chunk.metadata["order"] = order
-                await self.chunk_repository.create(chunk)
-
-        return created_doc
+        # Attempt to create chunks; on failure, prefer version rollback
+        try:
+            # Create chunks if not provided
+            if chunks is None:
+                assert created_doc.id is not None, (
+                    "Document ID should not be None after creation"
+                )
+                await self.chunk_repository.create_chunks_for_document(
+                    created_doc.id, docling_document
+                )
+            else:
+                # Use provided chunks, set order from list position
+                assert created_doc.id is not None, (
+                    "Document ID should not be None after creation"
+                )
+                for order, chunk in enumerate(chunks):
+                    chunk.document_id = created_doc.id
+                    chunk.metadata["order"] = order
+                    await self.chunk_repository.create(chunk)
+
+            return created_doc
+        except Exception:
+            # Roll back to the captured versions and re-raise
+            self.store.restore_table_versions(versions)
+            raise
 
     async def _update_with_docling(
         self, entity: Document, docling_document: DoclingDocument
     ) -> Document:
         """Update a document and regenerate its chunks."""
-        # Delete existing chunks
         assert entity.id is not None, "Document ID is required for update"
+
+        # Snapshot table versions for versioned rollback
+        versions = self.store.current_table_versions()
+
+        # Delete existing chunks before writing new ones
         await self.chunk_repository.delete_by_document_id(entity.id)
 
-        # Update the document
-        updated_doc = await self.update(entity)
+        try:
+            # Update the document
+            updated_doc = await self.update(entity)
 
-        # Create new chunks
-        assert updated_doc.id is not None, "Document ID should not be None after update"
-        await self.chunk_repository.create_chunks_for_document(
-            updated_doc.id, docling_document
-        )
+            # Create new chunks
+            assert updated_doc.id is not None, (
+                "Document ID should not be None after update"
+            )
+            await self.chunk_repository.create_chunks_for_document(
+                updated_doc.id, docling_document
+            )
 
-        return updated_doc
+            return updated_doc
+        except Exception:
+            # Roll back to the captured versions and re-raise
+            self.store.restore_table_versions(versions)
+            raise

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/settings.py

@@ -84,10 +84,15 @@ class SettingsRepository:
         )
 
         if existing:
-            # Update existing settings
-            self.store.settings_table.update(
-                where="id = 'settings'", values={"settings": json.dumps(current_config)}
+            # Only update when configuration actually changed to avoid needless new versions
+            existing_payload = (
+                json.loads(existing[0].settings) if existing[0].settings else {}
             )
+            if existing_payload != current_config:
+                self.store.settings_table.update(
+                    where="id = 'settings'",
+                    values={"settings": json.dumps(current_config)},
+                )
         else:
             # Create new settings
             settings_record = SettingsRecord(

{haiku_rag-0.7.7 → haiku_rag-0.8.1}/src/haiku/rag/utils.py

@@ -1,10 +1,13 @@
 import asyncio
+import importlib
+import importlib.util
 import sys
 from collections.abc import Callable
 from functools import wraps
 from importlib import metadata
 from io import BytesIO
 from pathlib import Path
+from types import ModuleType
 
 import httpx
 from docling.document_converter import DocumentConverter
@@ -106,3 +109,54 @@ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocu
     converter = DocumentConverter()
     result = converter.convert(doc_stream)
     return result.document
+
+
+def load_callable(path: str):
+    """Load a callable from a dotted path or file path.
+
+    Supported formats:
+    - "package.module:func" or "package.module.func"
+    - "path/to/file.py:func"
+
+    Returns the loaded callable. Raises ValueError on failure.
+    """
+    if not path:
+        raise ValueError("Empty callable path provided")
+
+    module_part = None
+    func_name = None
+
+    if ":" in path:
+        module_part, func_name = path.split(":", 1)
+    else:
+        # split by last dot for module.attr
+        if "." in path:
+            module_part, func_name = path.rsplit(".", 1)
+        else:
+            raise ValueError(
+                "Invalid callable path format. Use 'module:func' or 'module.func' or 'file.py:func'."
+            )
+
+    # Try file path first
+    mod: ModuleType | None = None
+    module_path = Path(module_part)
+    if module_path.suffix == ".py" and module_path.exists():
+        spec = importlib.util.spec_from_file_location(module_path.stem, module_path)
+        if spec and spec.loader:
+            mod = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(mod)
+    else:
+        # Import as a module path
+        try:
+            mod = importlib.import_module(module_part)
+        except Exception as e:
+            raise ValueError(f"Failed to import module '{module_part}': {e}")
+
+    if not hasattr(mod, func_name):
+        raise ValueError(f"Callable '{func_name}' not found in module '{module_part}'")
+    func = getattr(mod, func_name)
+    if not callable(func):
+        raise ValueError(
+            f"Attribute '{func_name}' in module '{module_part}' is not callable"
+        )
+    return func
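
For reference, the three path formats that `load_callable` accepts resolve as in the sketch below (`my_pkg` and the script path are hypothetical):

```python
from haiku.rag.utils import load_callable

# "module:attr" and "module.attr" forms resolve through importlib.import_module
fn = load_callable("my_pkg.preprocess:clean_md")
fn = load_callable("my_pkg.preprocess.clean_md")

# a path ending in .py is loaded via importlib.util.spec_from_file_location
fn = load_callable("./scripts/preprocess.py:clean_md")

print(fn("<!-- draft -->\n# Title"))
```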