haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag-slim might be problematic. Click here for more details.
- haiku/rag/app.py +430 -72
- haiku/rag/chunkers/__init__.py +31 -0
- haiku/rag/chunkers/base.py +31 -0
- haiku/rag/chunkers/docling_local.py +164 -0
- haiku/rag/chunkers/docling_serve.py +179 -0
- haiku/rag/cli.py +207 -24
- haiku/rag/cli_chat.py +489 -0
- haiku/rag/client.py +1251 -266
- haiku/rag/config/__init__.py +16 -10
- haiku/rag/config/loader.py +5 -44
- haiku/rag/config/models.py +126 -17
- haiku/rag/converters/__init__.py +31 -0
- haiku/rag/converters/base.py +63 -0
- haiku/rag/converters/docling_local.py +193 -0
- haiku/rag/converters/docling_serve.py +229 -0
- haiku/rag/converters/text_utils.py +237 -0
- haiku/rag/embeddings/__init__.py +123 -24
- haiku/rag/embeddings/voyageai.py +175 -20
- haiku/rag/graph/__init__.py +0 -11
- haiku/rag/graph/agui/__init__.py +8 -2
- haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku/rag/graph/agui/emitter.py +219 -31
- haiku/rag/graph/agui/server.py +20 -62
- haiku/rag/graph/agui/stream.py +1 -2
- haiku/rag/graph/research/__init__.py +5 -2
- haiku/rag/graph/research/dependencies.py +12 -126
- haiku/rag/graph/research/graph.py +390 -135
- haiku/rag/graph/research/models.py +91 -112
- haiku/rag/graph/research/prompts.py +99 -91
- haiku/rag/graph/research/state.py +35 -27
- haiku/rag/inspector/__init__.py +8 -0
- haiku/rag/inspector/app.py +259 -0
- haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku/rag/inspector/widgets/visual_modal.py +126 -0
- haiku/rag/mcp.py +106 -102
- haiku/rag/monitor.py +33 -9
- haiku/rag/providers/__init__.py +5 -0
- haiku/rag/providers/docling_serve.py +108 -0
- haiku/rag/qa/__init__.py +12 -10
- haiku/rag/qa/agent.py +43 -61
- haiku/rag/qa/prompts.py +35 -57
- haiku/rag/reranking/__init__.py +9 -6
- haiku/rag/reranking/base.py +1 -1
- haiku/rag/reranking/cohere.py +5 -4
- haiku/rag/reranking/mxbai.py +5 -2
- haiku/rag/reranking/vllm.py +3 -4
- haiku/rag/reranking/zeroentropy.py +6 -5
- haiku/rag/store/__init__.py +2 -1
- haiku/rag/store/engine.py +242 -42
- haiku/rag/store/exceptions.py +4 -0
- haiku/rag/store/models/__init__.py +8 -2
- haiku/rag/store/models/chunk.py +190 -0
- haiku/rag/store/models/document.py +46 -0
- haiku/rag/store/repositories/chunk.py +141 -121
- haiku/rag/store/repositories/document.py +25 -84
- haiku/rag/store/repositories/settings.py +11 -14
- haiku/rag/store/upgrades/__init__.py +19 -3
- haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku/rag/store/upgrades/v0_20_0.py +68 -0
- haiku/rag/store/upgrades/v0_23_1.py +100 -0
- haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku/rag/utils.py +371 -146
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
- haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
- haiku/rag/chunker.py +0 -65
- haiku/rag/embeddings/base.py +0 -25
- haiku/rag/embeddings/ollama.py +0 -28
- haiku/rag/embeddings/openai.py +0 -26
- haiku/rag/embeddings/vllm.py +0 -29
- haiku/rag/graph/agui/events.py +0 -254
- haiku/rag/graph/common/__init__.py +0 -5
- haiku/rag/graph/common/models.py +0 -42
- haiku/rag/graph/common/nodes.py +0 -265
- haiku/rag/graph/common/prompts.py +0 -46
- haiku/rag/graph/common/utils.py +0 -44
- haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku/rag/graph/deep_qa/models.py +0 -20
- haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku/rag/graph/deep_qa/state.py +0 -56
- haiku/rag/graph/research/common.py +0 -87
- haiku/rag/reader.py +0 -135
- haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,16 +1,14 @@
|
|
|
1
|
-
import asyncio
|
|
2
1
|
import json
|
|
3
2
|
from datetime import datetime
|
|
4
|
-
from typing import TYPE_CHECKING
|
|
5
3
|
from uuid import uuid4
|
|
6
4
|
|
|
7
5
|
from haiku.rag.store.engine import DocumentRecord, Store
|
|
8
6
|
from haiku.rag.store.models.document import Document
|
|
9
7
|
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from docling_core.types.doc.document import DoclingDocument
|
|
12
8
|
|
|
13
|
-
|
|
9
|
+
def _escape_sql_string(value: str) -> str:
|
|
10
|
+
"""Escape single quotes in SQL string literals."""
|
|
11
|
+
return value.replace("'", "''")
|
|
14
12
|
|
|
15
13
|
|
|
16
14
|
class DocumentRepository:
|
|
@@ -37,6 +35,8 @@ class DocumentRepository:
|
|
|
37
35
|
uri=record.uri,
|
|
38
36
|
title=record.title,
|
|
39
37
|
metadata=json.loads(record.metadata),
|
|
38
|
+
docling_document_json=record.docling_document_json,
|
|
39
|
+
docling_version=record.docling_version,
|
|
40
40
|
created_at=datetime.fromisoformat(record.created_at)
|
|
41
41
|
if record.created_at
|
|
42
42
|
else datetime.now(),
|
|
@@ -47,6 +47,7 @@ class DocumentRepository:
|
|
|
47
47
|
|
|
48
48
|
async def create(self, entity: Document) -> Document:
|
|
49
49
|
"""Create a document in the database."""
|
|
50
|
+
self.store._assert_writable()
|
|
50
51
|
# Generate new UUID
|
|
51
52
|
doc_id = str(uuid4())
|
|
52
53
|
|
|
@@ -60,6 +61,8 @@ class DocumentRepository:
|
|
|
60
61
|
uri=entity.uri,
|
|
61
62
|
title=entity.title,
|
|
62
63
|
metadata=json.dumps(entity.metadata),
|
|
64
|
+
docling_document_json=entity.docling_document_json,
|
|
65
|
+
docling_version=entity.docling_version,
|
|
63
66
|
created_at=now,
|
|
64
67
|
updated_at=now,
|
|
65
68
|
)
|
|
@@ -88,8 +91,14 @@ class DocumentRepository:
|
|
|
88
91
|
|
|
89
92
|
async def update(self, entity: Document) -> Document:
|
|
90
93
|
"""Update an existing document."""
|
|
94
|
+
self.store._assert_writable()
|
|
95
|
+
from haiku.rag.store.models.document import invalidate_docling_document_cache
|
|
96
|
+
|
|
91
97
|
assert entity.id, "Document ID is required for update"
|
|
92
98
|
|
|
99
|
+
# Invalidate cache before update
|
|
100
|
+
invalidate_docling_document_cache(entity.id)
|
|
101
|
+
|
|
93
102
|
# Update timestamp
|
|
94
103
|
now = datetime.now().isoformat()
|
|
95
104
|
entity.updated_at = datetime.fromisoformat(now)
|
|
@@ -102,6 +111,8 @@ class DocumentRepository:
|
|
|
102
111
|
"uri": entity.uri,
|
|
103
112
|
"title": entity.title,
|
|
104
113
|
"metadata": json.dumps(entity.metadata),
|
|
114
|
+
"docling_document_json": entity.docling_document_json,
|
|
115
|
+
"docling_version": entity.docling_version,
|
|
105
116
|
"updated_at": now,
|
|
106
117
|
},
|
|
107
118
|
)
|
|
@@ -110,11 +121,17 @@ class DocumentRepository:
|
|
|
110
121
|
|
|
111
122
|
async def delete(self, entity_id: str) -> bool:
|
|
112
123
|
"""Delete a document by its ID."""
|
|
124
|
+
self.store._assert_writable()
|
|
125
|
+
from haiku.rag.store.models.document import invalidate_docling_document_cache
|
|
126
|
+
|
|
113
127
|
# Check if document exists
|
|
114
128
|
doc = await self.get_by_id(entity_id)
|
|
115
129
|
if doc is None:
|
|
116
130
|
return False
|
|
117
131
|
|
|
132
|
+
# Invalidate cache before delete
|
|
133
|
+
invalidate_docling_document_cache(entity_id)
|
|
134
|
+
|
|
118
135
|
# Delete associated chunks first
|
|
119
136
|
await self.chunk_repository.delete_by_document_id(entity_id)
|
|
120
137
|
|
|
@@ -152,9 +169,10 @@ class DocumentRepository:
|
|
|
152
169
|
|
|
153
170
|
async def get_by_uri(self, uri: str) -> Document | None:
|
|
154
171
|
"""Get a document by its URI."""
|
|
172
|
+
escaped_uri = _escape_sql_string(uri)
|
|
155
173
|
results = list(
|
|
156
174
|
self.store.documents_table.search()
|
|
157
|
-
.where(f"uri = '{uri}'")
|
|
175
|
+
.where(f"uri = '{escaped_uri}'")
|
|
158
176
|
.limit(1)
|
|
159
177
|
.to_pydantic(DocumentRecord)
|
|
160
178
|
)
|
|
@@ -166,6 +184,7 @@ class DocumentRepository:
|
|
|
166
184
|
|
|
167
185
|
async def delete_all(self) -> None:
|
|
168
186
|
"""Delete all documents from the database."""
|
|
187
|
+
self.store._assert_writable()
|
|
169
188
|
# Delete all chunks first
|
|
170
189
|
await self.chunk_repository.delete_all()
|
|
171
190
|
|
|
@@ -181,81 +200,3 @@ class DocumentRepository:
|
|
|
181
200
|
self.store.documents_table = self.store.db.create_table(
|
|
182
201
|
"documents", schema=DocumentRecord
|
|
183
202
|
)
|
|
184
|
-
|
|
185
|
-
async def _create_and_chunk(
|
|
186
|
-
self,
|
|
187
|
-
entity: Document,
|
|
188
|
-
docling_document: "DoclingDocument | None",
|
|
189
|
-
chunks: list["Chunk"] | None = None,
|
|
190
|
-
) -> Document:
|
|
191
|
-
"""Create a document with its chunks and embeddings."""
|
|
192
|
-
# Snapshot table versions for versioned rollback (if supported)
|
|
193
|
-
versions = self.store.current_table_versions()
|
|
194
|
-
|
|
195
|
-
# Create the document
|
|
196
|
-
created_doc = await self.create(entity)
|
|
197
|
-
|
|
198
|
-
# Attempt to create chunks; on failure, prefer version rollback
|
|
199
|
-
try:
|
|
200
|
-
# Create chunks if not provided
|
|
201
|
-
if chunks is None:
|
|
202
|
-
assert docling_document is not None, (
|
|
203
|
-
"docling_document is required when chunks are not provided"
|
|
204
|
-
)
|
|
205
|
-
assert created_doc.id is not None, (
|
|
206
|
-
"Document ID should not be None after creation"
|
|
207
|
-
)
|
|
208
|
-
await self.chunk_repository.create_chunks_for_document(
|
|
209
|
-
created_doc.id, docling_document
|
|
210
|
-
)
|
|
211
|
-
else:
|
|
212
|
-
# Use provided chunks, set order from list position
|
|
213
|
-
assert created_doc.id is not None, (
|
|
214
|
-
"Document ID should not be None after creation"
|
|
215
|
-
)
|
|
216
|
-
for order, chunk in enumerate(chunks):
|
|
217
|
-
chunk.document_id = created_doc.id
|
|
218
|
-
chunk.order = order
|
|
219
|
-
await self.chunk_repository.create(chunk)
|
|
220
|
-
|
|
221
|
-
# Vacuum old versions in background (non-blocking)
|
|
222
|
-
asyncio.create_task(self.store.vacuum())
|
|
223
|
-
|
|
224
|
-
return created_doc
|
|
225
|
-
except Exception:
|
|
226
|
-
# Roll back to the captured versions and re-raise
|
|
227
|
-
self.store.restore_table_versions(versions)
|
|
228
|
-
raise
|
|
229
|
-
|
|
230
|
-
async def _update_and_rechunk(
|
|
231
|
-
self, entity: Document, docling_document: "DoclingDocument"
|
|
232
|
-
) -> Document:
|
|
233
|
-
"""Update a document and regenerate its chunks."""
|
|
234
|
-
assert entity.id is not None, "Document ID is required for update"
|
|
235
|
-
|
|
236
|
-
# Snapshot table versions for versioned rollback
|
|
237
|
-
versions = self.store.current_table_versions()
|
|
238
|
-
|
|
239
|
-
# Delete existing chunks before writing new ones
|
|
240
|
-
await self.chunk_repository.delete_by_document_id(entity.id)
|
|
241
|
-
|
|
242
|
-
try:
|
|
243
|
-
# Update the document
|
|
244
|
-
updated_doc = await self.update(entity)
|
|
245
|
-
|
|
246
|
-
# Create new chunks
|
|
247
|
-
assert updated_doc.id is not None, (
|
|
248
|
-
"Document ID should not be None after update"
|
|
249
|
-
)
|
|
250
|
-
await self.chunk_repository.create_chunks_for_document(
|
|
251
|
-
updated_doc.id, docling_document
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
# Vacuum old versions in background (non-blocking)
|
|
255
|
-
asyncio.create_task(self.store.vacuum())
|
|
256
|
-
|
|
257
|
-
return updated_doc
|
|
258
|
-
except Exception:
|
|
259
|
-
# Roll back to the captured versions and re-raise
|
|
260
|
-
self.store.restore_table_versions(versions)
|
|
261
|
-
raise
|
|
@@ -72,6 +72,7 @@ class SettingsRepository:
|
|
|
72
72
|
|
|
73
73
|
def save_current_settings(self) -> None:
|
|
74
74
|
"""Save the current configuration to the database."""
|
|
75
|
+
self.store._assert_writable()
|
|
75
76
|
current_config = self.store._config.model_dump(mode="json")
|
|
76
77
|
|
|
77
78
|
# Check if settings exist
|
|
@@ -118,25 +119,21 @@ class SettingsRepository:
|
|
|
118
119
|
current_config = self.store._config.model_dump(mode="json")
|
|
119
120
|
|
|
120
121
|
# Check if embedding provider or model has changed
|
|
121
|
-
#
|
|
122
|
+
# Both stored and current use nested structure: embeddings.model.{provider,name,vector_dim}
|
|
122
123
|
stored_embeddings = stored_settings.get("embeddings", {})
|
|
123
124
|
current_embeddings = current_config.get("embeddings", {})
|
|
124
125
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
"EMBEDDINGS_PROVIDER"
|
|
128
|
-
)
|
|
129
|
-
current_provider = current_embeddings.get("provider")
|
|
126
|
+
stored_model_obj = stored_embeddings.get("model", {})
|
|
127
|
+
current_model_obj = current_embeddings.get("model", {})
|
|
130
128
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
)
|
|
134
|
-
current_model = current_embeddings.get("model")
|
|
129
|
+
stored_provider = stored_model_obj.get("provider")
|
|
130
|
+
current_provider = current_model_obj.get("provider")
|
|
135
131
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
132
|
+
stored_model = stored_model_obj.get("name")
|
|
133
|
+
current_model = current_model_obj.get("name")
|
|
134
|
+
|
|
135
|
+
stored_vector_dim = stored_model_obj.get("vector_dim")
|
|
136
|
+
current_vector_dim = current_model_obj.get("vector_dim")
|
|
140
137
|
|
|
141
138
|
# Check for incompatible changes
|
|
142
139
|
incompatible_changes = []
|
|
@@ -53,10 +53,26 @@ def run_pending_upgrades(store: Store, from_version: str, to_version: str) -> No
|
|
|
53
53
|
logger.info("Completed upgrade %s", step.version)
|
|
54
54
|
|
|
55
55
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
from .
|
|
56
|
+
# Import upgrade modules AFTER Upgrade class is defined to avoid circular imports
|
|
57
|
+
# ruff: noqa: E402, I001
|
|
58
|
+
from haiku.rag.store.upgrades.v0_9_3 import upgrade_fts_phrase as upgrade_0_9_3_fts
|
|
59
|
+
from haiku.rag.store.upgrades.v0_9_3 import upgrade_order as upgrade_0_9_3_order
|
|
60
|
+
from haiku.rag.store.upgrades.v0_10_1 import (
|
|
61
|
+
upgrade_add_title as upgrade_0_10_1_add_title,
|
|
62
|
+
)
|
|
63
|
+
from haiku.rag.store.upgrades.v0_19_6 import (
|
|
64
|
+
upgrade_embeddings_model_config as upgrade_0_19_6_embeddings,
|
|
65
|
+
)
|
|
66
|
+
from haiku.rag.store.upgrades.v0_20_0 import (
|
|
67
|
+
upgrade_add_docling_document as upgrade_0_20_0_docling,
|
|
68
|
+
)
|
|
69
|
+
from haiku.rag.store.upgrades.v0_23_1 import (
|
|
70
|
+
upgrade_contextualize_chunks as upgrade_0_23_1_contextualize,
|
|
71
|
+
)
|
|
59
72
|
|
|
60
73
|
upgrades.append(upgrade_0_9_3_order)
|
|
61
74
|
upgrades.append(upgrade_0_9_3_fts)
|
|
62
75
|
upgrades.append(upgrade_0_10_1_add_title)
|
|
76
|
+
upgrades.append(upgrade_0_19_6_embeddings)
|
|
77
|
+
upgrades.append(upgrade_0_20_0_docling)
|
|
78
|
+
upgrades.append(upgrade_0_23_1_contextualize)
|
|
@@ -7,7 +7,7 @@ from haiku.rag.store.engine import Store
|
|
|
7
7
|
from haiku.rag.store.upgrades import Upgrade
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
def _apply_add_document_title(store: Store) -> None:
|
|
10
|
+
def _apply_add_document_title(store: Store) -> None: # pragma: no cover
|
|
11
11
|
"""Add a nullable 'title' column to the documents table."""
|
|
12
12
|
|
|
13
13
|
# Read existing rows using Arrow for schema-agnostic access
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from haiku.rag.store.engine import SettingsRecord, Store
|
|
5
|
+
from haiku.rag.store.upgrades import Upgrade
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _apply_embeddings_model_config(store: Store) -> None: # pragma: no cover
|
|
11
|
+
"""Migrate embeddings config from flat to nested EmbeddingModelConfig structure."""
|
|
12
|
+
results = list(
|
|
13
|
+
store.settings_table.search()
|
|
14
|
+
.where("id = 'settings'")
|
|
15
|
+
.limit(1)
|
|
16
|
+
.to_pydantic(SettingsRecord)
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
if not results or not results[0].settings:
|
|
20
|
+
return
|
|
21
|
+
|
|
22
|
+
settings = json.loads(results[0].settings)
|
|
23
|
+
embeddings = settings.get("embeddings", {})
|
|
24
|
+
|
|
25
|
+
# Check if already migrated (model is a dict with nested structure)
|
|
26
|
+
if isinstance(embeddings.get("model"), dict):
|
|
27
|
+
return
|
|
28
|
+
|
|
29
|
+
# Migrate from flat structure to nested EmbeddingModelConfig
|
|
30
|
+
old_provider = embeddings.get("provider", "ollama")
|
|
31
|
+
old_model = embeddings.get("model", "qwen3-embedding:4b")
|
|
32
|
+
old_vector_dim = embeddings.get("vector_dim", 2560)
|
|
33
|
+
|
|
34
|
+
logger.info(
|
|
35
|
+
"Migrating embeddings config to new nested structure: "
|
|
36
|
+
"embeddings.{provider,model,vector_dim} -> embeddings.model.{provider,name,vector_dim}"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Create new nested structure
|
|
40
|
+
settings["embeddings"] = {
|
|
41
|
+
"model": {
|
|
42
|
+
"provider": old_provider,
|
|
43
|
+
"name": old_model,
|
|
44
|
+
"vector_dim": old_vector_dim,
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
store.settings_table.update(
|
|
49
|
+
where="id = 'settings'",
|
|
50
|
+
values={"settings": json.dumps(settings)},
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
logger.info(
|
|
54
|
+
"Embeddings config migrated: provider=%s, name=%s, vector_dim=%d",
|
|
55
|
+
old_provider,
|
|
56
|
+
old_model,
|
|
57
|
+
old_vector_dim,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
upgrade_embeddings_model_config = Upgrade(
|
|
62
|
+
version="0.19.6",
|
|
63
|
+
apply=_apply_embeddings_model_config,
|
|
64
|
+
description="Migrate embeddings config to nested EmbeddingModelConfig structure",
|
|
65
|
+
)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from lancedb.pydantic import LanceModel
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
|
|
6
|
+
from haiku.rag.store.engine import Store
|
|
7
|
+
from haiku.rag.store.upgrades import Upgrade
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _apply_add_docling_document_columns(store: Store) -> None: # pragma: no cover
|
|
11
|
+
"""Add 'docling_document_json' and 'docling_version' columns to documents table."""
|
|
12
|
+
|
|
13
|
+
# Read existing rows using Arrow for schema-agnostic access
|
|
14
|
+
try:
|
|
15
|
+
docs_arrow = store.documents_table.search().to_arrow()
|
|
16
|
+
rows = docs_arrow.to_pylist()
|
|
17
|
+
except Exception:
|
|
18
|
+
rows = []
|
|
19
|
+
|
|
20
|
+
class DocumentRecordV3(LanceModel):
|
|
21
|
+
id: str
|
|
22
|
+
content: str
|
|
23
|
+
uri: str | None = None
|
|
24
|
+
title: str | None = None
|
|
25
|
+
metadata: str = Field(default="{}")
|
|
26
|
+
docling_document_json: str | None = None
|
|
27
|
+
docling_version: str | None = None
|
|
28
|
+
created_at: str = Field(default_factory=lambda: "")
|
|
29
|
+
updated_at: str = Field(default_factory=lambda: "")
|
|
30
|
+
|
|
31
|
+
# Drop and recreate documents table with the new schema
|
|
32
|
+
try:
|
|
33
|
+
store.db.drop_table("documents")
|
|
34
|
+
except Exception:
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
store.documents_table = store.db.create_table("documents", schema=DocumentRecordV3)
|
|
38
|
+
|
|
39
|
+
# Reinsert previous rows with new columns as None
|
|
40
|
+
if rows:
|
|
41
|
+
backfilled = []
|
|
42
|
+
for row in rows:
|
|
43
|
+
backfilled.append(
|
|
44
|
+
DocumentRecordV3(
|
|
45
|
+
id=row.get("id"),
|
|
46
|
+
content=row.get("content", ""),
|
|
47
|
+
uri=row.get("uri"),
|
|
48
|
+
title=row.get("title"),
|
|
49
|
+
metadata=(
|
|
50
|
+
row.get("metadata")
|
|
51
|
+
if isinstance(row.get("metadata"), str)
|
|
52
|
+
else json.dumps(row.get("metadata") or {})
|
|
53
|
+
),
|
|
54
|
+
docling_document_json=None,
|
|
55
|
+
docling_version=None,
|
|
56
|
+
created_at=row.get("created_at", ""),
|
|
57
|
+
updated_at=row.get("updated_at", ""),
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
store.documents_table.add(backfilled)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
upgrade_add_docling_document = Upgrade(
|
|
65
|
+
version="0.20.0",
|
|
66
|
+
apply=_apply_add_docling_document_columns,
|
|
67
|
+
description="Add 'docling_document_json' and 'docling_version' columns to documents table",
|
|
68
|
+
)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from lancedb.pydantic import LanceModel, Vector
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
|
|
6
|
+
from haiku.rag.store.engine import Store
|
|
7
|
+
from haiku.rag.store.upgrades import Upgrade
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _apply_add_content_fts(store: Store) -> None: # pragma: no cover
|
|
11
|
+
"""Add content_fts column with contextualized content for better FTS."""
|
|
12
|
+
# Read existing chunks
|
|
13
|
+
try:
|
|
14
|
+
chunks_arrow = store.chunks_table.search().to_arrow()
|
|
15
|
+
rows = chunks_arrow.to_pylist()
|
|
16
|
+
except Exception:
|
|
17
|
+
return
|
|
18
|
+
|
|
19
|
+
if not rows:
|
|
20
|
+
return
|
|
21
|
+
|
|
22
|
+
# Infer vector dimensions from first row
|
|
23
|
+
vec = rows[0].get("vector")
|
|
24
|
+
if not isinstance(vec, list) or not vec:
|
|
25
|
+
return
|
|
26
|
+
vector_dim = len(vec)
|
|
27
|
+
|
|
28
|
+
class ChunkRecord(LanceModel):
|
|
29
|
+
id: str
|
|
30
|
+
document_id: str
|
|
31
|
+
content: str
|
|
32
|
+
content_fts: str = Field(default="")
|
|
33
|
+
metadata: str = Field(default="{}")
|
|
34
|
+
order: int = Field(default=0)
|
|
35
|
+
vector: Vector(vector_dim) = Field( # type: ignore
|
|
36
|
+
default_factory=lambda: [0.0] * vector_dim
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Drop and recreate table with new schema
|
|
40
|
+
try:
|
|
41
|
+
store.db.drop_table("chunks")
|
|
42
|
+
except Exception:
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
store.chunks_table = store.db.create_table("chunks", schema=ChunkRecord)
|
|
46
|
+
|
|
47
|
+
# Populate content_fts with contextualized content
|
|
48
|
+
new_records: list[ChunkRecord] = []
|
|
49
|
+
for row in rows:
|
|
50
|
+
metadata_raw = row.get("metadata") or "{}"
|
|
51
|
+
try:
|
|
52
|
+
metadata = (
|
|
53
|
+
json.loads(metadata_raw)
|
|
54
|
+
if isinstance(metadata_raw, str)
|
|
55
|
+
else metadata_raw
|
|
56
|
+
)
|
|
57
|
+
except Exception:
|
|
58
|
+
metadata = {}
|
|
59
|
+
|
|
60
|
+
headings = metadata.get("headings") if isinstance(metadata, dict) else None
|
|
61
|
+
content = row.get("content", "")
|
|
62
|
+
|
|
63
|
+
# Build contextualized content for FTS
|
|
64
|
+
if headings:
|
|
65
|
+
content_fts = "\n".join(headings) + "\n" + content
|
|
66
|
+
else:
|
|
67
|
+
content_fts = content
|
|
68
|
+
|
|
69
|
+
new_records.append(
|
|
70
|
+
ChunkRecord(
|
|
71
|
+
id=row.get("id"),
|
|
72
|
+
document_id=row.get("document_id"),
|
|
73
|
+
content=content,
|
|
74
|
+
content_fts=content_fts,
|
|
75
|
+
metadata=metadata_raw,
|
|
76
|
+
order=row.get("order", 0),
|
|
77
|
+
vector=row.get("vector") or [0.0] * vector_dim,
|
|
78
|
+
)
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
if new_records:
|
|
82
|
+
store.chunks_table.add(new_records)
|
|
83
|
+
|
|
84
|
+
# Drop old FTS index on content column if it exists
|
|
85
|
+
try:
|
|
86
|
+
store.chunks_table.drop_index("content_idx")
|
|
87
|
+
except Exception:
|
|
88
|
+
pass
|
|
89
|
+
|
|
90
|
+
# Create FTS index on content_fts
|
|
91
|
+
store.chunks_table.create_fts_index(
|
|
92
|
+
"content_fts", replace=True, with_position=True, remove_stop_words=False
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
upgrade_contextualize_chunks = Upgrade(
|
|
97
|
+
version="0.23.1",
|
|
98
|
+
apply=_apply_add_content_fts,
|
|
99
|
+
description="Add content_fts column for contextualized FTS search",
|
|
100
|
+
)
|
|
@@ -7,7 +7,7 @@ from haiku.rag.store.engine import Store
|
|
|
7
7
|
from haiku.rag.store.upgrades import Upgrade
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
def _infer_vector_dim(store: Store) -> int:
|
|
10
|
+
def _infer_vector_dim(store: Store) -> int: # pragma: no cover
|
|
11
11
|
"""Infer vector dimension from existing data; fallback to embedder config."""
|
|
12
12
|
try:
|
|
13
13
|
arrow = store.chunks_table.search().limit(1).to_arrow()
|
|
@@ -22,7 +22,7 @@ def _infer_vector_dim(store: Store) -> int:
|
|
|
22
22
|
return getattr(store.embedder, "_vector_dim", 1024)
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def _apply_chunk_order(store: Store) -> None:
|
|
25
|
+
def _apply_chunk_order(store: Store) -> None: # pragma: no cover
|
|
26
26
|
"""Add integer 'order' column to chunks and backfill from metadata."""
|
|
27
27
|
|
|
28
28
|
vector_dim = _infer_vector_dim(store)
|
|
@@ -95,7 +95,7 @@ upgrade_order = Upgrade(
|
|
|
95
95
|
)
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
def _apply_fts_phrase_support(store: Store) -> None:
|
|
98
|
+
def _apply_fts_phrase_support(store: Store) -> None: # pragma: no cover
|
|
99
99
|
"""Recreate FTS index with phrase query support and no stop-word removal."""
|
|
100
100
|
try:
|
|
101
101
|
store.chunks_table.create_fts_index(
|