haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haiku/rag/app.py +430 -72
- haiku/rag/chunkers/__init__.py +31 -0
- haiku/rag/chunkers/base.py +31 -0
- haiku/rag/chunkers/docling_local.py +164 -0
- haiku/rag/chunkers/docling_serve.py +179 -0
- haiku/rag/cli.py +207 -24
- haiku/rag/cli_chat.py +489 -0
- haiku/rag/client.py +1251 -266
- haiku/rag/config/__init__.py +16 -10
- haiku/rag/config/loader.py +5 -44
- haiku/rag/config/models.py +126 -17
- haiku/rag/converters/__init__.py +31 -0
- haiku/rag/converters/base.py +63 -0
- haiku/rag/converters/docling_local.py +193 -0
- haiku/rag/converters/docling_serve.py +229 -0
- haiku/rag/converters/text_utils.py +237 -0
- haiku/rag/embeddings/__init__.py +123 -24
- haiku/rag/embeddings/voyageai.py +175 -20
- haiku/rag/graph/__init__.py +0 -11
- haiku/rag/graph/agui/__init__.py +8 -2
- haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku/rag/graph/agui/emitter.py +219 -31
- haiku/rag/graph/agui/server.py +20 -62
- haiku/rag/graph/agui/stream.py +1 -2
- haiku/rag/graph/research/__init__.py +5 -2
- haiku/rag/graph/research/dependencies.py +12 -126
- haiku/rag/graph/research/graph.py +390 -135
- haiku/rag/graph/research/models.py +91 -112
- haiku/rag/graph/research/prompts.py +99 -91
- haiku/rag/graph/research/state.py +35 -27
- haiku/rag/inspector/__init__.py +8 -0
- haiku/rag/inspector/app.py +259 -0
- haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku/rag/inspector/widgets/visual_modal.py +126 -0
- haiku/rag/mcp.py +106 -102
- haiku/rag/monitor.py +33 -9
- haiku/rag/providers/__init__.py +5 -0
- haiku/rag/providers/docling_serve.py +108 -0
- haiku/rag/qa/__init__.py +12 -10
- haiku/rag/qa/agent.py +43 -61
- haiku/rag/qa/prompts.py +35 -57
- haiku/rag/reranking/__init__.py +9 -6
- haiku/rag/reranking/base.py +1 -1
- haiku/rag/reranking/cohere.py +5 -4
- haiku/rag/reranking/mxbai.py +5 -2
- haiku/rag/reranking/vllm.py +3 -4
- haiku/rag/reranking/zeroentropy.py +6 -5
- haiku/rag/store/__init__.py +2 -1
- haiku/rag/store/engine.py +242 -42
- haiku/rag/store/exceptions.py +4 -0
- haiku/rag/store/models/__init__.py +8 -2
- haiku/rag/store/models/chunk.py +190 -0
- haiku/rag/store/models/document.py +46 -0
- haiku/rag/store/repositories/chunk.py +141 -121
- haiku/rag/store/repositories/document.py +25 -84
- haiku/rag/store/repositories/settings.py +11 -14
- haiku/rag/store/upgrades/__init__.py +19 -3
- haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku/rag/store/upgrades/v0_20_0.py +68 -0
- haiku/rag/store/upgrades/v0_23_1.py +100 -0
- haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku/rag/utils.py +371 -146
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
- haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
- haiku/rag/chunker.py +0 -65
- haiku/rag/embeddings/base.py +0 -25
- haiku/rag/embeddings/ollama.py +0 -28
- haiku/rag/embeddings/openai.py +0 -26
- haiku/rag/embeddings/vllm.py +0 -29
- haiku/rag/graph/agui/events.py +0 -254
- haiku/rag/graph/common/__init__.py +0 -5
- haiku/rag/graph/common/models.py +0 -42
- haiku/rag/graph/common/nodes.py +0 -265
- haiku/rag/graph/common/prompts.py +0 -46
- haiku/rag/graph/common/utils.py +0 -44
- haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku/rag/graph/deep_qa/models.py +0 -20
- haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku/rag/graph/deep_qa/state.py +0 -56
- haiku/rag/graph/research/common.py +0 -87
- haiku/rag/reader.py +0 -135
- haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/client.py
CHANGED
|
@@ -1,25 +1,56 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import hashlib
|
|
3
|
+
import json
|
|
2
4
|
import logging
|
|
3
5
|
import mimetypes
|
|
4
6
|
import tempfile
|
|
5
7
|
from collections.abc import AsyncGenerator
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from enum import Enum
|
|
6
11
|
from pathlib import Path
|
|
12
|
+
from typing import TYPE_CHECKING, overload
|
|
7
13
|
from urllib.parse import urlparse
|
|
8
14
|
|
|
9
15
|
import httpx
|
|
10
16
|
|
|
11
17
|
from haiku.rag.config import AppConfig, Config
|
|
18
|
+
from haiku.rag.converters import get_converter
|
|
12
19
|
from haiku.rag.reranking import get_reranker
|
|
13
20
|
from haiku.rag.store.engine import Store
|
|
14
|
-
from haiku.rag.store.models.chunk import Chunk
|
|
21
|
+
from haiku.rag.store.models.chunk import Chunk, SearchResult
|
|
15
22
|
from haiku.rag.store.models.document import Document
|
|
16
23
|
from haiku.rag.store.repositories.chunk import ChunkRepository
|
|
17
24
|
from haiku.rag.store.repositories.document import DocumentRepository
|
|
18
25
|
from haiku.rag.store.repositories.settings import SettingsRepository
|
|
19
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
29
|
+
|
|
30
|
+
from haiku.rag.graph.research.models import Citation
|
|
31
|
+
|
|
20
32
|
logger = logging.getLogger(__name__)
|
|
21
33
|
|
|
22
34
|
|
|
35
|
+
class RebuildMode(Enum):
|
|
36
|
+
"""Mode for rebuilding the database."""
|
|
37
|
+
|
|
38
|
+
FULL = "full" # Re-convert from source, re-chunk, re-embed
|
|
39
|
+
RECHUNK = "rechunk" # Re-chunk from existing content, re-embed
|
|
40
|
+
EMBED_ONLY = "embed_only" # Keep chunks, only regenerate embeddings
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
|
|
44
|
+
class DownloadProgress:
|
|
45
|
+
"""Progress event for model downloads."""
|
|
46
|
+
|
|
47
|
+
model: str
|
|
48
|
+
status: str
|
|
49
|
+
completed: int = 0
|
|
50
|
+
total: int = 0
|
|
51
|
+
digest: str = ""
|
|
52
|
+
|
|
53
|
+
|
|
23
54
|
class HaikuRAG:
|
|
24
55
|
"""High-level haiku-rag client."""
|
|
25
56
|
|
|
@@ -28,7 +59,9 @@ class HaikuRAG:
|
|
|
28
59
|
db_path: Path | None = None,
|
|
29
60
|
config: AppConfig = Config,
|
|
30
61
|
skip_validation: bool = False,
|
|
31
|
-
|
|
62
|
+
create: bool = False,
|
|
63
|
+
read_only: bool = False,
|
|
64
|
+
before: datetime | None = None,
|
|
32
65
|
):
|
|
33
66
|
"""Initialize the RAG client with a database path.
|
|
34
67
|
|
|
@@ -36,21 +69,31 @@ class HaikuRAG:
|
|
|
36
69
|
db_path: Path to the database file. If None, uses config.storage.data_dir.
|
|
37
70
|
config: Configuration to use. Defaults to global Config.
|
|
38
71
|
skip_validation: Whether to skip configuration validation on database load.
|
|
39
|
-
|
|
40
|
-
|
|
72
|
+
create: Whether to create the database if it doesn't exist.
|
|
73
|
+
read_only: Whether to open the database in read-only mode.
|
|
74
|
+
before: Query the database as it existed at this datetime.
|
|
75
|
+
Implies read_only=True.
|
|
41
76
|
"""
|
|
42
77
|
self._config = config
|
|
43
78
|
if db_path is None:
|
|
44
79
|
db_path = self._config.storage.data_dir / "haiku.rag.lancedb"
|
|
80
|
+
|
|
45
81
|
self.store = Store(
|
|
46
82
|
db_path,
|
|
47
83
|
config=self._config,
|
|
48
84
|
skip_validation=skip_validation,
|
|
49
|
-
|
|
85
|
+
create=create,
|
|
86
|
+
read_only=read_only,
|
|
87
|
+
before=before,
|
|
50
88
|
)
|
|
51
89
|
self.document_repository = DocumentRepository(self.store)
|
|
52
90
|
self.chunk_repository = ChunkRepository(self.store)
|
|
53
91
|
|
|
92
|
+
@property
|
|
93
|
+
def is_read_only(self) -> bool:
|
|
94
|
+
"""Whether the client is in read-only mode."""
|
|
95
|
+
return self.store.is_read_only
|
|
96
|
+
|
|
54
97
|
async def __aenter__(self):
|
|
55
98
|
"""Async context manager entry."""
|
|
56
99
|
return self
|
|
@@ -63,65 +106,322 @@ class HaikuRAG:
|
|
|
63
106
|
self.close()
|
|
64
107
|
return False
|
|
65
108
|
|
|
66
|
-
|
|
109
|
+
# =========================================================================
|
|
110
|
+
# Processing Primitives
|
|
111
|
+
# =========================================================================
|
|
112
|
+
|
|
113
|
+
@overload
|
|
114
|
+
async def convert(self, source: Path) -> "DoclingDocument": ...
|
|
115
|
+
|
|
116
|
+
@overload
|
|
117
|
+
async def convert(
|
|
118
|
+
self, source: str, *, format: str = "md"
|
|
119
|
+
) -> "DoclingDocument": ...
|
|
120
|
+
|
|
121
|
+
async def convert(
|
|
122
|
+
self, source: Path | str, *, format: str = "md"
|
|
123
|
+
) -> "DoclingDocument":
|
|
124
|
+
"""Convert a file, URL, or text to DoclingDocument.
|
|
125
|
+
|
|
126
|
+
Args:
|
|
127
|
+
source: One of:
|
|
128
|
+
- Path: Local file path to convert
|
|
129
|
+
- str (URL): HTTP/HTTPS URL to download and convert
|
|
130
|
+
- str (text): Raw text content to convert
|
|
131
|
+
format: The format of text content ("md", "html", or "plain").
|
|
132
|
+
Defaults to "md". Use "plain" for plain text without parsing.
|
|
133
|
+
Only used when source is raw text (not a file path or URL).
|
|
134
|
+
Files and URLs determine format from extension/content-type.
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
DoclingDocument from the converted source.
|
|
138
|
+
|
|
139
|
+
Raises:
|
|
140
|
+
ValueError: If the file doesn't exist or has unsupported extension.
|
|
141
|
+
httpx.RequestError: If URL download fails.
|
|
142
|
+
"""
|
|
143
|
+
converter = get_converter(self._config)
|
|
144
|
+
|
|
145
|
+
# Path object - convert file directly
|
|
146
|
+
if isinstance(source, Path):
|
|
147
|
+
if not source.exists():
|
|
148
|
+
raise ValueError(f"File does not exist: {source}")
|
|
149
|
+
if source.suffix.lower() not in converter.supported_extensions:
|
|
150
|
+
raise ValueError(f"Unsupported file extension: {source.suffix}")
|
|
151
|
+
return await converter.convert_file(source)
|
|
152
|
+
|
|
153
|
+
# String - check if URL or text
|
|
154
|
+
parsed = urlparse(source)
|
|
155
|
+
|
|
156
|
+
if parsed.scheme in ("http", "https"):
|
|
157
|
+
# URL - download and convert
|
|
158
|
+
async with httpx.AsyncClient() as http:
|
|
159
|
+
response = await http.get(source)
|
|
160
|
+
response.raise_for_status()
|
|
161
|
+
|
|
162
|
+
content_type = response.headers.get("content-type", "").lower()
|
|
163
|
+
file_extension = self._get_extension_from_content_type_or_url(
|
|
164
|
+
source, content_type
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
if file_extension not in converter.supported_extensions:
|
|
168
|
+
raise ValueError(
|
|
169
|
+
f"Unsupported content type/extension: {content_type}/{file_extension}"
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
with tempfile.NamedTemporaryFile(
|
|
173
|
+
mode="wb", suffix=file_extension, delete=False
|
|
174
|
+
) as temp_file:
|
|
175
|
+
temp_file.write(response.content)
|
|
176
|
+
temp_file.flush()
|
|
177
|
+
temp_path = Path(temp_file.name)
|
|
178
|
+
|
|
179
|
+
try:
|
|
180
|
+
return await converter.convert_file(temp_path)
|
|
181
|
+
finally:
|
|
182
|
+
temp_path.unlink(missing_ok=True)
|
|
183
|
+
|
|
184
|
+
elif parsed.scheme == "file":
|
|
185
|
+
# file:// URI
|
|
186
|
+
file_path = Path(parsed.path)
|
|
187
|
+
if not file_path.exists():
|
|
188
|
+
raise ValueError(f"File does not exist: {file_path}")
|
|
189
|
+
if file_path.suffix.lower() not in converter.supported_extensions:
|
|
190
|
+
raise ValueError(f"Unsupported file extension: {file_path.suffix}")
|
|
191
|
+
return await converter.convert_file(file_path)
|
|
192
|
+
|
|
193
|
+
else:
|
|
194
|
+
# Treat as text content
|
|
195
|
+
return await converter.convert_text(source, format=format)
|
|
196
|
+
|
|
197
|
+
async def chunk(self, docling_document: "DoclingDocument") -> list[Chunk]:
|
|
198
|
+
"""Chunk a DoclingDocument into Chunks.
|
|
199
|
+
|
|
200
|
+
Args:
|
|
201
|
+
docling_document: The DoclingDocument to chunk.
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
List of Chunk objects (without embeddings, without document_id).
|
|
205
|
+
Each chunk has its `order` field set to its position in the list.
|
|
206
|
+
"""
|
|
207
|
+
from haiku.rag.chunkers import get_chunker
|
|
208
|
+
|
|
209
|
+
chunker = get_chunker(self._config)
|
|
210
|
+
return await chunker.chunk(docling_document)
|
|
211
|
+
|
|
212
|
+
async def _ensure_chunks_embedded(self, chunks: list[Chunk]) -> list[Chunk]:
|
|
213
|
+
"""Ensure all chunks have embeddings, embedding any that don't.
|
|
214
|
+
|
|
215
|
+
Args:
|
|
216
|
+
chunks: List of chunks, some may have embeddings already.
|
|
217
|
+
|
|
218
|
+
Returns:
|
|
219
|
+
List of chunks with all embeddings populated.
|
|
220
|
+
"""
|
|
221
|
+
from haiku.rag.embeddings import embed_chunks
|
|
222
|
+
|
|
223
|
+
# Find chunks that need embedding
|
|
224
|
+
chunks_to_embed = [c for c in chunks if c.embedding is None]
|
|
225
|
+
|
|
226
|
+
if not chunks_to_embed:
|
|
227
|
+
return chunks
|
|
228
|
+
|
|
229
|
+
# Embed chunks that don't have embeddings (returns new Chunk objects)
|
|
230
|
+
embedded = await embed_chunks(chunks_to_embed, self._config)
|
|
231
|
+
|
|
232
|
+
# Build result maintaining original order
|
|
233
|
+
embedded_map = {(c.content, c.order): c for c in embedded}
|
|
234
|
+
result = []
|
|
235
|
+
for chunk in chunks:
|
|
236
|
+
if chunk.embedding is not None:
|
|
237
|
+
result.append(chunk)
|
|
238
|
+
else:
|
|
239
|
+
result.append(embedded_map[(chunk.content, chunk.order)])
|
|
240
|
+
|
|
241
|
+
return result
|
|
242
|
+
|
|
243
|
+
async def _store_document_with_chunks(
|
|
244
|
+
self,
|
|
245
|
+
document: Document,
|
|
246
|
+
chunks: list[Chunk],
|
|
247
|
+
) -> Document:
|
|
248
|
+
"""Store a document with chunks, embedding any that lack embeddings.
|
|
249
|
+
|
|
250
|
+
Handles versioning/rollback on failure.
|
|
251
|
+
|
|
252
|
+
Args:
|
|
253
|
+
document: The document to store (will be created).
|
|
254
|
+
chunks: Chunks to store (will be embedded if lacking embeddings).
|
|
255
|
+
|
|
256
|
+
Returns:
|
|
257
|
+
The created Document instance with ID set.
|
|
258
|
+
"""
|
|
259
|
+
import asyncio
|
|
260
|
+
|
|
261
|
+
# Ensure all chunks have embeddings before storing
|
|
262
|
+
chunks = await self._ensure_chunks_embedded(chunks)
|
|
263
|
+
|
|
264
|
+
# Snapshot table versions for versioned rollback (if supported)
|
|
265
|
+
versions = self.store.current_table_versions()
|
|
266
|
+
|
|
267
|
+
# Create the document
|
|
268
|
+
created_doc = await self.document_repository.create(document)
|
|
269
|
+
|
|
270
|
+
try:
|
|
271
|
+
assert created_doc.id is not None, (
|
|
272
|
+
"Document ID should not be None after creation"
|
|
273
|
+
)
|
|
274
|
+
# Set document_id and order for all chunks
|
|
275
|
+
for order, chunk in enumerate(chunks):
|
|
276
|
+
chunk.document_id = created_doc.id
|
|
277
|
+
chunk.order = order
|
|
278
|
+
|
|
279
|
+
# Batch create all chunks in a single operation
|
|
280
|
+
await self.chunk_repository.create(chunks)
|
|
281
|
+
|
|
282
|
+
# Vacuum old versions in background (non-blocking) if auto_vacuum enabled
|
|
283
|
+
if self._config.storage.auto_vacuum:
|
|
284
|
+
asyncio.create_task(self.store.vacuum())
|
|
285
|
+
|
|
286
|
+
return created_doc
|
|
287
|
+
except Exception:
|
|
288
|
+
# Roll back to the captured versions and re-raise
|
|
289
|
+
self.store.restore_table_versions(versions)
|
|
290
|
+
raise
|
|
291
|
+
|
|
292
|
+
async def _update_document_with_chunks(
|
|
293
|
+
self,
|
|
294
|
+
document: Document,
|
|
295
|
+
chunks: list[Chunk],
|
|
296
|
+
) -> Document:
|
|
297
|
+
"""Update a document and replace its chunks, embedding any that lack embeddings.
|
|
298
|
+
|
|
299
|
+
Handles versioning/rollback on failure.
|
|
300
|
+
|
|
301
|
+
Args:
|
|
302
|
+
document: The document to update (must have ID set).
|
|
303
|
+
chunks: Chunks to replace existing (will be embedded if lacking embeddings).
|
|
304
|
+
|
|
305
|
+
Returns:
|
|
306
|
+
The updated Document instance.
|
|
307
|
+
"""
|
|
308
|
+
import asyncio
|
|
309
|
+
|
|
310
|
+
assert document.id is not None, "Document ID is required for update"
|
|
311
|
+
|
|
312
|
+
# Ensure all chunks have embeddings before storing
|
|
313
|
+
chunks = await self._ensure_chunks_embedded(chunks)
|
|
314
|
+
|
|
315
|
+
# Snapshot table versions for versioned rollback
|
|
316
|
+
versions = self.store.current_table_versions()
|
|
317
|
+
|
|
318
|
+
# Delete existing chunks before writing new ones
|
|
319
|
+
await self.chunk_repository.delete_by_document_id(document.id)
|
|
320
|
+
|
|
321
|
+
try:
|
|
322
|
+
# Update the document
|
|
323
|
+
updated_doc = await self.document_repository.update(document)
|
|
324
|
+
|
|
325
|
+
# Set document_id and order for all chunks
|
|
326
|
+
assert updated_doc.id is not None
|
|
327
|
+
for order, chunk in enumerate(chunks):
|
|
328
|
+
chunk.document_id = updated_doc.id
|
|
329
|
+
chunk.order = order
|
|
330
|
+
|
|
331
|
+
# Batch create all chunks in a single operation
|
|
332
|
+
await self.chunk_repository.create(chunks)
|
|
333
|
+
|
|
334
|
+
# Vacuum old versions in background (non-blocking) if auto_vacuum enabled
|
|
335
|
+
if self._config.storage.auto_vacuum:
|
|
336
|
+
asyncio.create_task(self.store.vacuum())
|
|
337
|
+
|
|
338
|
+
return updated_doc
|
|
339
|
+
except Exception:
|
|
340
|
+
# Roll back to the captured versions and re-raise
|
|
341
|
+
self.store.restore_table_versions(versions)
|
|
342
|
+
raise
|
|
343
|
+
|
|
344
|
+
async def create_document(
|
|
67
345
|
self,
|
|
68
|
-
|
|
346
|
+
content: str,
|
|
69
347
|
uri: str | None = None,
|
|
70
348
|
title: str | None = None,
|
|
71
349
|
metadata: dict | None = None,
|
|
72
|
-
|
|
350
|
+
format: str = "md",
|
|
73
351
|
) -> Document:
|
|
74
|
-
"""Create a new document from
|
|
75
|
-
|
|
352
|
+
"""Create a new document from text content.
|
|
353
|
+
|
|
354
|
+
Converts the content, chunks it, and generates embeddings.
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
content: The text content of the document.
|
|
358
|
+
uri: Optional URI identifier for the document.
|
|
359
|
+
title: Optional title for the document.
|
|
360
|
+
metadata: Optional metadata dictionary.
|
|
361
|
+
format: The format of the content ("md", "html", or "plain").
|
|
362
|
+
Defaults to "md". Use "plain" for plain text without parsing.
|
|
363
|
+
|
|
364
|
+
Returns:
|
|
365
|
+
The created Document instance.
|
|
366
|
+
"""
|
|
367
|
+
from haiku.rag.embeddings import embed_chunks
|
|
368
|
+
|
|
369
|
+
# Convert → Chunk → Embed using primitives
|
|
370
|
+
docling_document = await self.convert(content, format=format)
|
|
371
|
+
chunks = await self.chunk(docling_document)
|
|
372
|
+
embedded_chunks = await embed_chunks(chunks, self._config)
|
|
373
|
+
|
|
374
|
+
# Store markdown export as content for better display/readability
|
|
375
|
+
# The original content is preserved in docling_document_json
|
|
376
|
+
stored_content = docling_document.export_to_markdown()
|
|
377
|
+
|
|
378
|
+
# Create document model
|
|
76
379
|
document = Document(
|
|
77
|
-
content=
|
|
380
|
+
content=stored_content,
|
|
78
381
|
uri=uri,
|
|
79
382
|
title=title,
|
|
80
383
|
metadata=metadata or {},
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
document, docling_document, chunks
|
|
384
|
+
docling_document_json=docling_document.model_dump_json(),
|
|
385
|
+
docling_version=docling_document.version,
|
|
84
386
|
)
|
|
85
387
|
|
|
86
|
-
|
|
388
|
+
# Store document and chunks
|
|
389
|
+
return await self._store_document_with_chunks(document, embedded_chunks)
|
|
390
|
+
|
|
391
|
+
async def import_document(
|
|
87
392
|
self,
|
|
88
|
-
|
|
393
|
+
docling_document: "DoclingDocument",
|
|
394
|
+
chunks: list[Chunk],
|
|
89
395
|
uri: str | None = None,
|
|
90
396
|
title: str | None = None,
|
|
91
397
|
metadata: dict | None = None,
|
|
92
|
-
chunks: list[Chunk] | None = None,
|
|
93
398
|
) -> Document:
|
|
94
|
-
"""
|
|
399
|
+
"""Import a pre-processed document with chunks.
|
|
400
|
+
|
|
401
|
+
Use this when document conversion, chunking, and embedding were done
|
|
402
|
+
externally and you want to store the results in haiku.rag.
|
|
95
403
|
|
|
96
404
|
Args:
|
|
97
|
-
|
|
405
|
+
docling_document: The DoclingDocument to import.
|
|
406
|
+
chunks: Pre-created chunks. Chunks without embeddings will be
|
|
407
|
+
automatically embedded.
|
|
98
408
|
uri: Optional URI identifier for the document.
|
|
409
|
+
title: Optional title for the document.
|
|
99
410
|
metadata: Optional metadata dictionary.
|
|
100
|
-
chunks: Optional list of pre-created chunks to use instead of generating new ones.
|
|
101
411
|
|
|
102
412
|
Returns:
|
|
103
413
|
The created Document instance.
|
|
104
414
|
"""
|
|
105
415
|
document = Document(
|
|
106
|
-
content=
|
|
416
|
+
content=docling_document.export_to_markdown(),
|
|
107
417
|
uri=uri,
|
|
108
418
|
title=title,
|
|
109
419
|
metadata=metadata or {},
|
|
420
|
+
docling_document_json=docling_document.model_dump_json(),
|
|
421
|
+
docling_version=docling_document.version,
|
|
110
422
|
)
|
|
111
423
|
|
|
112
|
-
|
|
113
|
-
if chunks is None:
|
|
114
|
-
# Lazy import to avoid loading docling
|
|
115
|
-
from haiku.rag.utils import text_to_docling_document
|
|
116
|
-
|
|
117
|
-
docling_document = text_to_docling_document(content)
|
|
118
|
-
else:
|
|
119
|
-
# Chunks already provided, no conversion needed
|
|
120
|
-
docling_document = None
|
|
121
|
-
|
|
122
|
-
return await self.document_repository._create_and_chunk(
|
|
123
|
-
document, docling_document, chunks
|
|
124
|
-
)
|
|
424
|
+
return await self._store_document_with_chunks(document, chunks)
|
|
125
425
|
|
|
126
426
|
async def create_document_from_source(
|
|
127
427
|
self, source: str | Path, title: str | None = None, metadata: dict | None = None
|
|
@@ -201,12 +501,12 @@ class HaikuRAG:
|
|
|
201
501
|
Raises:
|
|
202
502
|
ValueError: If the file cannot be parsed or doesn't exist
|
|
203
503
|
"""
|
|
204
|
-
|
|
205
|
-
from haiku.rag.reader import FileReader
|
|
504
|
+
from haiku.rag.embeddings import embed_chunks
|
|
206
505
|
|
|
207
506
|
metadata = metadata or {}
|
|
208
507
|
|
|
209
|
-
|
|
508
|
+
converter = get_converter(self._config)
|
|
509
|
+
if source_path.suffix.lower() not in converter.supported_extensions:
|
|
210
510
|
raise ValueError(f"Unsupported file extension: {source_path.suffix}")
|
|
211
511
|
|
|
212
512
|
if not source_path.exists():
|
|
@@ -241,26 +541,33 @@ class HaikuRAG:
|
|
|
241
541
|
return await self.document_repository.update(existing_doc)
|
|
242
542
|
return existing_doc
|
|
243
543
|
|
|
244
|
-
#
|
|
245
|
-
docling_document =
|
|
544
|
+
# Convert → Chunk → Embed using primitives
|
|
545
|
+
docling_document = await self.convert(source_path)
|
|
546
|
+
chunks = await self.chunk(docling_document)
|
|
547
|
+
embedded_chunks = await embed_chunks(chunks, self._config)
|
|
246
548
|
|
|
247
549
|
if existing_doc:
|
|
248
|
-
# Update existing document
|
|
550
|
+
# Update existing document and rechunk
|
|
249
551
|
existing_doc.content = docling_document.export_to_markdown()
|
|
250
552
|
existing_doc.metadata = metadata
|
|
553
|
+
existing_doc.docling_document_json = docling_document.model_dump_json()
|
|
554
|
+
existing_doc.docling_version = docling_document.version
|
|
251
555
|
if title is not None:
|
|
252
556
|
existing_doc.title = title
|
|
253
|
-
return await self.
|
|
254
|
-
existing_doc,
|
|
557
|
+
return await self._update_document_with_chunks(
|
|
558
|
+
existing_doc, embedded_chunks
|
|
255
559
|
)
|
|
256
560
|
else:
|
|
257
|
-
# Create new document
|
|
258
|
-
|
|
259
|
-
|
|
561
|
+
# Create new document
|
|
562
|
+
document = Document(
|
|
563
|
+
content=docling_document.export_to_markdown(),
|
|
260
564
|
uri=uri,
|
|
261
565
|
title=title,
|
|
262
566
|
metadata=metadata,
|
|
567
|
+
docling_document_json=docling_document.model_dump_json(),
|
|
568
|
+
docling_version=docling_document.version,
|
|
263
569
|
)
|
|
570
|
+
return await self._store_document_with_chunks(document, embedded_chunks)
|
|
264
571
|
|
|
265
572
|
async def _create_or_update_document_from_url(
|
|
266
573
|
self, url: str, title: str | None = None, metadata: dict | None = None
|
|
@@ -283,11 +590,13 @@ class HaikuRAG:
|
|
|
283
590
|
ValueError: If the content cannot be parsed
|
|
284
591
|
httpx.RequestError: If URL request fails
|
|
285
592
|
"""
|
|
286
|
-
|
|
287
|
-
from haiku.rag.reader import FileReader
|
|
593
|
+
from haiku.rag.embeddings import embed_chunks
|
|
288
594
|
|
|
289
595
|
metadata = metadata or {}
|
|
290
596
|
|
|
597
|
+
converter = get_converter(self._config)
|
|
598
|
+
supported_extensions = converter.supported_extensions
|
|
599
|
+
|
|
291
600
|
async with httpx.AsyncClient() as client:
|
|
292
601
|
response = await client.get(url)
|
|
293
602
|
response.raise_for_status()
|
|
@@ -320,40 +629,52 @@ class HaikuRAG:
|
|
|
320
629
|
url, content_type
|
|
321
630
|
)
|
|
322
631
|
|
|
323
|
-
if file_extension not in
|
|
632
|
+
if file_extension not in supported_extensions:
|
|
324
633
|
raise ValueError(
|
|
325
634
|
f"Unsupported content type/extension: {content_type}/{file_extension}"
|
|
326
635
|
)
|
|
327
636
|
|
|
328
637
|
# Create a temporary file with the appropriate extension
|
|
329
638
|
with tempfile.NamedTemporaryFile(
|
|
330
|
-
mode="wb", suffix=file_extension
|
|
639
|
+
mode="wb", suffix=file_extension, delete=False
|
|
331
640
|
) as temp_file:
|
|
332
641
|
temp_file.write(response.content)
|
|
333
|
-
temp_file.flush()
|
|
642
|
+
temp_file.flush()
|
|
334
643
|
temp_path = Path(temp_file.name)
|
|
335
644
|
|
|
336
|
-
|
|
337
|
-
|
|
645
|
+
try:
|
|
646
|
+
# Convert → Chunk → Embed using primitives
|
|
647
|
+
docling_document = await self.convert(temp_path)
|
|
648
|
+
chunks = await self.chunk(docling_document)
|
|
649
|
+
embedded_chunks = await embed_chunks(chunks, self._config)
|
|
650
|
+
finally:
|
|
651
|
+
temp_path.unlink(missing_ok=True)
|
|
338
652
|
|
|
339
653
|
# Merge metadata with contentType and md5
|
|
340
654
|
metadata.update({"contentType": content_type, "md5": md5_hash})
|
|
341
655
|
|
|
342
656
|
if existing_doc:
|
|
657
|
+
# Update existing document and rechunk
|
|
343
658
|
existing_doc.content = docling_document.export_to_markdown()
|
|
344
659
|
existing_doc.metadata = metadata
|
|
660
|
+
existing_doc.docling_document_json = docling_document.model_dump_json()
|
|
661
|
+
existing_doc.docling_version = docling_document.version
|
|
345
662
|
if title is not None:
|
|
346
663
|
existing_doc.title = title
|
|
347
|
-
return await self.
|
|
348
|
-
existing_doc,
|
|
664
|
+
return await self._update_document_with_chunks(
|
|
665
|
+
existing_doc, embedded_chunks
|
|
349
666
|
)
|
|
350
667
|
else:
|
|
351
|
-
|
|
352
|
-
|
|
668
|
+
# Create new document
|
|
669
|
+
document = Document(
|
|
670
|
+
content=docling_document.export_to_markdown(),
|
|
353
671
|
uri=url,
|
|
354
672
|
title=title,
|
|
355
673
|
metadata=metadata,
|
|
674
|
+
docling_document_json=docling_document.model_dump_json(),
|
|
675
|
+
docling_version=docling_document.version,
|
|
356
676
|
)
|
|
677
|
+
return await self._store_document_with_chunks(document, embedded_chunks)
|
|
357
678
|
|
|
358
679
|
def _get_extension_from_content_type_or_url(
|
|
359
680
|
self, url: str, content_type: str
|
|
@@ -408,17 +729,93 @@ class HaikuRAG:
|
|
|
408
729
|
"""
|
|
409
730
|
return await self.document_repository.get_by_uri(uri)
|
|
410
731
|
|
|
411
|
-
async def update_document(
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
732
|
+
async def update_document(
|
|
733
|
+
self,
|
|
734
|
+
document_id: str,
|
|
735
|
+
content: str | None = None,
|
|
736
|
+
metadata: dict | None = None,
|
|
737
|
+
chunks: list[Chunk] | None = None,
|
|
738
|
+
title: str | None = None,
|
|
739
|
+
docling_document: "DoclingDocument | None" = None,
|
|
740
|
+
) -> Document:
|
|
741
|
+
"""Update a document by ID.
|
|
415
742
|
|
|
416
|
-
|
|
417
|
-
|
|
743
|
+
Updates specified fields. When content or docling_document is provided,
|
|
744
|
+
the document is rechunked and re-embedded. Updates to only metadata or title
|
|
745
|
+
skip rechunking for efficiency.
|
|
418
746
|
|
|
419
|
-
|
|
420
|
-
document
|
|
421
|
-
|
|
747
|
+
Args:
|
|
748
|
+
document_id: The ID of the document to update.
|
|
749
|
+
content: New content (mutually exclusive with docling_document).
|
|
750
|
+
metadata: New metadata dict.
|
|
751
|
+
chunks: Custom chunks (will be embedded if missing embeddings).
|
|
752
|
+
title: New title.
|
|
753
|
+
docling_document: DoclingDocument to replace content (mutually exclusive with content).
|
|
754
|
+
|
|
755
|
+
Returns:
|
|
756
|
+
The updated Document instance.
|
|
757
|
+
|
|
758
|
+
Raises:
|
|
759
|
+
ValueError: If document not found, or if both content and docling_document
|
|
760
|
+
are provided.
|
|
761
|
+
"""
|
|
762
|
+
from haiku.rag.embeddings import embed_chunks
|
|
763
|
+
|
|
764
|
+
# Validate: content and docling_document are mutually exclusive
|
|
765
|
+
if content is not None and docling_document is not None:
|
|
766
|
+
raise ValueError(
|
|
767
|
+
"content and docling_document are mutually exclusive. "
|
|
768
|
+
"Provide one or the other, not both."
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
# Fetch the existing document
|
|
772
|
+
existing_doc = await self.get_document_by_id(document_id)
|
|
773
|
+
if existing_doc is None:
|
|
774
|
+
raise ValueError(f"Document with ID {document_id} not found")
|
|
775
|
+
|
|
776
|
+
# Update metadata/title fields
|
|
777
|
+
if title is not None:
|
|
778
|
+
existing_doc.title = title
|
|
779
|
+
if metadata is not None:
|
|
780
|
+
existing_doc.metadata = metadata
|
|
781
|
+
|
|
782
|
+
# Only metadata/title update - no rechunking needed
|
|
783
|
+
if content is None and chunks is None and docling_document is None:
|
|
784
|
+
return await self.document_repository.update(existing_doc)
|
|
785
|
+
|
|
786
|
+
# Custom chunks provided - use them as-is
|
|
787
|
+
if chunks is not None:
|
|
788
|
+
# Store docling data if provided
|
|
789
|
+
if docling_document is not None:
|
|
790
|
+
existing_doc.content = docling_document.export_to_markdown()
|
|
791
|
+
existing_doc.docling_document_json = docling_document.model_dump_json()
|
|
792
|
+
existing_doc.docling_version = docling_document.version
|
|
793
|
+
elif content is not None:
|
|
794
|
+
existing_doc.content = content
|
|
795
|
+
|
|
796
|
+
return await self._update_document_with_chunks(existing_doc, chunks)
|
|
797
|
+
|
|
798
|
+
# DoclingDocument provided without chunks - chunk and embed using primitives
|
|
799
|
+
if docling_document is not None:
|
|
800
|
+
existing_doc.content = docling_document.export_to_markdown()
|
|
801
|
+
existing_doc.docling_document_json = docling_document.model_dump_json()
|
|
802
|
+
existing_doc.docling_version = docling_document.version
|
|
803
|
+
|
|
804
|
+
new_chunks = await self.chunk(docling_document)
|
|
805
|
+
embedded_chunks = await embed_chunks(new_chunks, self._config)
|
|
806
|
+
return await self._update_document_with_chunks(
|
|
807
|
+
existing_doc, embedded_chunks
|
|
808
|
+
)
|
|
809
|
+
|
|
810
|
+
# Content provided without chunks - convert, chunk, and embed using primitives
|
|
811
|
+
existing_doc.content = content # type: ignore[assignment]
|
|
812
|
+
converted_docling = await self.convert(existing_doc.content)
|
|
813
|
+
existing_doc.docling_document_json = converted_docling.model_dump_json()
|
|
814
|
+
existing_doc.docling_version = converted_docling.version
|
|
815
|
+
|
|
816
|
+
new_chunks = await self.chunk(converted_docling)
|
|
817
|
+
embedded_chunks = await embed_chunks(new_chunks, self._config)
|
|
818
|
+
return await self._update_document_with_chunks(existing_doc, embedded_chunks)
|
|
422
819
|
|
|
423
820
|
async def delete_document(self, document_id: str) -> bool:
|
|
424
821
|
"""Delete a document by its ID."""
|
|
@@ -447,285 +844,873 @@ class HaikuRAG:
|
|
|
447
844
|
async def search(
|
|
448
845
|
self,
|
|
449
846
|
query: str,
|
|
450
|
-
limit: int =
|
|
847
|
+
limit: int | None = None,
|
|
451
848
|
search_type: str = "hybrid",
|
|
452
849
|
filter: str | None = None,
|
|
453
|
-
) -> list[
|
|
850
|
+
) -> list[SearchResult]:
|
|
454
851
|
"""Search for relevant chunks using the specified search method with optional reranking.
|
|
455
852
|
|
|
456
853
|
Args:
|
|
457
854
|
query: The search query string.
|
|
458
|
-
limit: Maximum number of results to return.
|
|
855
|
+
limit: Maximum number of results to return. Defaults to config.search.default_limit.
|
|
459
856
|
search_type: Type of search - "vector", "fts", or "hybrid" (default).
|
|
460
857
|
filter: Optional SQL WHERE clause to filter documents before searching chunks.
|
|
461
858
|
|
|
462
859
|
Returns:
|
|
463
|
-
List of
|
|
860
|
+
List of SearchResult objects ordered by relevance.
|
|
464
861
|
"""
|
|
465
|
-
|
|
862
|
+
if limit is None:
|
|
863
|
+
limit = self._config.search.limit
|
|
864
|
+
|
|
466
865
|
reranker = get_reranker(config=self._config)
|
|
467
866
|
|
|
468
867
|
if reranker is None:
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
chunks = [chunk for chunk, _ in search_results]
|
|
480
|
-
reranked_results = await reranker.rerank(query, chunks, top_n=limit)
|
|
868
|
+
chunk_results = await self.chunk_repository.search(
|
|
869
|
+
query, limit, search_type, filter
|
|
870
|
+
)
|
|
871
|
+
else:
|
|
872
|
+
search_limit = limit * 10
|
|
873
|
+
raw_results = await self.chunk_repository.search(
|
|
874
|
+
query, search_limit, search_type, filter
|
|
875
|
+
)
|
|
876
|
+
chunks = [chunk for chunk, _ in raw_results]
|
|
877
|
+
chunk_results = await reranker.rerank(query, chunks, top_n=limit)
|
|
481
878
|
|
|
482
|
-
|
|
483
|
-
return reranked_results
|
|
879
|
+
return [SearchResult.from_chunk(chunk, score) for chunk, score in chunk_results]
|
|
484
880
|
|
|
485
881
|
async def expand_context(
|
|
486
882
|
self,
|
|
487
|
-
search_results: list[
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
883
|
+
search_results: list[SearchResult],
|
|
884
|
+
) -> list[SearchResult]:
|
|
885
|
+
"""Expand search results with adjacent content from the source document.
|
|
886
|
+
|
|
887
|
+
When DoclingDocument is available and results have doc_item_refs, expands
|
|
888
|
+
by finding adjacent DocItems with accurate bounding boxes and metadata.
|
|
889
|
+
Otherwise, falls back to chunk-based expansion using adjacent chunks.
|
|
890
|
+
|
|
891
|
+
Expansion is type-aware based on content:
|
|
892
|
+
- Tables, code blocks, and lists expand to include complete structures
|
|
893
|
+
- Text content uses the configured radius (search.context_radius)
|
|
894
|
+
- Expansion is limited by search.max_context_items and search.max_context_chars
|
|
491
895
|
|
|
492
896
|
Args:
|
|
493
|
-
search_results: List of
|
|
494
|
-
radius: Number of adjacent chunks to include before/after each chunk.
|
|
495
|
-
If None, uses config.processing.context_chunk_radius.
|
|
897
|
+
search_results: List of SearchResult objects from search.
|
|
496
898
|
|
|
497
899
|
Returns:
|
|
498
|
-
List of
|
|
900
|
+
List of SearchResult objects with expanded content and resolved provenance.
|
|
499
901
|
"""
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
doc_id = chunk.document_id
|
|
902
|
+
radius = self._config.search.context_radius
|
|
903
|
+
max_items = self._config.search.max_context_items
|
|
904
|
+
max_chars = self._config.search.max_context_chars
|
|
905
|
+
|
|
906
|
+
# Group by document_id for efficient processing
|
|
907
|
+
document_groups: dict[str | None, list[SearchResult]] = {}
|
|
908
|
+
for result in search_results:
|
|
909
|
+
doc_id = result.document_id
|
|
509
910
|
if doc_id not in document_groups:
|
|
510
911
|
document_groups[doc_id] = []
|
|
511
|
-
document_groups[doc_id].append(
|
|
912
|
+
document_groups[doc_id].append(result)
|
|
913
|
+
|
|
914
|
+
expanded_results = []
|
|
915
|
+
|
|
916
|
+
for doc_id, doc_results in document_groups.items():
|
|
917
|
+
if doc_id is None:
|
|
918
|
+
expanded_results.extend(doc_results)
|
|
919
|
+
continue
|
|
920
|
+
|
|
921
|
+
# Fetch the document to get DoclingDocument
|
|
922
|
+
doc = await self.get_document_by_id(doc_id)
|
|
923
|
+
if doc is None:
|
|
924
|
+
expanded_results.extend(doc_results)
|
|
925
|
+
continue
|
|
926
|
+
|
|
927
|
+
docling_doc = doc.get_docling_document()
|
|
928
|
+
|
|
929
|
+
# Check if we can use DoclingDocument-based expansion
|
|
930
|
+
has_docling = docling_doc is not None
|
|
931
|
+
has_refs = any(r.doc_item_refs for r in doc_results)
|
|
932
|
+
|
|
933
|
+
if has_docling and has_refs:
|
|
934
|
+
# Use DoclingDocument-based expansion
|
|
935
|
+
expanded = await self._expand_with_docling(
|
|
936
|
+
doc_results,
|
|
937
|
+
docling_doc,
|
|
938
|
+
radius,
|
|
939
|
+
max_items,
|
|
940
|
+
max_chars,
|
|
941
|
+
)
|
|
942
|
+
expanded_results.extend(expanded)
|
|
943
|
+
else:
|
|
944
|
+
# Fall back to chunk-based expansion (always uses fixed radius)
|
|
945
|
+
if radius > 0:
|
|
946
|
+
expanded = await self._expand_with_chunks(
|
|
947
|
+
doc_id, doc_results, radius
|
|
948
|
+
)
|
|
949
|
+
expanded_results.extend(expanded)
|
|
950
|
+
else:
|
|
951
|
+
expanded_results.extend(doc_results)
|
|
512
952
|
|
|
513
|
-
|
|
953
|
+
return expanded_results
|
|
514
954
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
)
|
|
955
|
+
def _merge_ranges(
|
|
956
|
+
self, ranges: list[tuple[int, int, SearchResult]]
|
|
957
|
+
) -> list[tuple[int, int, list[SearchResult]]]:
|
|
958
|
+
"""Merge overlapping or adjacent ranges."""
|
|
959
|
+
if not ranges:
|
|
960
|
+
return []
|
|
522
961
|
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
expanded_ranges.append(
|
|
531
|
-
{
|
|
532
|
-
"original_chunk": chunk,
|
|
533
|
-
"score": score,
|
|
534
|
-
"min_order": min_order,
|
|
535
|
-
"max_order": max_order,
|
|
536
|
-
"all_chunks": sorted(all_chunks, key=lambda c: c.order),
|
|
537
|
-
}
|
|
538
|
-
)
|
|
962
|
+
sorted_ranges = sorted(ranges, key=lambda x: x[0])
|
|
963
|
+
merged: list[tuple[int, int, list[SearchResult]]] = []
|
|
964
|
+
cur_min, cur_max, cur_results = (
|
|
965
|
+
sorted_ranges[0][0],
|
|
966
|
+
sorted_ranges[0][1],
|
|
967
|
+
[sorted_ranges[0][2]],
|
|
968
|
+
)
|
|
539
969
|
|
|
540
|
-
|
|
541
|
-
|
|
970
|
+
for min_idx, max_idx, result in sorted_ranges[1:]:
|
|
971
|
+
if cur_max >= min_idx - 1: # Overlapping or adjacent
|
|
972
|
+
cur_max = max(cur_max, max_idx)
|
|
973
|
+
cur_results.append(result)
|
|
974
|
+
else:
|
|
975
|
+
merged.append((cur_min, cur_max, cur_results))
|
|
976
|
+
cur_min, cur_max, cur_results = min_idx, max_idx, [result]
|
|
542
977
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
combined_content_parts = [c.content for c in merged_range["all_chunks"]]
|
|
978
|
+
merged.append((cur_min, cur_max, cur_results))
|
|
979
|
+
return merged
|
|
546
980
|
|
|
547
|
-
|
|
548
|
-
|
|
981
|
+
# Label groups for type-aware expansion
|
|
982
|
+
_STRUCTURAL_LABELS = {"table", "code", "list_item", "form", "key_value_region"}
|
|
549
983
|
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
document_id=original_chunk.document_id,
|
|
553
|
-
content="".join(combined_content_parts),
|
|
554
|
-
metadata=original_chunk.metadata,
|
|
555
|
-
document_uri=original_chunk.document_uri,
|
|
556
|
-
document_title=original_chunk.document_title,
|
|
557
|
-
document_meta=original_chunk.document_meta,
|
|
558
|
-
)
|
|
984
|
+
def _extract_item_text(self, item, docling_doc) -> str | None:
|
|
985
|
+
"""Extract text content from a DocItem.
|
|
559
986
|
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
987
|
+
Handles different item types:
|
|
988
|
+
- TextItem, SectionHeaderItem, etc.: Use .text attribute
|
|
989
|
+
- TableItem: Use export_to_markdown() for table content
|
|
990
|
+
- PictureItem: Use caption if available
|
|
991
|
+
"""
|
|
992
|
+
# Try simple text attribute first (works for most items)
|
|
993
|
+
if text := getattr(item, "text", None):
|
|
994
|
+
return text
|
|
995
|
+
|
|
996
|
+
# For tables, export as markdown
|
|
997
|
+
if hasattr(item, "export_to_markdown"):
|
|
998
|
+
try:
|
|
999
|
+
return item.export_to_markdown(docling_doc)
|
|
1000
|
+
except Exception:
|
|
1001
|
+
pass
|
|
1002
|
+
|
|
1003
|
+
# For pictures/charts, try to get caption
|
|
1004
|
+
if caption := getattr(item, "caption", None):
|
|
1005
|
+
if hasattr(caption, "text"):
|
|
1006
|
+
return caption.text
|
|
1007
|
+
|
|
1008
|
+
return None
|
|
1009
|
+
|
|
1010
|
+
def _get_item_label(self, item) -> str | None:
|
|
1011
|
+
"""Extract label string from a DocItem."""
|
|
1012
|
+
label = getattr(item, "label", None)
|
|
1013
|
+
if label is None:
|
|
1014
|
+
return None
|
|
1015
|
+
return str(label.value) if hasattr(label, "value") else str(label)
|
|
1016
|
+
|
|
1017
|
+
def _compute_type_aware_range(
|
|
1018
|
+
self,
|
|
1019
|
+
all_items: list,
|
|
1020
|
+
indices: list[int],
|
|
1021
|
+
radius: int,
|
|
1022
|
+
max_items: int,
|
|
1023
|
+
max_chars: int,
|
|
1024
|
+
) -> tuple[int, int]:
|
|
1025
|
+
"""Compute expansion range based on content type with limits.
|
|
1026
|
+
|
|
1027
|
+
For structural content (tables, code, lists), expands to include complete
|
|
1028
|
+
structures. For text, uses the configured radius. Applies hybrid limits.
|
|
1029
|
+
"""
|
|
1030
|
+
if not indices:
|
|
1031
|
+
return (0, 0)
|
|
1032
|
+
|
|
1033
|
+
min_idx = min(indices)
|
|
1034
|
+
max_idx = max(indices)
|
|
1035
|
+
|
|
1036
|
+
# Determine the primary label type from matched items
|
|
1037
|
+
labels_in_chunk = set()
|
|
1038
|
+
for idx in indices:
|
|
1039
|
+
item, _ = all_items[idx]
|
|
1040
|
+
if label := self._get_item_label(item):
|
|
1041
|
+
labels_in_chunk.add(label)
|
|
1042
|
+
|
|
1043
|
+
# Check if we have structural content
|
|
1044
|
+
is_structural = bool(labels_in_chunk & self._STRUCTURAL_LABELS)
|
|
1045
|
+
|
|
1046
|
+
if is_structural:
|
|
1047
|
+
# Expand to complete structure boundaries
|
|
1048
|
+
# Expand backwards to find structure start
|
|
1049
|
+
while min_idx > 0:
|
|
1050
|
+
prev_item, _ = all_items[min_idx - 1]
|
|
1051
|
+
prev_label = self._get_item_label(prev_item)
|
|
1052
|
+
if prev_label in labels_in_chunk & self._STRUCTURAL_LABELS:
|
|
1053
|
+
min_idx -= 1
|
|
1054
|
+
else:
|
|
1055
|
+
break
|
|
1056
|
+
|
|
1057
|
+
# Expand forwards to find structure end
|
|
1058
|
+
while max_idx < len(all_items) - 1:
|
|
1059
|
+
next_item, _ = all_items[max_idx + 1]
|
|
1060
|
+
next_label = self._get_item_label(next_item)
|
|
1061
|
+
if next_label in labels_in_chunk & self._STRUCTURAL_LABELS:
|
|
1062
|
+
max_idx += 1
|
|
1063
|
+
else:
|
|
1064
|
+
break
|
|
1065
|
+
else:
|
|
1066
|
+
# Text content: use radius-based expansion
|
|
1067
|
+
min_idx = max(0, min_idx - radius)
|
|
1068
|
+
max_idx = min(len(all_items) - 1, max_idx + radius)
|
|
1069
|
+
|
|
1070
|
+
# Apply hybrid limits
|
|
1071
|
+
# First check item count hard limit
|
|
1072
|
+
if max_idx - min_idx + 1 > max_items:
|
|
1073
|
+
# Center the window around original indices
|
|
1074
|
+
original_center = (min(indices) + max(indices)) // 2
|
|
1075
|
+
half_items = max_items // 2
|
|
1076
|
+
min_idx = max(0, original_center - half_items)
|
|
1077
|
+
max_idx = min(len(all_items) - 1, min_idx + max_items - 1)
|
|
1078
|
+
|
|
1079
|
+
# Then check character soft limit (but keep at least original items)
|
|
1080
|
+
char_count = 0
|
|
1081
|
+
effective_max = min_idx
|
|
1082
|
+
for i in range(min_idx, max_idx + 1):
|
|
1083
|
+
item, _ = all_items[i]
|
|
1084
|
+
text = getattr(item, "text", "") or ""
|
|
1085
|
+
char_count += len(text)
|
|
1086
|
+
effective_max = i
|
|
1087
|
+
# Once we've included original items, check char limit
|
|
1088
|
+
if i >= max(indices) and char_count > max_chars:
|
|
1089
|
+
break
|
|
1090
|
+
|
|
1091
|
+
max_idx = effective_max
|
|
1092
|
+
|
|
1093
|
+
return (min_idx, max_idx)
|
|
1094
|
+
|
|
1095
|
+
async def _expand_with_docling(
|
|
1096
|
+
self,
|
|
1097
|
+
results: list[SearchResult],
|
|
1098
|
+
docling_doc,
|
|
1099
|
+
radius: int,
|
|
1100
|
+
max_items: int,
|
|
1101
|
+
max_chars: int,
|
|
1102
|
+
) -> list[SearchResult]:
|
|
1103
|
+
"""Expand results using DoclingDocument structure.
|
|
1104
|
+
|
|
1105
|
+
Structural content (tables, code, lists) expands to complete structures.
|
|
1106
|
+
Text content uses radius-based expansion.
|
|
1107
|
+
"""
|
|
1108
|
+
all_items = list(docling_doc.iterate_items())
|
|
1109
|
+
ref_to_index = {
|
|
1110
|
+
getattr(item, "self_ref", None): i
|
|
1111
|
+
for i, (item, _) in enumerate(all_items)
|
|
1112
|
+
if getattr(item, "self_ref", None)
|
|
1113
|
+
}
|
|
563
1114
|
|
|
564
|
-
|
|
1115
|
+
# Compute expanded ranges
|
|
1116
|
+
ranges: list[tuple[int, int, SearchResult]] = []
|
|
1117
|
+
passthrough: list[SearchResult] = []
|
|
565
1118
|
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
1119
|
+
for result in results:
|
|
1120
|
+
indices = [
|
|
1121
|
+
ref_to_index[r] for r in result.doc_item_refs if r in ref_to_index
|
|
1122
|
+
]
|
|
1123
|
+
if not indices:
|
|
1124
|
+
passthrough.append(result)
|
|
1125
|
+
continue
|
|
570
1126
|
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
1127
|
+
min_idx, max_idx = self._compute_type_aware_range(
|
|
1128
|
+
all_items, indices, radius, max_items, max_chars
|
|
1129
|
+
)
|
|
574
1130
|
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
1131
|
+
ranges.append((min_idx, max_idx, result))
|
|
1132
|
+
|
|
1133
|
+
# Merge overlapping ranges
|
|
1134
|
+
merged = self._merge_ranges(ranges)
|
|
1135
|
+
|
|
1136
|
+
final_results: list[SearchResult] = []
|
|
1137
|
+
for min_idx, max_idx, original_results in merged:
|
|
1138
|
+
content_parts: list[str] = []
|
|
1139
|
+
refs: list[str] = []
|
|
1140
|
+
pages: set[int] = set()
|
|
1141
|
+
labels: set[str] = set()
|
|
1142
|
+
|
|
1143
|
+
for i in range(min_idx, max_idx + 1):
|
|
1144
|
+
item, _ = all_items[i]
|
|
1145
|
+
# Extract text content - handle different item types
|
|
1146
|
+
text = self._extract_item_text(item, docling_doc)
|
|
1147
|
+
if text:
|
|
1148
|
+
content_parts.append(text)
|
|
1149
|
+
if self_ref := getattr(item, "self_ref", None):
|
|
1150
|
+
refs.append(self_ref)
|
|
1151
|
+
if label := getattr(item, "label", None):
|
|
1152
|
+
labels.add(
|
|
1153
|
+
str(label.value) if hasattr(label, "value") else str(label)
|
|
1154
|
+
)
|
|
1155
|
+
if prov := getattr(item, "prov", None):
|
|
1156
|
+
for p in prov:
|
|
1157
|
+
if (page_no := getattr(p, "page_no", None)) is not None:
|
|
1158
|
+
pages.add(page_no)
|
|
1159
|
+
|
|
1160
|
+
# Merge headings preserving order
|
|
1161
|
+
all_headings: list[str] = []
|
|
1162
|
+
for r in original_results:
|
|
1163
|
+
if r.headings:
|
|
1164
|
+
all_headings.extend(h for h in r.headings if h not in all_headings)
|
|
1165
|
+
|
|
1166
|
+
first = original_results[0]
|
|
1167
|
+
final_results.append(
|
|
1168
|
+
SearchResult(
|
|
1169
|
+
content="\n\n".join(content_parts),
|
|
1170
|
+
score=max(r.score for r in original_results),
|
|
1171
|
+
chunk_id=first.chunk_id,
|
|
1172
|
+
document_id=first.document_id,
|
|
1173
|
+
document_uri=first.document_uri,
|
|
1174
|
+
document_title=first.document_title,
|
|
1175
|
+
doc_item_refs=refs,
|
|
1176
|
+
page_numbers=sorted(pages),
|
|
1177
|
+
headings=all_headings or None,
|
|
1178
|
+
labels=sorted(labels),
|
|
1179
|
+
)
|
|
1180
|
+
)
|
|
582
1181
|
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
1182
|
+
return final_results + passthrough
|
|
1183
|
+
|
|
1184
|
+
async def _expand_with_chunks(
|
|
1185
|
+
self,
|
|
1186
|
+
doc_id: str,
|
|
1187
|
+
results: list[SearchResult],
|
|
1188
|
+
radius: int,
|
|
1189
|
+
) -> list[SearchResult]:
|
|
1190
|
+
"""Expand results using chunk-based adjacency."""
|
|
1191
|
+
all_chunks = await self.chunk_repository.get_by_document_id(doc_id)
|
|
1192
|
+
if not all_chunks:
|
|
1193
|
+
return results
|
|
1194
|
+
|
|
1195
|
+
content_to_chunk = {c.content: c for c in all_chunks}
|
|
1196
|
+
chunk_by_order = {c.order: c for c in all_chunks}
|
|
1197
|
+
min_order, max_order = min(chunk_by_order.keys()), max(chunk_by_order.keys())
|
|
1198
|
+
|
|
1199
|
+
# Build ranges
|
|
1200
|
+
ranges: list[tuple[int, int, SearchResult]] = []
|
|
1201
|
+
passthrough: list[SearchResult] = []
|
|
1202
|
+
|
|
1203
|
+
for result in results:
|
|
1204
|
+
chunk = content_to_chunk.get(result.content)
|
|
1205
|
+
if chunk is None:
|
|
1206
|
+
passthrough.append(result)
|
|
1207
|
+
continue
|
|
1208
|
+
start = max(min_order, chunk.order - radius)
|
|
1209
|
+
end = min(max_order, chunk.order + radius)
|
|
1210
|
+
ranges.append((start, end, result))
|
|
1211
|
+
|
|
1212
|
+
# Merge and build results
|
|
1213
|
+
final_results: list[SearchResult] = []
|
|
1214
|
+
for min_idx, max_idx, original_results in self._merge_ranges(ranges):
|
|
1215
|
+
# Collect chunks in order
|
|
1216
|
+
chunks_in_range = [
|
|
1217
|
+
chunk_by_order[o]
|
|
1218
|
+
for o in range(min_idx, max_idx + 1)
|
|
1219
|
+
if o in chunk_by_order
|
|
1220
|
+
]
|
|
1221
|
+
first = original_results[0]
|
|
1222
|
+
final_results.append(
|
|
1223
|
+
SearchResult(
|
|
1224
|
+
content="".join(c.content for c in chunks_in_range),
|
|
1225
|
+
score=max(r.score for r in original_results),
|
|
1226
|
+
chunk_id=first.chunk_id,
|
|
1227
|
+
document_id=first.document_id,
|
|
1228
|
+
document_uri=first.document_uri,
|
|
1229
|
+
document_title=first.document_title,
|
|
1230
|
+
doc_item_refs=first.doc_item_refs,
|
|
1231
|
+
page_numbers=first.page_numbers,
|
|
1232
|
+
headings=first.headings,
|
|
1233
|
+
labels=first.labels,
|
|
589
1234
|
)
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
# Merge all_chunks and deduplicate by order
|
|
594
|
-
all_chunks_dict = {}
|
|
595
|
-
for chunk in current["all_chunks"] + range_info["all_chunks"]:
|
|
596
|
-
order = chunk.order
|
|
597
|
-
all_chunks_dict[order] = chunk
|
|
598
|
-
current["all_chunks"] = [
|
|
599
|
-
all_chunks_dict[order] for order in sorted(all_chunks_dict.keys())
|
|
600
|
-
]
|
|
601
|
-
else:
|
|
602
|
-
# No overlap, add current to merged and start new
|
|
603
|
-
merged.append(current)
|
|
604
|
-
current = {
|
|
605
|
-
"min_order": range_info["min_order"],
|
|
606
|
-
"max_order": range_info["max_order"],
|
|
607
|
-
"original_chunks": [range_info["original_chunk"]],
|
|
608
|
-
"scores": [range_info["score"]],
|
|
609
|
-
"all_chunks": range_info["all_chunks"],
|
|
610
|
-
}
|
|
611
|
-
|
|
612
|
-
# Add the last range
|
|
613
|
-
merged.append(current)
|
|
614
|
-
return merged
|
|
1235
|
+
)
|
|
1236
|
+
|
|
1237
|
+
return final_results + passthrough
|
|
615
1238
|
|
|
616
1239
|
async def ask(
|
|
617
|
-
self,
|
|
618
|
-
|
|
1240
|
+
self,
|
|
1241
|
+
question: str,
|
|
1242
|
+
system_prompt: str | None = None,
|
|
1243
|
+
filter: str | None = None,
|
|
1244
|
+
) -> "tuple[str, list[Citation]]":
|
|
619
1245
|
"""Ask a question using the configured QA agent.
|
|
620
1246
|
|
|
621
1247
|
Args:
|
|
622
1248
|
question: The question to ask.
|
|
623
|
-
cite: Whether to include citations in the response.
|
|
624
1249
|
system_prompt: Optional custom system prompt for the QA agent.
|
|
1250
|
+
filter: SQL WHERE clause to filter documents.
|
|
625
1251
|
|
|
626
1252
|
Returns:
|
|
627
|
-
|
|
1253
|
+
Tuple of (answer text, list of resolved citations).
|
|
628
1254
|
"""
|
|
629
1255
|
from haiku.rag.qa import get_qa_agent
|
|
630
1256
|
|
|
631
|
-
qa_agent = get_qa_agent(
|
|
632
|
-
|
|
633
|
-
)
|
|
634
|
-
return await qa_agent.answer(question)
|
|
1257
|
+
qa_agent = get_qa_agent(self, config=self._config, system_prompt=system_prompt)
|
|
1258
|
+
return await qa_agent.answer(question, filter=filter)
|
|
635
1259
|
|
|
636
|
-
async def
|
|
637
|
-
"""
|
|
1260
|
+
async def visualize_chunk(self, chunk: Chunk) -> list:
|
|
1261
|
+
"""Render page images with bounding box highlights for a chunk.
|
|
638
1262
|
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
1263
|
+
Gets the DoclingDocument from the chunk's document, resolves bounding boxes
|
|
1264
|
+
from chunk metadata, and renders all pages that contain bounding boxes with
|
|
1265
|
+
yellow/orange highlight overlays.
|
|
642
1266
|
|
|
643
|
-
|
|
644
|
-
|
|
1267
|
+
Args:
|
|
1268
|
+
chunk: The chunk to visualize.
|
|
645
1269
|
|
|
646
|
-
|
|
647
|
-
|
|
1270
|
+
Returns:
|
|
1271
|
+
List of PIL Image objects, one per page with bounding boxes.
|
|
1272
|
+
Empty list if no bounding boxes or page images available.
|
|
648
1273
|
"""
|
|
649
|
-
|
|
650
|
-
|
|
1274
|
+
from copy import deepcopy
|
|
1275
|
+
|
|
1276
|
+
from PIL import ImageDraw
|
|
1277
|
+
|
|
1278
|
+
# Get the document
|
|
1279
|
+
if not chunk.document_id:
|
|
1280
|
+
return []
|
|
1281
|
+
|
|
1282
|
+
doc = await self.document_repository.get_by_id(chunk.document_id)
|
|
1283
|
+
if not doc:
|
|
1284
|
+
return []
|
|
1285
|
+
|
|
1286
|
+
# Get DoclingDocument
|
|
1287
|
+
docling_doc = doc.get_docling_document()
|
|
1288
|
+
if not docling_doc:
|
|
1289
|
+
return []
|
|
1290
|
+
|
|
1291
|
+
# Resolve bounding boxes from chunk metadata
|
|
1292
|
+
chunk_meta = chunk.get_chunk_metadata()
|
|
1293
|
+
bounding_boxes = chunk_meta.resolve_bounding_boxes(docling_doc)
|
|
1294
|
+
if not bounding_boxes:
|
|
1295
|
+
return []
|
|
1296
|
+
|
|
1297
|
+
# Group bounding boxes by page
|
|
1298
|
+
boxes_by_page: dict[int, list] = {}
|
|
1299
|
+
for bbox in bounding_boxes:
|
|
1300
|
+
if bbox.page_no not in boxes_by_page:
|
|
1301
|
+
boxes_by_page[bbox.page_no] = []
|
|
1302
|
+
boxes_by_page[bbox.page_no].append(bbox)
|
|
1303
|
+
|
|
1304
|
+
# Render each page with its bounding boxes
|
|
1305
|
+
images = []
|
|
1306
|
+
for page_no in sorted(boxes_by_page.keys()):
|
|
1307
|
+
if page_no not in docling_doc.pages:
|
|
1308
|
+
continue
|
|
1309
|
+
|
|
1310
|
+
page = docling_doc.pages[page_no]
|
|
1311
|
+
if page.image is None or page.image.pil_image is None:
|
|
1312
|
+
continue
|
|
1313
|
+
|
|
1314
|
+
pil_image = page.image.pil_image
|
|
1315
|
+
page_height = page.size.height
|
|
1316
|
+
|
|
1317
|
+
# Calculate scale factor (image pixels vs document coordinates)
|
|
1318
|
+
scale_x = pil_image.width / page.size.width
|
|
1319
|
+
scale_y = pil_image.height / page.size.height
|
|
651
1320
|
|
|
652
|
-
|
|
653
|
-
|
|
1321
|
+
# Draw bounding boxes
|
|
1322
|
+
image = deepcopy(pil_image)
|
|
1323
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
|
654
1324
|
|
|
1325
|
+
for bbox in boxes_by_page[page_no]:
|
|
1326
|
+
# Convert from document coordinates to image coordinates
|
|
1327
|
+
# Document coords are bottom-left origin, PIL uses top-left
|
|
1328
|
+
x0 = bbox.left * scale_x
|
|
1329
|
+
y0 = (page_height - bbox.top) * scale_y
|
|
1330
|
+
x1 = bbox.right * scale_x
|
|
1331
|
+
y1 = (page_height - bbox.bottom) * scale_y
|
|
1332
|
+
|
|
1333
|
+
# Ensure proper ordering (y0 should be less than y1 for PIL)
|
|
1334
|
+
if y0 > y1:
|
|
1335
|
+
y0, y1 = y1, y0
|
|
1336
|
+
|
|
1337
|
+
# Draw filled rectangle with transparency
|
|
1338
|
+
fill_color = (255, 255, 0, 80) # Yellow with transparency
|
|
1339
|
+
outline_color = (255, 165, 0, 255) # Orange outline
|
|
1340
|
+
|
|
1341
|
+
draw.rectangle([(x0, y0), (x1, y1)], fill=fill_color, outline=None)
|
|
1342
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline=outline_color, width=3)
|
|
1343
|
+
|
|
1344
|
+
images.append(image)
|
|
1345
|
+
|
|
1346
|
+
return images
|
|
1347
|
+
|
|
+    async def rebuild_database(
+        self, mode: RebuildMode = RebuildMode.FULL
+    ) -> AsyncGenerator[str, None]:
+        """Rebuild the database with the specified mode.
+
+        Args:
+            mode: The rebuild mode to use:
+                - FULL: Re-convert from source files, re-chunk, re-embed (default)
+                - RECHUNK: Re-chunk from existing content, re-embed (no source access)
+                - EMBED_ONLY: Keep existing chunks, only regenerate embeddings
+
+        Yields:
+            The ID of the document currently being processed.
+        """
         # Update settings to current config
         settings_repo = SettingsRepository(self.store)
         settings_repo.save_current_settings()
 
         documents = await self.list_documents()
 
+        if mode == RebuildMode.EMBED_ONLY:
+            async for doc_id in self._rebuild_embed_only(documents):
+                yield doc_id
+        elif mode == RebuildMode.RECHUNK:
+            await self.chunk_repository.delete_all()
+            self.store.recreate_embeddings_table()
+            async for doc_id in self._rebuild_rechunk(documents):
+                yield doc_id
+        else: # FULL
+            await self.chunk_repository.delete_all()
+            self.store.recreate_embeddings_table()
+            async for doc_id in self._rebuild_full(documents):
+                yield doc_id
+
+        # Final maintenance if auto_vacuum enabled
+        if self._config.storage.auto_vacuum:
+            try:
+                await self.store.vacuum()
+            except Exception:
+                pass
+
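Note (illustrative usage sketch, not part of the package diff): rebuild_database is an async generator, so a caller drains it to drive the rebuild and can report progress per document. The client instance and RebuildMode value below are assumed to come from the package's public API; nothing here is prescribed by the diff itself.

import asyncio

async def rebuild(client, rebuild_mode) -> None:
    # "client" is an already-constructed haiku.rag client and "rebuild_mode" is one
    # of the RebuildMode members shown above (FULL, RECHUNK, EMBED_ONLY) - both are
    # assumptions about how the surrounding application wires things up.
    count = 0
    async for doc_id in client.rebuild_database(mode=rebuild_mode):
        count += 1
        print(f"rebuilt {doc_id} ({count} documents so far)")

# asyncio.run(rebuild(client, RebuildMode.RECHUNK))  # hypothetical call site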
+    async def _rebuild_embed_only(
+        self, documents: list[Document]
+    ) -> AsyncGenerator[str, None]:
+        """Re-embed all chunks without changing chunk boundaries."""
+        from haiku.rag.embeddings import contextualize
+
+        batch_size = 50
+        pending_records: list = []
+        pending_doc_ids: list[str] = []
+
         for doc in documents:
-            assert doc.id is not None
-
-
-
-
+            assert doc.id is not None
+
+            # Get existing chunks
+            chunks = await self.chunk_repository.get_by_document_id(doc.id)
+            if not chunks:
+                yield doc.id
+                continue
+
+            # Generate new embeddings using contextualize for consistency
+            texts = contextualize(chunks)
+            embeddings = await self.chunk_repository.embedder.embed_documents(texts)
+
+            # Build updated records
+            for chunk, content_fts, embedding in zip(chunks, texts, embeddings):
+                pending_records.append(
+                    self.store.ChunkRecord(
+                        id=chunk.id, # type: ignore[arg-type]
+                        document_id=chunk.document_id, # type: ignore[arg-type]
+                        content=chunk.content,
+                        content_fts=content_fts,
+                        metadata=json.dumps(chunk.metadata),
+                        order=chunk.order,
+                        vector=embedding,
+                    )
+                )
+
+            pending_doc_ids.append(doc.id)
+
+            # Flush batch when size reached
+            if len(pending_doc_ids) >= batch_size:
+                if pending_records:
+                    self.store.chunks_table.merge_insert(
+                        "id"
+                    ).when_matched_update_all().execute(pending_records)
+                for doc_id in pending_doc_ids:
+                    yield doc_id
+                pending_records = []
+                pending_doc_ids = []
+
+        # Flush remaining
+        if pending_records:
+            self.store.chunks_table.merge_insert(
+                "id"
+            ).when_matched_update_all().execute(pending_records)
+        for doc_id in pending_doc_ids:
+            yield doc_id
+
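Note (illustrative, not part of the package diff): the EMBED_ONLY path above updates existing chunk rows in place through LanceDB's merge_insert upsert, keyed on "id". A minimal sketch of that call chain against a throwaway table; the table name, schema, and data are made up:

import lancedb

db = lancedb.connect("/tmp/example-lancedb")
table = db.create_table(
    "chunks_demo",
    data=[{"id": "c1", "content": "old text"}],
    mode="overwrite",
)

# Same call chain as the diff: rows whose "id" matches are updated in place.
table.merge_insert("id").when_matched_update_all().execute(
    [{"id": "c1", "content": "re-embedded text"}]
)
print(table.to_pandas())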
+    async def _flush_rebuild_batch(
+        self, documents: list[Document], chunks: list[Chunk]
+    ) -> None:
+        """Batch write documents and chunks during rebuild.
+
+        This performs two writes: one for all document updates, one for all chunks.
+        Used by RECHUNK and FULL modes after the chunks table has been cleared.
+        """
+        from haiku.rag.store.engine import DocumentRecord
+        from haiku.rag.store.models.document import invalidate_docling_document_cache
+
+        if not documents:
+            return
+
+        now = datetime.now().isoformat()
+
+        # Invalidate cache for all documents being updated
+        for doc in documents:
+            if doc.id:
+                invalidate_docling_document_cache(doc.id)
+
+        # Batch update documents using merge_insert (single LanceDB version)
+        doc_records = [
+            DocumentRecord(
+                id=doc.id, # type: ignore[arg-type]
+                content=doc.content,
+                uri=doc.uri,
+                title=doc.title,
+                metadata=json.dumps(doc.metadata),
+                docling_document_json=doc.docling_document_json,
+                docling_version=doc.docling_version,
+                created_at=doc.created_at.isoformat() if doc.created_at else now,
+                updated_at=now,
+            )
+            for doc in documents
+        ]
+
+        self.store.documents_table.merge_insert("id").when_matched_update_all().execute(
+            doc_records
+        )
+
+        # Batch create all chunks (single LanceDB version)
+        if chunks:
+            await self.chunk_repository.create(chunks)
+
+    async def _rebuild_rechunk(
+        self, documents: list[Document]
+    ) -> AsyncGenerator[str, None]:
+        """Re-chunk and re-embed from existing document content."""
+        from haiku.rag.embeddings import embed_chunks
+
+        batch_size = 50
+        pending_chunks: list[Chunk] = []
+        pending_docs: list[Document] = []
+        pending_doc_ids: list[str] = []
+
+        for doc in documents:
+            assert doc.id is not None
+
+            # Convert content to DoclingDocument
+            docling_document = await self.convert(doc.content)
+
+            # Chunk and embed
+            chunks = await self.chunk(docling_document)
+            embedded_chunks = await embed_chunks(chunks, self._config)
+
+            # Update document fields
+            doc.docling_document_json = docling_document.model_dump_json()
+            doc.docling_version = docling_document.version
+
+            # Prepare chunks with document_id and order
+            for order, chunk in enumerate(embedded_chunks):
+                chunk.document_id = doc.id
+                chunk.order = order
+
+            pending_chunks.extend(embedded_chunks)
+            pending_docs.append(doc)
+            pending_doc_ids.append(doc.id)
+
+            # Flush batch when size reached
+            if len(pending_docs) >= batch_size:
+                await self._flush_rebuild_batch(pending_docs, pending_chunks)
+                for doc_id in pending_doc_ids:
+                    yield doc_id
+                pending_chunks = []
+                pending_docs = []
+                pending_doc_ids = []
+
+        # Flush remaining
+        if pending_docs:
+            await self._flush_rebuild_batch(pending_docs, pending_chunks)
+        for doc_id in pending_doc_ids:
+            yield doc_id
+
+    async def _rebuild_full(
+        self, documents: list[Document]
+    ) -> AsyncGenerator[str, None]:
+        """Full rebuild: re-convert from source, re-chunk, re-embed."""
+        from haiku.rag.embeddings import embed_chunks
+
+        batch_size = 50
+        pending_chunks: list[Chunk] = []
+        pending_docs: list[Document] = []
+        pending_doc_ids: list[str] = []
 
+        for doc in documents:
+            assert doc.id is not None
+
+            # Try to rebuild from source if available
+            if doc.uri and self._check_source_accessible(doc.uri):
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-            if source_accessible:
-                # Source exists - delete and recreate from source
-                try:
-                    await self.delete_document(doc.id)
-                    new_doc = await self.create_document_from_source(
-                        source=doc.uri, metadata=doc.metadata or {}
-                    )
-                    # URIs always point to single files/URLs, never directories
-                    assert isinstance(new_doc, Document)
-                    assert new_doc.id is not None, (
-                        "New document ID should not be None"
-                    )
-                    yield new_doc.id
-                except Exception as e:
-                    logger.error(
-                        "Error recreating document from source %s: %s",
-                        doc.uri,
-                        e,
-                    )
-                    continue
-            else:
-                # Source missing - re-embed from existing content
-                logger.warning(
-                    "Source missing for %s, re-embedding from content", doc.uri
+                    # Flush pending batch before source rebuild (creates new doc)
+                    if pending_docs:
+                        await self._flush_rebuild_batch(pending_docs, pending_chunks)
+                        for doc_id in pending_doc_ids:
+                            yield doc_id
+                        pending_chunks = []
+                        pending_docs = []
+                        pending_doc_ids = []
+
+                    await self.delete_document(doc.id)
+                    new_doc = await self.create_document_from_source(
+                        source=doc.uri, metadata=doc.metadata or {}
                     )
-
-
-
+                    assert isinstance(new_doc, Document)
+                    assert new_doc.id is not None
+                    yield new_doc.id
+                    continue
+                except Exception as e:
+                    logger.error(
+                        "Error recreating document from source %s: %s",
+                        doc.uri,
+                        e,
                     )
-
-
-
-
-
-            doc.
+                    continue
+
+            # Fallback: rebuild from stored content
+            if doc.uri:
+                logger.warning(
+                    "Source missing for %s, re-embedding from content", doc.uri
                 )
-            yield doc.id
 
-
+            docling_document = await self.convert(doc.content)
+            chunks = await self.chunk(docling_document)
+            embedded_chunks = await embed_chunks(chunks, self._config)
+
+            doc.docling_document_json = docling_document.model_dump_json()
+            doc.docling_version = docling_document.version
+
+            # Prepare chunks with document_id and order
+            for order, chunk in enumerate(embedded_chunks):
+                chunk.document_id = doc.id
+                chunk.order = order
+
+            pending_chunks.extend(embedded_chunks)
+            pending_docs.append(doc)
+            pending_doc_ids.append(doc.id)
+
+            # Flush batch when size reached
+            if len(pending_docs) >= batch_size:
+                await self._flush_rebuild_batch(pending_docs, pending_chunks)
+                for doc_id in pending_doc_ids:
+                    yield doc_id
+                pending_chunks = []
+                pending_docs = []
+                pending_doc_ids = []
+
+        # Flush remaining
+        if pending_docs:
+            await self._flush_rebuild_batch(pending_docs, pending_chunks)
+        for doc_id in pending_doc_ids:
+            yield doc_id
+
+    def _check_source_accessible(self, uri: str) -> bool:
+        """Check if a document's source URI is accessible."""
+        parsed_url = urlparse(uri)
         try:
-
+            if parsed_url.scheme == "file":
+                return Path(parsed_url.path).exists()
+            elif parsed_url.scheme in ("http", "https"):
+                return True
+            return False
         except Exception:
-
+            return False
 
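Note (illustrative, not part of the package diff): _check_source_accessible above only trusts the URI scheme: file:// sources are verified on disk, while http(s) sources are assumed reachable without a network call. The standalone stdlib sketch below mirrors that logic outside the class:

from pathlib import Path
from urllib.parse import urlparse

def source_accessible(uri: str) -> bool:
    parsed = urlparse(uri)
    if parsed.scheme == "file":
        return Path(parsed.path).exists()
    if parsed.scheme in ("http", "https"):
        return True
    return False

print(source_accessible("file:///tmp/report.pdf"))     # True only if the file exists
print(source_accessible("https://example.com/a.pdf"))  # True, no request is made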
     async def vacuum(self) -> None:
         """Optimize and clean up old versions across all tables."""
         await self.store.vacuum()
 
+    async def download_models(self) -> AsyncGenerator[DownloadProgress, None]:
+        """Download required models, yielding progress events.
+
+        Yields DownloadProgress events for:
+        - Docling models (status="docling_start", "docling_done")
+        - HuggingFace tokenizer (status="tokenizer_start", "tokenizer_done")
+        - Ollama models (status="pulling", "downloading", "done", or other Ollama statuses)
+        """
+        # Docling models
+        try:
+            from docling.utils.model_downloader import download_models
+
+            yield DownloadProgress(model="docling", status="start")
+            await asyncio.to_thread(download_models)
+            yield DownloadProgress(model="docling", status="done")
+        except ImportError:
+            pass
+
+        # HuggingFace tokenizer
+        from transformers import AutoTokenizer
+
+        tokenizer_name = self._config.processing.chunking_tokenizer
+        yield DownloadProgress(model=tokenizer_name, status="start")
+        await asyncio.to_thread(AutoTokenizer.from_pretrained, tokenizer_name)
+        yield DownloadProgress(model=tokenizer_name, status="done")
+
+        # Collect Ollama models from config
+        required_models: set[str] = set()
+        if self._config.embeddings.model.provider == "ollama":
+            required_models.add(self._config.embeddings.model.name)
+        if self._config.qa.model.provider == "ollama":
+            required_models.add(self._config.qa.model.name)
+        if self._config.research.model.provider == "ollama":
+            required_models.add(self._config.research.model.name)
+        if (
+            self._config.reranking.model
+            and self._config.reranking.model.provider == "ollama"
+        ):
+            required_models.add(self._config.reranking.model.name)
+        pic_desc = self._config.processing.conversion_options.picture_description
+        if pic_desc.enabled and pic_desc.model.provider == "ollama":
+            required_models.add(pic_desc.model.name)
+
+        if not required_models:
+            return
+
+        base_url = self._config.providers.ollama.base_url
+
+        async with httpx.AsyncClient(timeout=None) as client:
+            for model in sorted(required_models):
+                yield DownloadProgress(model=model, status="pulling")
+
+                async with client.stream(
+                    "POST", f"{base_url}/api/pull", json={"model": model}
+                ) as r:
+                    async for line in r.aiter_lines():
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                            status = data.get("status", "")
+                            digest = data.get("digest", "")
+
+                            if digest and "total" in data:
+                                yield DownloadProgress(
+                                    model=model,
+                                    status="downloading",
+                                    total=data.get("total", 0),
+                                    completed=data.get("completed", 0),
+                                    digest=digest,
+                                )
+                            elif status:
+                                yield DownloadProgress(model=model, status=status)
+                        except json.JSONDecodeError:
+                            pass
+
+                yield DownloadProgress(model=model, status="done")
+
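Note (illustrative consumer, not part of the package diff): download_models yields DownloadProgress events, so a caller can render progress as models are pulled. The fields used below (model, status, total, completed) are the ones visible in this hunk; the client instance is assumed to already exist:

async def show_download_progress(client) -> None:
    # Drain the async generator and print a rough progress line per event.
    async for event in client.download_models():
        if event.status == "downloading" and event.total:
            pct = 100 * event.completed / event.total
            print(f"{event.model}: {pct:.0f}%")
        else:
            print(f"{event.model}: {event.status}")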
|
729
1714
|
def close(self):
|
|
730
1715
|
"""Close the underlying store connection."""
|
|
731
1716
|
self.store.close()
|