haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag-slim might be problematic. Click here for more details.

Files changed (94) hide show
  1. haiku/rag/app.py +430 -72
  2. haiku/rag/chunkers/__init__.py +31 -0
  3. haiku/rag/chunkers/base.py +31 -0
  4. haiku/rag/chunkers/docling_local.py +164 -0
  5. haiku/rag/chunkers/docling_serve.py +179 -0
  6. haiku/rag/cli.py +207 -24
  7. haiku/rag/cli_chat.py +489 -0
  8. haiku/rag/client.py +1251 -266
  9. haiku/rag/config/__init__.py +16 -10
  10. haiku/rag/config/loader.py +5 -44
  11. haiku/rag/config/models.py +126 -17
  12. haiku/rag/converters/__init__.py +31 -0
  13. haiku/rag/converters/base.py +63 -0
  14. haiku/rag/converters/docling_local.py +193 -0
  15. haiku/rag/converters/docling_serve.py +229 -0
  16. haiku/rag/converters/text_utils.py +237 -0
  17. haiku/rag/embeddings/__init__.py +123 -24
  18. haiku/rag/embeddings/voyageai.py +175 -20
  19. haiku/rag/graph/__init__.py +0 -11
  20. haiku/rag/graph/agui/__init__.py +8 -2
  21. haiku/rag/graph/agui/cli_renderer.py +1 -1
  22. haiku/rag/graph/agui/emitter.py +219 -31
  23. haiku/rag/graph/agui/server.py +20 -62
  24. haiku/rag/graph/agui/stream.py +1 -2
  25. haiku/rag/graph/research/__init__.py +5 -2
  26. haiku/rag/graph/research/dependencies.py +12 -126
  27. haiku/rag/graph/research/graph.py +390 -135
  28. haiku/rag/graph/research/models.py +91 -112
  29. haiku/rag/graph/research/prompts.py +99 -91
  30. haiku/rag/graph/research/state.py +35 -27
  31. haiku/rag/inspector/__init__.py +8 -0
  32. haiku/rag/inspector/app.py +259 -0
  33. haiku/rag/inspector/widgets/__init__.py +6 -0
  34. haiku/rag/inspector/widgets/chunk_list.py +100 -0
  35. haiku/rag/inspector/widgets/context_modal.py +89 -0
  36. haiku/rag/inspector/widgets/detail_view.py +130 -0
  37. haiku/rag/inspector/widgets/document_list.py +75 -0
  38. haiku/rag/inspector/widgets/info_modal.py +209 -0
  39. haiku/rag/inspector/widgets/search_modal.py +183 -0
  40. haiku/rag/inspector/widgets/visual_modal.py +126 -0
  41. haiku/rag/mcp.py +106 -102
  42. haiku/rag/monitor.py +33 -9
  43. haiku/rag/providers/__init__.py +5 -0
  44. haiku/rag/providers/docling_serve.py +108 -0
  45. haiku/rag/qa/__init__.py +12 -10
  46. haiku/rag/qa/agent.py +43 -61
  47. haiku/rag/qa/prompts.py +35 -57
  48. haiku/rag/reranking/__init__.py +9 -6
  49. haiku/rag/reranking/base.py +1 -1
  50. haiku/rag/reranking/cohere.py +5 -4
  51. haiku/rag/reranking/mxbai.py +5 -2
  52. haiku/rag/reranking/vllm.py +3 -4
  53. haiku/rag/reranking/zeroentropy.py +6 -5
  54. haiku/rag/store/__init__.py +2 -1
  55. haiku/rag/store/engine.py +242 -42
  56. haiku/rag/store/exceptions.py +4 -0
  57. haiku/rag/store/models/__init__.py +8 -2
  58. haiku/rag/store/models/chunk.py +190 -0
  59. haiku/rag/store/models/document.py +46 -0
  60. haiku/rag/store/repositories/chunk.py +141 -121
  61. haiku/rag/store/repositories/document.py +25 -84
  62. haiku/rag/store/repositories/settings.py +11 -14
  63. haiku/rag/store/upgrades/__init__.py +19 -3
  64. haiku/rag/store/upgrades/v0_10_1.py +1 -1
  65. haiku/rag/store/upgrades/v0_19_6.py +65 -0
  66. haiku/rag/store/upgrades/v0_20_0.py +68 -0
  67. haiku/rag/store/upgrades/v0_23_1.py +100 -0
  68. haiku/rag/store/upgrades/v0_9_3.py +3 -3
  69. haiku/rag/utils.py +371 -146
  70. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
  71. haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
  72. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
  73. haiku/rag/chunker.py +0 -65
  74. haiku/rag/embeddings/base.py +0 -25
  75. haiku/rag/embeddings/ollama.py +0 -28
  76. haiku/rag/embeddings/openai.py +0 -26
  77. haiku/rag/embeddings/vllm.py +0 -29
  78. haiku/rag/graph/agui/events.py +0 -254
  79. haiku/rag/graph/common/__init__.py +0 -5
  80. haiku/rag/graph/common/models.py +0 -42
  81. haiku/rag/graph/common/nodes.py +0 -265
  82. haiku/rag/graph/common/prompts.py +0 -46
  83. haiku/rag/graph/common/utils.py +0 -44
  84. haiku/rag/graph/deep_qa/__init__.py +0 -1
  85. haiku/rag/graph/deep_qa/dependencies.py +0 -27
  86. haiku/rag/graph/deep_qa/graph.py +0 -243
  87. haiku/rag/graph/deep_qa/models.py +0 -20
  88. haiku/rag/graph/deep_qa/prompts.py +0 -59
  89. haiku/rag/graph/deep_qa/state.py +0 -56
  90. haiku/rag/graph/research/common.py +0 -87
  91. haiku/rag/reader.py +0 -135
  92. haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
  93. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
  94. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/client.py CHANGED
@@ -1,25 +1,56 @@
1
+ import asyncio
1
2
  import hashlib
3
+ import json
2
4
  import logging
3
5
  import mimetypes
4
6
  import tempfile
5
7
  from collections.abc import AsyncGenerator
8
+ from dataclasses import dataclass
9
+ from datetime import datetime
10
+ from enum import Enum
6
11
  from pathlib import Path
12
+ from typing import TYPE_CHECKING, overload
7
13
  from urllib.parse import urlparse
8
14
 
9
15
  import httpx
10
16
 
11
17
  from haiku.rag.config import AppConfig, Config
18
+ from haiku.rag.converters import get_converter
12
19
  from haiku.rag.reranking import get_reranker
13
20
  from haiku.rag.store.engine import Store
14
- from haiku.rag.store.models.chunk import Chunk
21
+ from haiku.rag.store.models.chunk import Chunk, SearchResult
15
22
  from haiku.rag.store.models.document import Document
16
23
  from haiku.rag.store.repositories.chunk import ChunkRepository
17
24
  from haiku.rag.store.repositories.document import DocumentRepository
18
25
  from haiku.rag.store.repositories.settings import SettingsRepository
19
26
 
27
+ if TYPE_CHECKING:
28
+ from docling_core.types.doc.document import DoclingDocument
29
+
30
+ from haiku.rag.graph.research.models import Citation
31
+
20
32
  logger = logging.getLogger(__name__)
21
33
 
22
34
 
35
+ class RebuildMode(Enum):
36
+ """Mode for rebuilding the database."""
37
+
38
+ FULL = "full" # Re-convert from source, re-chunk, re-embed
39
+ RECHUNK = "rechunk" # Re-chunk from existing content, re-embed
40
+ EMBED_ONLY = "embed_only" # Keep chunks, only regenerate embeddings
41
+
42
+
43
+ @dataclass
44
+ class DownloadProgress:
45
+ """Progress event for model downloads."""
46
+
47
+ model: str
48
+ status: str
49
+ completed: int = 0
50
+ total: int = 0
51
+ digest: str = ""
52
+
53
+
23
54
  class HaikuRAG:
24
55
  """High-level haiku-rag client."""
25
56
 
@@ -28,7 +59,9 @@ class HaikuRAG:
28
59
  db_path: Path | None = None,
29
60
  config: AppConfig = Config,
30
61
  skip_validation: bool = False,
31
- allow_create: bool = True,
62
+ create: bool = False,
63
+ read_only: bool = False,
64
+ before: datetime | None = None,
32
65
  ):
33
66
  """Initialize the RAG client with a database path.
34
67
 
@@ -36,21 +69,31 @@ class HaikuRAG:
36
69
  db_path: Path to the database file. If None, uses config.storage.data_dir.
37
70
  config: Configuration to use. Defaults to global Config.
38
71
  skip_validation: Whether to skip configuration validation on database load.
39
- allow_create: Whether to allow database creation. If False, will raise error
40
- if database doesn't exist (for read operations).
72
+ create: Whether to create the database if it doesn't exist.
73
+ read_only: Whether to open the database in read-only mode.
74
+ before: Query the database as it existed at this datetime.
75
+ Implies read_only=True.
41
76
  """
42
77
  self._config = config
43
78
  if db_path is None:
44
79
  db_path = self._config.storage.data_dir / "haiku.rag.lancedb"
80
+
45
81
  self.store = Store(
46
82
  db_path,
47
83
  config=self._config,
48
84
  skip_validation=skip_validation,
49
- allow_create=allow_create,
85
+ create=create,
86
+ read_only=read_only,
87
+ before=before,
50
88
  )
51
89
  self.document_repository = DocumentRepository(self.store)
52
90
  self.chunk_repository = ChunkRepository(self.store)
53
91
 
92
+ @property
93
+ def is_read_only(self) -> bool:
94
+ """Whether the client is in read-only mode."""
95
+ return self.store.is_read_only
96
+
54
97
  async def __aenter__(self):
55
98
  """Async context manager entry."""
56
99
  return self
@@ -63,65 +106,322 @@ class HaikuRAG:
63
106
  self.close()
64
107
  return False
65
108
 
66
- async def _create_document_with_docling(
109
+ # =========================================================================
110
+ # Processing Primitives
111
+ # =========================================================================
112
+
113
+ @overload
114
+ async def convert(self, source: Path) -> "DoclingDocument": ...
115
+
116
+ @overload
117
+ async def convert(
118
+ self, source: str, *, format: str = "md"
119
+ ) -> "DoclingDocument": ...
120
+
121
+ async def convert(
122
+ self, source: Path | str, *, format: str = "md"
123
+ ) -> "DoclingDocument":
124
+ """Convert a file, URL, or text to DoclingDocument.
125
+
126
+ Args:
127
+ source: One of:
128
+ - Path: Local file path to convert
129
+ - str (URL): HTTP/HTTPS URL to download and convert
130
+ - str (text): Raw text content to convert
131
+ format: The format of text content ("md", "html", or "plain").
132
+ Defaults to "md". Use "plain" for plain text without parsing.
133
+ Only used when source is raw text (not a file path or URL).
134
+ Files and URLs determine format from extension/content-type.
135
+
136
+ Returns:
137
+ DoclingDocument from the converted source.
138
+
139
+ Raises:
140
+ ValueError: If the file doesn't exist or has unsupported extension.
141
+ httpx.RequestError: If URL download fails.
142
+ """
143
+ converter = get_converter(self._config)
144
+
145
+ # Path object - convert file directly
146
+ if isinstance(source, Path):
147
+ if not source.exists():
148
+ raise ValueError(f"File does not exist: {source}")
149
+ if source.suffix.lower() not in converter.supported_extensions:
150
+ raise ValueError(f"Unsupported file extension: {source.suffix}")
151
+ return await converter.convert_file(source)
152
+
153
+ # String - check if URL or text
154
+ parsed = urlparse(source)
155
+
156
+ if parsed.scheme in ("http", "https"):
157
+ # URL - download and convert
158
+ async with httpx.AsyncClient() as http:
159
+ response = await http.get(source)
160
+ response.raise_for_status()
161
+
162
+ content_type = response.headers.get("content-type", "").lower()
163
+ file_extension = self._get_extension_from_content_type_or_url(
164
+ source, content_type
165
+ )
166
+
167
+ if file_extension not in converter.supported_extensions:
168
+ raise ValueError(
169
+ f"Unsupported content type/extension: {content_type}/{file_extension}"
170
+ )
171
+
172
+ with tempfile.NamedTemporaryFile(
173
+ mode="wb", suffix=file_extension, delete=False
174
+ ) as temp_file:
175
+ temp_file.write(response.content)
176
+ temp_file.flush()
177
+ temp_path = Path(temp_file.name)
178
+
179
+ try:
180
+ return await converter.convert_file(temp_path)
181
+ finally:
182
+ temp_path.unlink(missing_ok=True)
183
+
184
+ elif parsed.scheme == "file":
185
+ # file:// URI
186
+ file_path = Path(parsed.path)
187
+ if not file_path.exists():
188
+ raise ValueError(f"File does not exist: {file_path}")
189
+ if file_path.suffix.lower() not in converter.supported_extensions:
190
+ raise ValueError(f"Unsupported file extension: {file_path.suffix}")
191
+ return await converter.convert_file(file_path)
192
+
193
+ else:
194
+ # Treat as text content
195
+ return await converter.convert_text(source, format=format)
196
+
197
+ async def chunk(self, docling_document: "DoclingDocument") -> list[Chunk]:
198
+ """Chunk a DoclingDocument into Chunks.
199
+
200
+ Args:
201
+ docling_document: The DoclingDocument to chunk.
202
+
203
+ Returns:
204
+ List of Chunk objects (without embeddings, without document_id).
205
+ Each chunk has its `order` field set to its position in the list.
206
+ """
207
+ from haiku.rag.chunkers import get_chunker
208
+
209
+ chunker = get_chunker(self._config)
210
+ return await chunker.chunk(docling_document)
211
+
212
+ async def _ensure_chunks_embedded(self, chunks: list[Chunk]) -> list[Chunk]:
213
+ """Ensure all chunks have embeddings, embedding any that don't.
214
+
215
+ Args:
216
+ chunks: List of chunks, some may have embeddings already.
217
+
218
+ Returns:
219
+ List of chunks with all embeddings populated.
220
+ """
221
+ from haiku.rag.embeddings import embed_chunks
222
+
223
+ # Find chunks that need embedding
224
+ chunks_to_embed = [c for c in chunks if c.embedding is None]
225
+
226
+ if not chunks_to_embed:
227
+ return chunks
228
+
229
+ # Embed chunks that don't have embeddings (returns new Chunk objects)
230
+ embedded = await embed_chunks(chunks_to_embed, self._config)
231
+
232
+ # Build result maintaining original order
233
+ embedded_map = {(c.content, c.order): c for c in embedded}
234
+ result = []
235
+ for chunk in chunks:
236
+ if chunk.embedding is not None:
237
+ result.append(chunk)
238
+ else:
239
+ result.append(embedded_map[(chunk.content, chunk.order)])
240
+
241
+ return result
242
+
243
+ async def _store_document_with_chunks(
244
+ self,
245
+ document: Document,
246
+ chunks: list[Chunk],
247
+ ) -> Document:
248
+ """Store a document with chunks, embedding any that lack embeddings.
249
+
250
+ Handles versioning/rollback on failure.
251
+
252
+ Args:
253
+ document: The document to store (will be created).
254
+ chunks: Chunks to store (will be embedded if lacking embeddings).
255
+
256
+ Returns:
257
+ The created Document instance with ID set.
258
+ """
259
+ import asyncio
260
+
261
+ # Ensure all chunks have embeddings before storing
262
+ chunks = await self._ensure_chunks_embedded(chunks)
263
+
264
+ # Snapshot table versions for versioned rollback (if supported)
265
+ versions = self.store.current_table_versions()
266
+
267
+ # Create the document
268
+ created_doc = await self.document_repository.create(document)
269
+
270
+ try:
271
+ assert created_doc.id is not None, (
272
+ "Document ID should not be None after creation"
273
+ )
274
+ # Set document_id and order for all chunks
275
+ for order, chunk in enumerate(chunks):
276
+ chunk.document_id = created_doc.id
277
+ chunk.order = order
278
+
279
+ # Batch create all chunks in a single operation
280
+ await self.chunk_repository.create(chunks)
281
+
282
+ # Vacuum old versions in background (non-blocking) if auto_vacuum enabled
283
+ if self._config.storage.auto_vacuum:
284
+ asyncio.create_task(self.store.vacuum())
285
+
286
+ return created_doc
287
+ except Exception:
288
+ # Roll back to the captured versions and re-raise
289
+ self.store.restore_table_versions(versions)
290
+ raise
291
+
292
+ async def _update_document_with_chunks(
293
+ self,
294
+ document: Document,
295
+ chunks: list[Chunk],
296
+ ) -> Document:
297
+ """Update a document and replace its chunks, embedding any that lack embeddings.
298
+
299
+ Handles versioning/rollback on failure.
300
+
301
+ Args:
302
+ document: The document to update (must have ID set).
303
+ chunks: Chunks to replace existing (will be embedded if lacking embeddings).
304
+
305
+ Returns:
306
+ The updated Document instance.
307
+ """
308
+ import asyncio
309
+
310
+ assert document.id is not None, "Document ID is required for update"
311
+
312
+ # Ensure all chunks have embeddings before storing
313
+ chunks = await self._ensure_chunks_embedded(chunks)
314
+
315
+ # Snapshot table versions for versioned rollback
316
+ versions = self.store.current_table_versions()
317
+
318
+ # Delete existing chunks before writing new ones
319
+ await self.chunk_repository.delete_by_document_id(document.id)
320
+
321
+ try:
322
+ # Update the document
323
+ updated_doc = await self.document_repository.update(document)
324
+
325
+ # Set document_id and order for all chunks
326
+ assert updated_doc.id is not None
327
+ for order, chunk in enumerate(chunks):
328
+ chunk.document_id = updated_doc.id
329
+ chunk.order = order
330
+
331
+ # Batch create all chunks in a single operation
332
+ await self.chunk_repository.create(chunks)
333
+
334
+ # Vacuum old versions in background (non-blocking) if auto_vacuum enabled
335
+ if self._config.storage.auto_vacuum:
336
+ asyncio.create_task(self.store.vacuum())
337
+
338
+ return updated_doc
339
+ except Exception:
340
+ # Roll back to the captured versions and re-raise
341
+ self.store.restore_table_versions(versions)
342
+ raise
343
+
344
+ async def create_document(
67
345
  self,
68
- docling_document,
346
+ content: str,
69
347
  uri: str | None = None,
70
348
  title: str | None = None,
71
349
  metadata: dict | None = None,
72
- chunks: list[Chunk] | None = None,
350
+ format: str = "md",
73
351
  ) -> Document:
74
- """Create a new document from DoclingDocument."""
75
- content = docling_document.export_to_markdown()
352
+ """Create a new document from text content.
353
+
354
+ Converts the content, chunks it, and generates embeddings.
355
+
356
+ Args:
357
+ content: The text content of the document.
358
+ uri: Optional URI identifier for the document.
359
+ title: Optional title for the document.
360
+ metadata: Optional metadata dictionary.
361
+ format: The format of the content ("md", "html", or "plain").
362
+ Defaults to "md". Use "plain" for plain text without parsing.
363
+
364
+ Returns:
365
+ The created Document instance.
366
+ """
367
+ from haiku.rag.embeddings import embed_chunks
368
+
369
+ # Convert → Chunk → Embed using primitives
370
+ docling_document = await self.convert(content, format=format)
371
+ chunks = await self.chunk(docling_document)
372
+ embedded_chunks = await embed_chunks(chunks, self._config)
373
+
374
+ # Store markdown export as content for better display/readability
375
+ # The original content is preserved in docling_document_json
376
+ stored_content = docling_document.export_to_markdown()
377
+
378
+ # Create document model
76
379
  document = Document(
77
- content=content,
380
+ content=stored_content,
78
381
  uri=uri,
79
382
  title=title,
80
383
  metadata=metadata or {},
81
- )
82
- return await self.document_repository._create_and_chunk(
83
- document, docling_document, chunks
384
+ docling_document_json=docling_document.model_dump_json(),
385
+ docling_version=docling_document.version,
84
386
  )
85
387
 
86
- async def create_document(
388
+ # Store document and chunks
389
+ return await self._store_document_with_chunks(document, embedded_chunks)
390
+
391
+ async def import_document(
87
392
  self,
88
- content: str,
393
+ docling_document: "DoclingDocument",
394
+ chunks: list[Chunk],
89
395
  uri: str | None = None,
90
396
  title: str | None = None,
91
397
  metadata: dict | None = None,
92
- chunks: list[Chunk] | None = None,
93
398
  ) -> Document:
94
- """Create a new document with optional URI and metadata.
399
+ """Import a pre-processed document with chunks.
400
+
401
+ Use this when document conversion, chunking, and embedding were done
402
+ externally and you want to store the results in haiku.rag.
95
403
 
96
404
  Args:
97
- content: The text content of the document.
405
+ docling_document: The DoclingDocument to import.
406
+ chunks: Pre-created chunks. Chunks without embeddings will be
407
+ automatically embedded.
98
408
  uri: Optional URI identifier for the document.
409
+ title: Optional title for the document.
99
410
  metadata: Optional metadata dictionary.
100
- chunks: Optional list of pre-created chunks to use instead of generating new ones.
101
411
 
102
412
  Returns:
103
413
  The created Document instance.
104
414
  """
105
415
  document = Document(
106
- content=content,
416
+ content=docling_document.export_to_markdown(),
107
417
  uri=uri,
108
418
  title=title,
109
419
  metadata=metadata or {},
420
+ docling_document_json=docling_document.model_dump_json(),
421
+ docling_version=docling_document.version,
110
422
  )
111
423
 
112
- # Only create docling_document if we need to generate chunks
113
- if chunks is None:
114
- # Lazy import to avoid loading docling
115
- from haiku.rag.utils import text_to_docling_document
116
-
117
- docling_document = text_to_docling_document(content)
118
- else:
119
- # Chunks already provided, no conversion needed
120
- docling_document = None
121
-
122
- return await self.document_repository._create_and_chunk(
123
- document, docling_document, chunks
124
- )
424
+ return await self._store_document_with_chunks(document, chunks)
125
425
 
126
426
  async def create_document_from_source(
127
427
  self, source: str | Path, title: str | None = None, metadata: dict | None = None
@@ -201,12 +501,12 @@ class HaikuRAG:
201
501
  Raises:
202
502
  ValueError: If the file cannot be parsed or doesn't exist
203
503
  """
204
- # Lazy import to avoid loading docling
205
- from haiku.rag.reader import FileReader
504
+ from haiku.rag.embeddings import embed_chunks
206
505
 
207
506
  metadata = metadata or {}
208
507
 
209
- if source_path.suffix.lower() not in FileReader.extensions:
508
+ converter = get_converter(self._config)
509
+ if source_path.suffix.lower() not in converter.supported_extensions:
210
510
  raise ValueError(f"Unsupported file extension: {source_path.suffix}")
211
511
 
212
512
  if not source_path.exists():
@@ -241,26 +541,33 @@ class HaikuRAG:
241
541
  return await self.document_repository.update(existing_doc)
242
542
  return existing_doc
243
543
 
244
- # Parse file only when content changed or new document
245
- docling_document = FileReader.parse_file(source_path)
544
+ # Convert Chunk Embed using primitives
545
+ docling_document = await self.convert(source_path)
546
+ chunks = await self.chunk(docling_document)
547
+ embedded_chunks = await embed_chunks(chunks, self._config)
246
548
 
247
549
  if existing_doc:
248
- # Update existing document
550
+ # Update existing document and rechunk
249
551
  existing_doc.content = docling_document.export_to_markdown()
250
552
  existing_doc.metadata = metadata
553
+ existing_doc.docling_document_json = docling_document.model_dump_json()
554
+ existing_doc.docling_version = docling_document.version
251
555
  if title is not None:
252
556
  existing_doc.title = title
253
- return await self.document_repository._update_and_rechunk(
254
- existing_doc, docling_document
557
+ return await self._update_document_with_chunks(
558
+ existing_doc, embedded_chunks
255
559
  )
256
560
  else:
257
- # Create new document using DoclingDocument
258
- return await self._create_document_with_docling(
259
- docling_document=docling_document,
561
+ # Create new document
562
+ document = Document(
563
+ content=docling_document.export_to_markdown(),
260
564
  uri=uri,
261
565
  title=title,
262
566
  metadata=metadata,
567
+ docling_document_json=docling_document.model_dump_json(),
568
+ docling_version=docling_document.version,
263
569
  )
570
+ return await self._store_document_with_chunks(document, embedded_chunks)
264
571
 
265
572
  async def _create_or_update_document_from_url(
266
573
  self, url: str, title: str | None = None, metadata: dict | None = None
@@ -283,11 +590,13 @@ class HaikuRAG:
283
590
  ValueError: If the content cannot be parsed
284
591
  httpx.RequestError: If URL request fails
285
592
  """
286
- # Lazy import to avoid loading docling
287
- from haiku.rag.reader import FileReader
593
+ from haiku.rag.embeddings import embed_chunks
288
594
 
289
595
  metadata = metadata or {}
290
596
 
597
+ converter = get_converter(self._config)
598
+ supported_extensions = converter.supported_extensions
599
+
291
600
  async with httpx.AsyncClient() as client:
292
601
  response = await client.get(url)
293
602
  response.raise_for_status()
@@ -320,40 +629,52 @@ class HaikuRAG:
320
629
  url, content_type
321
630
  )
322
631
 
323
- if file_extension not in FileReader.extensions:
632
+ if file_extension not in supported_extensions:
324
633
  raise ValueError(
325
634
  f"Unsupported content type/extension: {content_type}/{file_extension}"
326
635
  )
327
636
 
328
637
  # Create a temporary file with the appropriate extension
329
638
  with tempfile.NamedTemporaryFile(
330
- mode="wb", suffix=file_extension
639
+ mode="wb", suffix=file_extension, delete=False
331
640
  ) as temp_file:
332
641
  temp_file.write(response.content)
333
- temp_file.flush() # Ensure content is written to disk
642
+ temp_file.flush()
334
643
  temp_path = Path(temp_file.name)
335
644
 
336
- # Parse the content using FileReader
337
- docling_document = FileReader.parse_file(temp_path)
645
+ try:
646
+ # Convert → Chunk → Embed using primitives
647
+ docling_document = await self.convert(temp_path)
648
+ chunks = await self.chunk(docling_document)
649
+ embedded_chunks = await embed_chunks(chunks, self._config)
650
+ finally:
651
+ temp_path.unlink(missing_ok=True)
338
652
 
339
653
  # Merge metadata with contentType and md5
340
654
  metadata.update({"contentType": content_type, "md5": md5_hash})
341
655
 
342
656
  if existing_doc:
657
+ # Update existing document and rechunk
343
658
  existing_doc.content = docling_document.export_to_markdown()
344
659
  existing_doc.metadata = metadata
660
+ existing_doc.docling_document_json = docling_document.model_dump_json()
661
+ existing_doc.docling_version = docling_document.version
345
662
  if title is not None:
346
663
  existing_doc.title = title
347
- return await self.document_repository._update_and_rechunk(
348
- existing_doc, docling_document
664
+ return await self._update_document_with_chunks(
665
+ existing_doc, embedded_chunks
349
666
  )
350
667
  else:
351
- return await self._create_document_with_docling(
352
- docling_document=docling_document,
668
+ # Create new document
669
+ document = Document(
670
+ content=docling_document.export_to_markdown(),
353
671
  uri=url,
354
672
  title=title,
355
673
  metadata=metadata,
674
+ docling_document_json=docling_document.model_dump_json(),
675
+ docling_version=docling_document.version,
356
676
  )
677
+ return await self._store_document_with_chunks(document, embedded_chunks)
357
678
 
358
679
  def _get_extension_from_content_type_or_url(
359
680
  self, url: str, content_type: str
@@ -408,17 +729,93 @@ class HaikuRAG:
408
729
  """
409
730
  return await self.document_repository.get_by_uri(uri)
410
731
 
411
- async def update_document(self, document: Document) -> Document:
412
- """Update an existing document."""
413
- # Lazy import to avoid loading docling
414
- from haiku.rag.utils import text_to_docling_document
732
+ async def update_document(
733
+ self,
734
+ document_id: str,
735
+ content: str | None = None,
736
+ metadata: dict | None = None,
737
+ chunks: list[Chunk] | None = None,
738
+ title: str | None = None,
739
+ docling_document: "DoclingDocument | None" = None,
740
+ ) -> Document:
741
+ """Update a document by ID.
415
742
 
416
- # Convert content to DoclingDocument
417
- docling_document = text_to_docling_document(document.content)
743
+ Updates specified fields. When content or docling_document is provided,
744
+ the document is rechunked and re-embedded. Updates to only metadata or title
745
+ skip rechunking for efficiency.
418
746
 
419
- return await self.document_repository._update_and_rechunk(
420
- document, docling_document
421
- )
747
+ Args:
748
+ document_id: The ID of the document to update.
749
+ content: New content (mutually exclusive with docling_document).
750
+ metadata: New metadata dict.
751
+ chunks: Custom chunks (will be embedded if missing embeddings).
752
+ title: New title.
753
+ docling_document: DoclingDocument to replace content (mutually exclusive with content).
754
+
755
+ Returns:
756
+ The updated Document instance.
757
+
758
+ Raises:
759
+ ValueError: If document not found, or if both content and docling_document
760
+ are provided.
761
+ """
762
+ from haiku.rag.embeddings import embed_chunks
763
+
764
+ # Validate: content and docling_document are mutually exclusive
765
+ if content is not None and docling_document is not None:
766
+ raise ValueError(
767
+ "content and docling_document are mutually exclusive. "
768
+ "Provide one or the other, not both."
769
+ )
770
+
771
+ # Fetch the existing document
772
+ existing_doc = await self.get_document_by_id(document_id)
773
+ if existing_doc is None:
774
+ raise ValueError(f"Document with ID {document_id} not found")
775
+
776
+ # Update metadata/title fields
777
+ if title is not None:
778
+ existing_doc.title = title
779
+ if metadata is not None:
780
+ existing_doc.metadata = metadata
781
+
782
+ # Only metadata/title update - no rechunking needed
783
+ if content is None and chunks is None and docling_document is None:
784
+ return await self.document_repository.update(existing_doc)
785
+
786
+ # Custom chunks provided - use them as-is
787
+ if chunks is not None:
788
+ # Store docling data if provided
789
+ if docling_document is not None:
790
+ existing_doc.content = docling_document.export_to_markdown()
791
+ existing_doc.docling_document_json = docling_document.model_dump_json()
792
+ existing_doc.docling_version = docling_document.version
793
+ elif content is not None:
794
+ existing_doc.content = content
795
+
796
+ return await self._update_document_with_chunks(existing_doc, chunks)
797
+
798
+ # DoclingDocument provided without chunks - chunk and embed using primitives
799
+ if docling_document is not None:
800
+ existing_doc.content = docling_document.export_to_markdown()
801
+ existing_doc.docling_document_json = docling_document.model_dump_json()
802
+ existing_doc.docling_version = docling_document.version
803
+
804
+ new_chunks = await self.chunk(docling_document)
805
+ embedded_chunks = await embed_chunks(new_chunks, self._config)
806
+ return await self._update_document_with_chunks(
807
+ existing_doc, embedded_chunks
808
+ )
809
+
810
+ # Content provided without chunks - convert, chunk, and embed using primitives
811
+ existing_doc.content = content # type: ignore[assignment]
812
+ converted_docling = await self.convert(existing_doc.content)
813
+ existing_doc.docling_document_json = converted_docling.model_dump_json()
814
+ existing_doc.docling_version = converted_docling.version
815
+
816
+ new_chunks = await self.chunk(converted_docling)
817
+ embedded_chunks = await embed_chunks(new_chunks, self._config)
818
+ return await self._update_document_with_chunks(existing_doc, embedded_chunks)
422
819
 
423
820
  async def delete_document(self, document_id: str) -> bool:
424
821
  """Delete a document by its ID."""
@@ -447,285 +844,873 @@ class HaikuRAG:
447
844
  async def search(
448
845
  self,
449
846
  query: str,
450
- limit: int = 5,
847
+ limit: int | None = None,
451
848
  search_type: str = "hybrid",
452
849
  filter: str | None = None,
453
- ) -> list[tuple[Chunk, float]]:
850
+ ) -> list[SearchResult]:
454
851
  """Search for relevant chunks using the specified search method with optional reranking.
455
852
 
456
853
  Args:
457
854
  query: The search query string.
458
- limit: Maximum number of results to return.
855
+ limit: Maximum number of results to return. Defaults to config.search.default_limit.
459
856
  search_type: Type of search - "vector", "fts", or "hybrid" (default).
460
857
  filter: Optional SQL WHERE clause to filter documents before searching chunks.
461
858
 
462
859
  Returns:
463
- List of (chunk, score) tuples ordered by relevance.
860
+ List of SearchResult objects ordered by relevance.
464
861
  """
465
- # Get reranker if available
862
+ if limit is None:
863
+ limit = self._config.search.limit
864
+
466
865
  reranker = get_reranker(config=self._config)
467
866
 
468
867
  if reranker is None:
469
- # No reranking - return direct search results
470
- return await self.chunk_repository.search(query, limit, search_type, filter)
471
-
472
- # Get more initial results (3X) for reranking
473
- search_limit = limit * 3
474
- search_results = await self.chunk_repository.search(
475
- query, search_limit, search_type, filter
476
- )
477
-
478
- # Apply reranking
479
- chunks = [chunk for chunk, _ in search_results]
480
- reranked_results = await reranker.rerank(query, chunks, top_n=limit)
868
+ chunk_results = await self.chunk_repository.search(
869
+ query, limit, search_type, filter
870
+ )
871
+ else:
872
+ search_limit = limit * 10
873
+ raw_results = await self.chunk_repository.search(
874
+ query, search_limit, search_type, filter
875
+ )
876
+ chunks = [chunk for chunk, _ in raw_results]
877
+ chunk_results = await reranker.rerank(query, chunks, top_n=limit)
481
878
 
482
- # Return reranked results with scores from reranker
483
- return reranked_results
879
+ return [SearchResult.from_chunk(chunk, score) for chunk, score in chunk_results]
484
880
 
485
881
  async def expand_context(
486
882
  self,
487
- search_results: list[tuple[Chunk, float]],
488
- radius: int | None = None,
489
- ) -> list[tuple[Chunk, float]]:
490
- """Expand search results with adjacent chunks, merging overlapping chunks.
883
+ search_results: list[SearchResult],
884
+ ) -> list[SearchResult]:
885
+ """Expand search results with adjacent content from the source document.
886
+
887
+ When DoclingDocument is available and results have doc_item_refs, expands
888
+ by finding adjacent DocItems with accurate bounding boxes and metadata.
889
+ Otherwise, falls back to chunk-based expansion using adjacent chunks.
890
+
891
+ Expansion is type-aware based on content:
892
+ - Tables, code blocks, and lists expand to include complete structures
893
+ - Text content uses the configured radius (search.context_radius)
894
+ - Expansion is limited by search.max_context_items and search.max_context_chars
491
895
 
492
896
  Args:
493
- search_results: List of (chunk, score) tuples from search.
494
- radius: Number of adjacent chunks to include before/after each chunk.
495
- If None, uses config.processing.context_chunk_radius.
897
+ search_results: List of SearchResult objects from search.
496
898
 
497
899
  Returns:
498
- List of (chunk, score) tuples with expanded and merged context chunks.
900
+ List of SearchResult objects with expanded content and resolved provenance.
499
901
  """
500
- if radius is None:
501
- radius = self._config.processing.context_chunk_radius
502
- if radius == 0:
503
- return search_results
504
-
505
- # Group chunks by document_id to handle merging within documents
506
- document_groups = {}
507
- for chunk, score in search_results:
508
- doc_id = chunk.document_id
902
+ radius = self._config.search.context_radius
903
+ max_items = self._config.search.max_context_items
904
+ max_chars = self._config.search.max_context_chars
905
+
906
+ # Group by document_id for efficient processing
907
+ document_groups: dict[str | None, list[SearchResult]] = {}
908
+ for result in search_results:
909
+ doc_id = result.document_id
509
910
  if doc_id not in document_groups:
510
911
  document_groups[doc_id] = []
511
- document_groups[doc_id].append((chunk, score))
912
+ document_groups[doc_id].append(result)
913
+
914
+ expanded_results = []
915
+
916
+ for doc_id, doc_results in document_groups.items():
917
+ if doc_id is None:
918
+ expanded_results.extend(doc_results)
919
+ continue
920
+
921
+ # Fetch the document to get DoclingDocument
922
+ doc = await self.get_document_by_id(doc_id)
923
+ if doc is None:
924
+ expanded_results.extend(doc_results)
925
+ continue
926
+
927
+ docling_doc = doc.get_docling_document()
928
+
929
+ # Check if we can use DoclingDocument-based expansion
930
+ has_docling = docling_doc is not None
931
+ has_refs = any(r.doc_item_refs for r in doc_results)
932
+
933
+ if has_docling and has_refs:
934
+ # Use DoclingDocument-based expansion
935
+ expanded = await self._expand_with_docling(
936
+ doc_results,
937
+ docling_doc,
938
+ radius,
939
+ max_items,
940
+ max_chars,
941
+ )
942
+ expanded_results.extend(expanded)
943
+ else:
944
+ # Fall back to chunk-based expansion (always uses fixed radius)
945
+ if radius > 0:
946
+ expanded = await self._expand_with_chunks(
947
+ doc_id, doc_results, radius
948
+ )
949
+ expanded_results.extend(expanded)
950
+ else:
951
+ expanded_results.extend(doc_results)
512
952
 
513
- results = []
953
+ return expanded_results
514
954
 
515
- for doc_id, doc_chunks in document_groups.items():
516
- # Get all expanded ranges for this document
517
- expanded_ranges = []
518
- for chunk, score in doc_chunks:
519
- adjacent_chunks = await self.chunk_repository.get_adjacent_chunks(
520
- chunk, radius
521
- )
955
+ def _merge_ranges(
956
+ self, ranges: list[tuple[int, int, SearchResult]]
957
+ ) -> list[tuple[int, int, list[SearchResult]]]:
958
+ """Merge overlapping or adjacent ranges."""
959
+ if not ranges:
960
+ return []
522
961
 
523
- all_chunks = adjacent_chunks + [chunk]
524
-
525
- # Get the range of orders for this expanded chunk
526
- orders = [c.order for c in all_chunks]
527
- min_order = min(orders)
528
- max_order = max(orders)
529
-
530
- expanded_ranges.append(
531
- {
532
- "original_chunk": chunk,
533
- "score": score,
534
- "min_order": min_order,
535
- "max_order": max_order,
536
- "all_chunks": sorted(all_chunks, key=lambda c: c.order),
537
- }
538
- )
962
+ sorted_ranges = sorted(ranges, key=lambda x: x[0])
963
+ merged: list[tuple[int, int, list[SearchResult]]] = []
964
+ cur_min, cur_max, cur_results = (
965
+ sorted_ranges[0][0],
966
+ sorted_ranges[0][1],
967
+ [sorted_ranges[0][2]],
968
+ )
539
969
 
540
- # Merge overlapping/adjacent ranges
541
- merged_ranges = self._merge_overlapping_ranges(expanded_ranges)
970
+ for min_idx, max_idx, result in sorted_ranges[1:]:
971
+ if cur_max >= min_idx - 1: # Overlapping or adjacent
972
+ cur_max = max(cur_max, max_idx)
973
+ cur_results.append(result)
974
+ else:
975
+ merged.append((cur_min, cur_max, cur_results))
976
+ cur_min, cur_max, cur_results = min_idx, max_idx, [result]
542
977
 
543
- # Create merged chunks
544
- for merged_range in merged_ranges:
545
- combined_content_parts = [c.content for c in merged_range["all_chunks"]]
978
+ merged.append((cur_min, cur_max, cur_results))
979
+ return merged
546
980
 
547
- # Use the first original chunk for metadata
548
- original_chunk = merged_range["original_chunks"][0]
981
+ # Label groups for type-aware expansion
982
+ _STRUCTURAL_LABELS = {"table", "code", "list_item", "form", "key_value_region"}
549
983
 
550
- merged_chunk = Chunk(
551
- id=original_chunk.id,
552
- document_id=original_chunk.document_id,
553
- content="".join(combined_content_parts),
554
- metadata=original_chunk.metadata,
555
- document_uri=original_chunk.document_uri,
556
- document_title=original_chunk.document_title,
557
- document_meta=original_chunk.document_meta,
558
- )
984
+ def _extract_item_text(self, item, docling_doc) -> str | None:
985
+ """Extract text content from a DocItem.
559
986
 
560
- # Use the highest score from merged chunks
561
- best_score = max(merged_range["scores"])
562
- results.append((merged_chunk, best_score))
987
+ Handles different item types:
988
+ - TextItem, SectionHeaderItem, etc.: Use .text attribute
989
+ - TableItem: Use export_to_markdown() for table content
990
+ - PictureItem: Use caption if available
991
+ """
992
+ # Try simple text attribute first (works for most items)
993
+ if text := getattr(item, "text", None):
994
+ return text
995
+
996
+ # For tables, export as markdown
997
+ if hasattr(item, "export_to_markdown"):
998
+ try:
999
+ return item.export_to_markdown(docling_doc)
1000
+ except Exception:
1001
+ pass
1002
+
1003
+ # For pictures/charts, try to get caption
1004
+ if caption := getattr(item, "caption", None):
1005
+ if hasattr(caption, "text"):
1006
+ return caption.text
1007
+
1008
+ return None
1009
+
1010
+ def _get_item_label(self, item) -> str | None:
1011
+ """Extract label string from a DocItem."""
1012
+ label = getattr(item, "label", None)
1013
+ if label is None:
1014
+ return None
1015
+ return str(label.value) if hasattr(label, "value") else str(label)
1016
+
1017
+ def _compute_type_aware_range(
1018
+ self,
1019
+ all_items: list,
1020
+ indices: list[int],
1021
+ radius: int,
1022
+ max_items: int,
1023
+ max_chars: int,
1024
+ ) -> tuple[int, int]:
1025
+ """Compute expansion range based on content type with limits.
1026
+
1027
+ For structural content (tables, code, lists), expands to include complete
1028
+ structures. For text, uses the configured radius. Applies hybrid limits.
1029
+ """
1030
+ if not indices:
1031
+ return (0, 0)
1032
+
1033
+ min_idx = min(indices)
1034
+ max_idx = max(indices)
1035
+
1036
+ # Determine the primary label type from matched items
1037
+ labels_in_chunk = set()
1038
+ for idx in indices:
1039
+ item, _ = all_items[idx]
1040
+ if label := self._get_item_label(item):
1041
+ labels_in_chunk.add(label)
1042
+
1043
+ # Check if we have structural content
1044
+ is_structural = bool(labels_in_chunk & self._STRUCTURAL_LABELS)
1045
+
1046
+ if is_structural:
1047
+ # Expand to complete structure boundaries
1048
+ # Expand backwards to find structure start
1049
+ while min_idx > 0:
1050
+ prev_item, _ = all_items[min_idx - 1]
1051
+ prev_label = self._get_item_label(prev_item)
1052
+ if prev_label in labels_in_chunk & self._STRUCTURAL_LABELS:
1053
+ min_idx -= 1
1054
+ else:
1055
+ break
1056
+
1057
+ # Expand forwards to find structure end
1058
+ while max_idx < len(all_items) - 1:
1059
+ next_item, _ = all_items[max_idx + 1]
1060
+ next_label = self._get_item_label(next_item)
1061
+ if next_label in labels_in_chunk & self._STRUCTURAL_LABELS:
1062
+ max_idx += 1
1063
+ else:
1064
+ break
1065
+ else:
1066
+ # Text content: use radius-based expansion
1067
+ min_idx = max(0, min_idx - radius)
1068
+ max_idx = min(len(all_items) - 1, max_idx + radius)
1069
+
1070
+ # Apply hybrid limits
1071
+ # First check item count hard limit
1072
+ if max_idx - min_idx + 1 > max_items:
1073
+ # Center the window around original indices
1074
+ original_center = (min(indices) + max(indices)) // 2
1075
+ half_items = max_items // 2
1076
+ min_idx = max(0, original_center - half_items)
1077
+ max_idx = min(len(all_items) - 1, min_idx + max_items - 1)
1078
+
1079
+ # Then check character soft limit (but keep at least original items)
1080
+ char_count = 0
1081
+ effective_max = min_idx
1082
+ for i in range(min_idx, max_idx + 1):
1083
+ item, _ = all_items[i]
1084
+ text = getattr(item, "text", "") or ""
1085
+ char_count += len(text)
1086
+ effective_max = i
1087
+ # Once we've included original items, check char limit
1088
+ if i >= max(indices) and char_count > max_chars:
1089
+ break
1090
+
1091
+ max_idx = effective_max
1092
+
1093
+ return (min_idx, max_idx)
1094
+
1095
+ async def _expand_with_docling(
1096
+ self,
1097
+ results: list[SearchResult],
1098
+ docling_doc,
1099
+ radius: int,
1100
+ max_items: int,
1101
+ max_chars: int,
1102
+ ) -> list[SearchResult]:
1103
+ """Expand results using DoclingDocument structure.
1104
+
1105
+ Structural content (tables, code, lists) expands to complete structures.
1106
+ Text content uses radius-based expansion.
1107
+ """
1108
+ all_items = list(docling_doc.iterate_items())
1109
+ ref_to_index = {
1110
+ getattr(item, "self_ref", None): i
1111
+ for i, (item, _) in enumerate(all_items)
1112
+ if getattr(item, "self_ref", None)
1113
+ }
563
1114
 
564
- return results
1115
+ # Compute expanded ranges
1116
+ ranges: list[tuple[int, int, SearchResult]] = []
1117
+ passthrough: list[SearchResult] = []
565
1118
 
566
- def _merge_overlapping_ranges(self, expanded_ranges):
567
- """Merge overlapping or adjacent expanded ranges."""
568
- if not expanded_ranges:
569
- return []
1119
+ for result in results:
1120
+ indices = [
1121
+ ref_to_index[r] for r in result.doc_item_refs if r in ref_to_index
1122
+ ]
1123
+ if not indices:
1124
+ passthrough.append(result)
1125
+ continue
570
1126
 
571
- # Sort by min_order
572
- sorted_ranges = sorted(expanded_ranges, key=lambda x: x["min_order"])
573
- merged = []
1127
+ min_idx, max_idx = self._compute_type_aware_range(
1128
+ all_items, indices, radius, max_items, max_chars
1129
+ )
574
1130
 
575
- current = {
576
- "min_order": sorted_ranges[0]["min_order"],
577
- "max_order": sorted_ranges[0]["max_order"],
578
- "original_chunks": [sorted_ranges[0]["original_chunk"]],
579
- "scores": [sorted_ranges[0]["score"]],
580
- "all_chunks": sorted_ranges[0]["all_chunks"],
581
- }
1131
+ ranges.append((min_idx, max_idx, result))
1132
+
1133
+ # Merge overlapping ranges
1134
+ merged = self._merge_ranges(ranges)
1135
+
1136
+ final_results: list[SearchResult] = []
1137
+ for min_idx, max_idx, original_results in merged:
1138
+ content_parts: list[str] = []
1139
+ refs: list[str] = []
1140
+ pages: set[int] = set()
1141
+ labels: set[str] = set()
1142
+
1143
+ for i in range(min_idx, max_idx + 1):
1144
+ item, _ = all_items[i]
1145
+ # Extract text content - handle different item types
1146
+ text = self._extract_item_text(item, docling_doc)
1147
+ if text:
1148
+ content_parts.append(text)
1149
+ if self_ref := getattr(item, "self_ref", None):
1150
+ refs.append(self_ref)
1151
+ if label := getattr(item, "label", None):
1152
+ labels.add(
1153
+ str(label.value) if hasattr(label, "value") else str(label)
1154
+ )
1155
+ if prov := getattr(item, "prov", None):
1156
+ for p in prov:
1157
+ if (page_no := getattr(p, "page_no", None)) is not None:
1158
+ pages.add(page_no)
1159
+
1160
+ # Merge headings preserving order
1161
+ all_headings: list[str] = []
1162
+ for r in original_results:
1163
+ if r.headings:
1164
+ all_headings.extend(h for h in r.headings if h not in all_headings)
1165
+
1166
+ first = original_results[0]
1167
+ final_results.append(
1168
+ SearchResult(
1169
+ content="\n\n".join(content_parts),
1170
+ score=max(r.score for r in original_results),
1171
+ chunk_id=first.chunk_id,
1172
+ document_id=first.document_id,
1173
+ document_uri=first.document_uri,
1174
+ document_title=first.document_title,
1175
+ doc_item_refs=refs,
1176
+ page_numbers=sorted(pages),
1177
+ headings=all_headings or None,
1178
+ labels=sorted(labels),
1179
+ )
1180
+ )
582
1181
 
583
- for range_info in sorted_ranges[1:]:
584
- # Check if ranges overlap or are adjacent (max_order + 1 >= min_order)
585
- if current["max_order"] >= range_info["min_order"] - 1:
586
- # Merge ranges
587
- current["max_order"] = max(
588
- current["max_order"], range_info["max_order"]
1182
+ return final_results + passthrough
1183
+
1184
+ async def _expand_with_chunks(
1185
+ self,
1186
+ doc_id: str,
1187
+ results: list[SearchResult],
1188
+ radius: int,
1189
+ ) -> list[SearchResult]:
1190
+ """Expand results using chunk-based adjacency."""
1191
+ all_chunks = await self.chunk_repository.get_by_document_id(doc_id)
1192
+ if not all_chunks:
1193
+ return results
1194
+
1195
+ content_to_chunk = {c.content: c for c in all_chunks}
1196
+ chunk_by_order = {c.order: c for c in all_chunks}
1197
+ min_order, max_order = min(chunk_by_order.keys()), max(chunk_by_order.keys())
1198
+
1199
+ # Build ranges
1200
+ ranges: list[tuple[int, int, SearchResult]] = []
1201
+ passthrough: list[SearchResult] = []
1202
+
1203
+ for result in results:
1204
+ chunk = content_to_chunk.get(result.content)
1205
+ if chunk is None:
1206
+ passthrough.append(result)
1207
+ continue
1208
+ start = max(min_order, chunk.order - radius)
1209
+ end = min(max_order, chunk.order + radius)
1210
+ ranges.append((start, end, result))
1211
+
1212
+ # Merge and build results
1213
+ final_results: list[SearchResult] = []
1214
+ for min_idx, max_idx, original_results in self._merge_ranges(ranges):
1215
+ # Collect chunks in order
1216
+ chunks_in_range = [
1217
+ chunk_by_order[o]
1218
+ for o in range(min_idx, max_idx + 1)
1219
+ if o in chunk_by_order
1220
+ ]
1221
+ first = original_results[0]
1222
+ final_results.append(
1223
+ SearchResult(
1224
+ content="".join(c.content for c in chunks_in_range),
1225
+ score=max(r.score for r in original_results),
1226
+ chunk_id=first.chunk_id,
1227
+ document_id=first.document_id,
1228
+ document_uri=first.document_uri,
1229
+ document_title=first.document_title,
1230
+ doc_item_refs=first.doc_item_refs,
1231
+ page_numbers=first.page_numbers,
1232
+ headings=first.headings,
1233
+ labels=first.labels,
589
1234
  )
590
- current["original_chunks"].append(range_info["original_chunk"])
591
- current["scores"].append(range_info["score"])
592
-
593
- # Merge all_chunks and deduplicate by order
594
- all_chunks_dict = {}
595
- for chunk in current["all_chunks"] + range_info["all_chunks"]:
596
- order = chunk.order
597
- all_chunks_dict[order] = chunk
598
- current["all_chunks"] = [
599
- all_chunks_dict[order] for order in sorted(all_chunks_dict.keys())
600
- ]
601
- else:
602
- # No overlap, add current to merged and start new
603
- merged.append(current)
604
- current = {
605
- "min_order": range_info["min_order"],
606
- "max_order": range_info["max_order"],
607
- "original_chunks": [range_info["original_chunk"]],
608
- "scores": [range_info["score"]],
609
- "all_chunks": range_info["all_chunks"],
610
- }
611
-
612
- # Add the last range
613
- merged.append(current)
614
- return merged
1235
+ )
1236
+
1237
+ return final_results + passthrough
615
1238
 
616
1239
  async def ask(
617
- self, question: str, cite: bool = False, system_prompt: str | None = None
618
- ) -> str:
1240
+ self,
1241
+ question: str,
1242
+ system_prompt: str | None = None,
1243
+ filter: str | None = None,
1244
+ ) -> "tuple[str, list[Citation]]":
619
1245
  """Ask a question using the configured QA agent.
620
1246
 
621
1247
  Args:
622
1248
  question: The question to ask.
623
- cite: Whether to include citations in the response.
624
1249
  system_prompt: Optional custom system prompt for the QA agent.
1250
+ filter: SQL WHERE clause to filter documents.
625
1251
 
626
1252
  Returns:
627
- The generated answer as a string.
1253
+ Tuple of (answer text, list of resolved citations).
628
1254
  """
629
1255
  from haiku.rag.qa import get_qa_agent
630
1256
 
631
- qa_agent = get_qa_agent(
632
- self, config=self._config, use_citations=cite, system_prompt=system_prompt
633
- )
634
- return await qa_agent.answer(question)
1257
+ qa_agent = get_qa_agent(self, config=self._config, system_prompt=system_prompt)
1258
+ return await qa_agent.answer(question, filter=filter)
635
1259
 
636
- async def rebuild_database(self) -> AsyncGenerator[str, None]:
637
- """Rebuild the database by deleting all chunks and re-indexing all documents.
1260
+ async def visualize_chunk(self, chunk: Chunk) -> list:
1261
+ """Render page images with bounding box highlights for a chunk.
638
1262
 
639
- For documents with URIs:
640
- - Re-adds from source if source exists
641
- - Re-embeds from existing content if source is missing
1263
+ Gets the DoclingDocument from the chunk's document, resolves bounding boxes
1264
+ from chunk metadata, and renders all pages that contain bounding boxes with
1265
+ yellow/orange highlight overlays.
642
1266
 
643
- For documents without URIs:
644
- - Re-creates chunks from existing content
1267
+ Args:
1268
+ chunk: The chunk to visualize.
645
1269
 
646
- Yields:
647
- int: The ID of the document currently being processed
1270
+ Returns:
1271
+ List of PIL Image objects, one per page with bounding boxes.
1272
+ Empty list if no bounding boxes or page images available.
648
1273
  """
649
- # Lazy import to avoid loading docling
650
- from haiku.rag.utils import text_to_docling_document
1274
+ from copy import deepcopy
1275
+
1276
+ from PIL import ImageDraw
1277
+
1278
+ # Get the document
1279
+ if not chunk.document_id:
1280
+ return []
1281
+
1282
+ doc = await self.document_repository.get_by_id(chunk.document_id)
1283
+ if not doc:
1284
+ return []
1285
+
1286
+ # Get DoclingDocument
1287
+ docling_doc = doc.get_docling_document()
1288
+ if not docling_doc:
1289
+ return []
1290
+
1291
+ # Resolve bounding boxes from chunk metadata
1292
+ chunk_meta = chunk.get_chunk_metadata()
1293
+ bounding_boxes = chunk_meta.resolve_bounding_boxes(docling_doc)
1294
+ if not bounding_boxes:
1295
+ return []
1296
+
1297
+ # Group bounding boxes by page
1298
+ boxes_by_page: dict[int, list] = {}
1299
+ for bbox in bounding_boxes:
1300
+ if bbox.page_no not in boxes_by_page:
1301
+ boxes_by_page[bbox.page_no] = []
1302
+ boxes_by_page[bbox.page_no].append(bbox)
1303
+
1304
+ # Render each page with its bounding boxes
1305
+ images = []
1306
+ for page_no in sorted(boxes_by_page.keys()):
1307
+ if page_no not in docling_doc.pages:
1308
+ continue
1309
+
1310
+ page = docling_doc.pages[page_no]
1311
+ if page.image is None or page.image.pil_image is None:
1312
+ continue
1313
+
1314
+ pil_image = page.image.pil_image
1315
+ page_height = page.size.height
1316
+
1317
+ # Calculate scale factor (image pixels vs document coordinates)
1318
+ scale_x = pil_image.width / page.size.width
1319
+ scale_y = pil_image.height / page.size.height
651
1320
 
652
- await self.chunk_repository.delete_all()
653
- self.store.recreate_embeddings_table()
1321
+ # Draw bounding boxes
1322
+ image = deepcopy(pil_image)
1323
+ draw = ImageDraw.Draw(image, "RGBA")
654
1324
 
1325
+ for bbox in boxes_by_page[page_no]:
1326
+ # Convert from document coordinates to image coordinates
1327
+ # Document coords are bottom-left origin, PIL uses top-left
1328
+ x0 = bbox.left * scale_x
1329
+ y0 = (page_height - bbox.top) * scale_y
1330
+ x1 = bbox.right * scale_x
1331
+ y1 = (page_height - bbox.bottom) * scale_y
1332
+
1333
+ # Ensure proper ordering (y0 should be less than y1 for PIL)
1334
+ if y0 > y1:
1335
+ y0, y1 = y1, y0
1336
+
1337
+ # Draw filled rectangle with transparency
1338
+ fill_color = (255, 255, 0, 80) # Yellow with transparency
1339
+ outline_color = (255, 165, 0, 255) # Orange outline
1340
+
1341
+ draw.rectangle([(x0, y0), (x1, y1)], fill=fill_color, outline=None)
1342
+ draw.rectangle([(x0, y0), (x1, y1)], outline=outline_color, width=3)
1343
+
1344
+ images.append(image)
1345
+
1346
+ return images
1347
+
1348
+ async def rebuild_database(
1349
+ self, mode: RebuildMode = RebuildMode.FULL
1350
+ ) -> AsyncGenerator[str, None]:
1351
+ """Rebuild the database with the specified mode.
1352
+
1353
+ Args:
1354
+ mode: The rebuild mode to use:
1355
+ - FULL: Re-convert from source files, re-chunk, re-embed (default)
1356
+ - RECHUNK: Re-chunk from existing content, re-embed (no source access)
1357
+ - EMBED_ONLY: Keep existing chunks, only regenerate embeddings
1358
+
1359
+ Yields:
1360
+ The ID of the document currently being processed.
1361
+ """
655
1362
  # Update settings to current config
656
1363
  settings_repo = SettingsRepository(self.store)
657
1364
  settings_repo.save_current_settings()
658
1365
 
659
1366
  documents = await self.list_documents()
660
1367
 
1368
+ if mode == RebuildMode.EMBED_ONLY:
1369
+ async for doc_id in self._rebuild_embed_only(documents):
1370
+ yield doc_id
1371
+ elif mode == RebuildMode.RECHUNK:
1372
+ await self.chunk_repository.delete_all()
1373
+ self.store.recreate_embeddings_table()
1374
+ async for doc_id in self._rebuild_rechunk(documents):
1375
+ yield doc_id
1376
+ else: # FULL
1377
+ await self.chunk_repository.delete_all()
1378
+ self.store.recreate_embeddings_table()
1379
+ async for doc_id in self._rebuild_full(documents):
1380
+ yield doc_id
1381
+
1382
+ # Final maintenance if auto_vacuum enabled
1383
+ if self._config.storage.auto_vacuum:
1384
+ try:
1385
+ await self.store.vacuum()
1386
+ except Exception:
1387
+ pass
1388
+
+    async def _rebuild_embed_only(
+        self, documents: list[Document]
+    ) -> AsyncGenerator[str, None]:
+        """Re-embed all chunks without changing chunk boundaries."""
+        from haiku.rag.embeddings import contextualize
+
+        batch_size = 50
+        pending_records: list = []
+        pending_doc_ids: list[str] = []
+
         for doc in documents:
-            assert doc.id is not None, "Document ID should not be None"
-            if doc.uri:
-                # Document has a URI - check if source is accessible
-                source_accessible = False
-                parsed_url = urlparse(doc.uri)
+            assert doc.id is not None
+
+            # Get existing chunks
+            chunks = await self.chunk_repository.get_by_document_id(doc.id)
+            if not chunks:
+                yield doc.id
+                continue
+
+            # Generate new embeddings using contextualize for consistency
+            texts = contextualize(chunks)
+            embeddings = await self.chunk_repository.embedder.embed_documents(texts)
+
+            # Build updated records
+            for chunk, content_fts, embedding in zip(chunks, texts, embeddings):
+                pending_records.append(
+                    self.store.ChunkRecord(
+                        id=chunk.id,  # type: ignore[arg-type]
+                        document_id=chunk.document_id,  # type: ignore[arg-type]
+                        content=chunk.content,
+                        content_fts=content_fts,
+                        metadata=json.dumps(chunk.metadata),
+                        order=chunk.order,
+                        vector=embedding,
+                    )
+                )
+
+            pending_doc_ids.append(doc.id)
+
+            # Flush batch when size reached
+            if len(pending_doc_ids) >= batch_size:
+                if pending_records:
+                    self.store.chunks_table.merge_insert(
+                        "id"
+                    ).when_matched_update_all().execute(pending_records)
+                for doc_id in pending_doc_ids:
+                    yield doc_id
+                pending_records = []
+                pending_doc_ids = []
+
+        # Flush remaining
+        if pending_records:
+            self.store.chunks_table.merge_insert(
+                "id"
+            ).when_matched_update_all().execute(pending_records)
+        for doc_id in pending_doc_ids:
+            yield doc_id
+
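The re-embed path rewrites existing chunk rows in place with LanceDB's merge_insert builder, one write per 50-document batch, rather than deleting and re-inserting rows. A standalone sketch of that upsert pattern against a throwaway table; the table name and schema here are made up for illustration, and behavior may vary slightly by lancedb version:

    import lancedb

    db = lancedb.connect("/tmp/upsert-demo")
    tbl = db.create_table(
        "chunks",
        data=[{"id": "c1", "content": "old text", "vector": [0.0, 0.0]}],
        mode="overwrite",
    )

    updated = [{"id": "c1", "content": "new text", "vector": [0.1, 0.2]}]

    # Match rows on "id"; matched rows are updated in a single write. Adding
    # .when_not_matched_insert_all() would also insert genuinely new rows.
    tbl.merge_insert("id").when_matched_update_all().execute(updated)

    print(tbl.to_pandas())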
+    async def _flush_rebuild_batch(
+        self, documents: list[Document], chunks: list[Chunk]
+    ) -> None:
+        """Batch write documents and chunks during rebuild.
+
+        This performs two writes: one for all document updates, one for all chunks.
+        Used by RECHUNK and FULL modes after the chunks table has been cleared.
+        """
+        from haiku.rag.store.engine import DocumentRecord
+        from haiku.rag.store.models.document import invalidate_docling_document_cache
+
+        if not documents:
+            return
+
+        now = datetime.now().isoformat()
+
+        # Invalidate cache for all documents being updated
+        for doc in documents:
+            if doc.id:
+                invalidate_docling_document_cache(doc.id)
+
+        # Batch update documents using merge_insert (single LanceDB version)
+        doc_records = [
+            DocumentRecord(
+                id=doc.id,  # type: ignore[arg-type]
+                content=doc.content,
+                uri=doc.uri,
+                title=doc.title,
+                metadata=json.dumps(doc.metadata),
+                docling_document_json=doc.docling_document_json,
+                docling_version=doc.docling_version,
+                created_at=doc.created_at.isoformat() if doc.created_at else now,
+                updated_at=now,
+            )
+            for doc in documents
+        ]
+
+        self.store.documents_table.merge_insert("id").when_matched_update_all().execute(
+            doc_records
+        )
+
+        # Batch create all chunks (single LanceDB version)
+        if chunks:
+            await self.chunk_repository.create(chunks)
+
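The docstring's point about "two writes" is a LanceDB versioning concern: every committed write produces a new table version, so flushing a 50-document batch as one documents write plus one chunks write keeps version growth (and the disk that vacuum later reclaims) roughly constant per batch rather than per document. A small illustration of the effect; the demo table is hypothetical and the version API may differ between lancedb releases:

    import lancedb

    db = lancedb.connect("/tmp/version-demo")
    tbl = db.create_table("docs", data=[{"id": 0, "text": "seed"}], mode="overwrite")

    # One write per record -> one new table version per record.
    for i in range(1, 4):
        tbl.add([{"id": i, "text": f"doc {i}"}])
    print("after per-row writes:", tbl.version)

    # One batched write for fifty records -> a single new version.
    tbl.add([{"id": i, "text": f"doc {i}"} for i in range(4, 54)])
    print("after one batched write:", tbl.version)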
+    async def _rebuild_rechunk(
+        self, documents: list[Document]
+    ) -> AsyncGenerator[str, None]:
+        """Re-chunk and re-embed from existing document content."""
+        from haiku.rag.embeddings import embed_chunks
+
+        batch_size = 50
+        pending_chunks: list[Chunk] = []
+        pending_docs: list[Document] = []
+        pending_doc_ids: list[str] = []
+
+        for doc in documents:
+            assert doc.id is not None
+
+            # Convert content to DoclingDocument
+            docling_document = await self.convert(doc.content)
+
+            # Chunk and embed
+            chunks = await self.chunk(docling_document)
+            embedded_chunks = await embed_chunks(chunks, self._config)
+
+            # Update document fields
+            doc.docling_document_json = docling_document.model_dump_json()
+            doc.docling_version = docling_document.version
+
+            # Prepare chunks with document_id and order
+            for order, chunk in enumerate(embedded_chunks):
+                chunk.document_id = doc.id
+                chunk.order = order
+
+            pending_chunks.extend(embedded_chunks)
+            pending_docs.append(doc)
+            pending_doc_ids.append(doc.id)
+
+            # Flush batch when size reached
+            if len(pending_docs) >= batch_size:
+                await self._flush_rebuild_batch(pending_docs, pending_chunks)
+                for doc_id in pending_doc_ids:
+                    yield doc_id
+                pending_chunks = []
+                pending_docs = []
+                pending_doc_ids = []
+
+        # Flush remaining
+        if pending_docs:
+            await self._flush_rebuild_batch(pending_docs, pending_chunks)
+            for doc_id in pending_doc_ids:
+                yield doc_id
+
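_rebuild_rechunk above and _rebuild_full below share the same accumulate/flush/yield shape: buffer per-document work, write once per batch via _flush_rebuild_batch, and only then report the document IDs that batch covered. Stripped of the haiku.rag specifics, the pattern looks like this (names are generic, not from the package):

    from typing import AsyncGenerator, Awaitable, Callable

    async def batched_rebuild(
        items: list[str],
        process_batch: Callable[[list[str]], Awaitable[None]],
        batch_size: int = 50,
    ) -> AsyncGenerator[str, None]:
        pending: list[str] = []
        for item in items:
            pending.append(item)
            if len(pending) >= batch_size:
                await process_batch(pending)   # one write for the whole batch
                for done in pending:           # progress is reported after the write lands
                    yield done
                pending = []
        if pending:                            # flush the remainder
            await process_batch(pending)
            for done in pending:
                yield done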
+    async def _rebuild_full(
+        self, documents: list[Document]
+    ) -> AsyncGenerator[str, None]:
+        """Full rebuild: re-convert from source, re-chunk, re-embed."""
+        from haiku.rag.embeddings import embed_chunks
+
+        batch_size = 50
+        pending_chunks: list[Chunk] = []
+        pending_docs: list[Document] = []
+        pending_doc_ids: list[str] = []
 
+        for doc in documents:
+            assert doc.id is not None
+
+            # Try to rebuild from source if available
+            if doc.uri and self._check_source_accessible(doc.uri):
                 try:
-                    if parsed_url.scheme == "file":
-                        # Check if file exists
-                        source_path = Path(parsed_url.path)
-                        source_accessible = source_path.exists()
-                    elif parsed_url.scheme in ("http", "https"):
-                        # For URLs, we'll try to create and catch errors
-                        source_accessible = True
-                    else:
-                        source_accessible = False
-                except Exception:
-                    source_accessible = False
-
-                if source_accessible:
-                    # Source exists - delete and recreate from source
-                    try:
-                        await self.delete_document(doc.id)
-                        new_doc = await self.create_document_from_source(
-                            source=doc.uri, metadata=doc.metadata or {}
-                        )
-                        # URIs always point to single files/URLs, never directories
-                        assert isinstance(new_doc, Document)
-                        assert new_doc.id is not None, (
-                            "New document ID should not be None"
-                        )
-                        yield new_doc.id
-                    except Exception as e:
-                        logger.error(
-                            "Error recreating document from source %s: %s",
-                            doc.uri,
-                            e,
-                        )
-                        continue
-                else:
-                    # Source missing - re-embed from existing content
-                    logger.warning(
-                        "Source missing for %s, re-embedding from content", doc.uri
+                    # Flush pending batch before source rebuild (creates new doc)
+                    if pending_docs:
+                        await self._flush_rebuild_batch(pending_docs, pending_chunks)
+                        for doc_id in pending_doc_ids:
+                            yield doc_id
+                        pending_chunks = []
+                        pending_docs = []
+                        pending_doc_ids = []
+
+                    await self.delete_document(doc.id)
+                    new_doc = await self.create_document_from_source(
+                        source=doc.uri, metadata=doc.metadata or {}
                     )
-                    docling_document = text_to_docling_document(doc.content)
-                    await self.chunk_repository.create_chunks_for_document(
-                        doc.id, docling_document
+                    assert isinstance(new_doc, Document)
+                    assert new_doc.id is not None
+                    yield new_doc.id
+                    continue
+                except Exception as e:
+                    logger.error(
+                        "Error recreating document from source %s: %s",
+                        doc.uri,
+                        e,
                    )
-                    yield doc.id
-            else:
-                # Document without URI - re-create chunks from existing content
-                docling_document = text_to_docling_document(doc.content)
-                await self.chunk_repository.create_chunks_for_document(
-                    doc.id, docling_document
+                    continue
+
+            # Fallback: rebuild from stored content
+            if doc.uri:
+                logger.warning(
+                    "Source missing for %s, re-embedding from content", doc.uri
                 )
-                yield doc.id
 
-        # Final maintenance: centralized vacuum to curb disk usage
+            docling_document = await self.convert(doc.content)
+            chunks = await self.chunk(docling_document)
+            embedded_chunks = await embed_chunks(chunks, self._config)
+
+            doc.docling_document_json = docling_document.model_dump_json()
+            doc.docling_version = docling_document.version
+
+            # Prepare chunks with document_id and order
+            for order, chunk in enumerate(embedded_chunks):
+                chunk.document_id = doc.id
+                chunk.order = order
+
+            pending_chunks.extend(embedded_chunks)
+            pending_docs.append(doc)
+            pending_doc_ids.append(doc.id)
+
+            # Flush batch when size reached
+            if len(pending_docs) >= batch_size:
+                await self._flush_rebuild_batch(pending_docs, pending_chunks)
+                for doc_id in pending_doc_ids:
+                    yield doc_id
+                pending_chunks = []
+                pending_docs = []
+                pending_doc_ids = []
+
+        # Flush remaining
+        if pending_docs:
+            await self._flush_rebuild_batch(pending_docs, pending_chunks)
+            for doc_id in pending_doc_ids:
+                yield doc_id
+
+    def _check_source_accessible(self, uri: str) -> bool:
+        """Check if a document's source URI is accessible."""
+        parsed_url = urlparse(uri)
         try:
-            await self.store.vacuum()
+            if parsed_url.scheme == "file":
+                return Path(parsed_url.path).exists()
+            elif parsed_url.scheme in ("http", "https"):
+                return True
+            return False
         except Exception:
-            pass
+            return False
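_check_source_accessible only treats file:// and http(s):// URIs as candidates for a source rebuild; anything else falls back to rebuilding from the stored content. For file URIs the filesystem path comes straight from urlparse, which is worth sanity-checking on your platform (Windows drive letters keep a leading slash in .path). A tiny illustration with made-up URIs:

    from pathlib import Path
    from urllib.parse import urlparse

    for uri in ("file:///tmp/report.pdf", "https://example.com/report.pdf", "s3://bucket/report.pdf"):
        parsed = urlparse(uri)
        if parsed.scheme == "file":
            print(uri, "->", Path(parsed.path), "exists:", Path(parsed.path).exists())
        elif parsed.scheme in ("http", "https"):
            print(uri, "-> assumed reachable; errors surface at fetch time")
        else:
            print(uri, "-> not rebuildable from source")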
 
     async def vacuum(self) -> None:
         """Optimize and clean up old versions across all tables."""
         await self.store.vacuum()
 
+    async def download_models(self) -> AsyncGenerator[DownloadProgress, None]:
+        """Download required models, yielding progress events.
+
+        Yields DownloadProgress events for:
+        - Docling models (status="docling_start", "docling_done")
+        - HuggingFace tokenizer (status="tokenizer_start", "tokenizer_done")
+        - Ollama models (status="pulling", "downloading", "done", or other Ollama statuses)
+        """
+        # Docling models
+        try:
+            from docling.utils.model_downloader import download_models
+
+            yield DownloadProgress(model="docling", status="start")
+            await asyncio.to_thread(download_models)
+            yield DownloadProgress(model="docling", status="done")
+        except ImportError:
+            pass
+
+        # HuggingFace tokenizer
+        from transformers import AutoTokenizer
+
+        tokenizer_name = self._config.processing.chunking_tokenizer
+        yield DownloadProgress(model=tokenizer_name, status="start")
+        await asyncio.to_thread(AutoTokenizer.from_pretrained, tokenizer_name)
+        yield DownloadProgress(model=tokenizer_name, status="done")
+
+        # Collect Ollama models from config
+        required_models: set[str] = set()
+        if self._config.embeddings.model.provider == "ollama":
+            required_models.add(self._config.embeddings.model.name)
+        if self._config.qa.model.provider == "ollama":
+            required_models.add(self._config.qa.model.name)
+        if self._config.research.model.provider == "ollama":
+            required_models.add(self._config.research.model.name)
+        if (
+            self._config.reranking.model
+            and self._config.reranking.model.provider == "ollama"
+        ):
+            required_models.add(self._config.reranking.model.name)
+        pic_desc = self._config.processing.conversion_options.picture_description
+        if pic_desc.enabled and pic_desc.model.provider == "ollama":
+            required_models.add(pic_desc.model.name)
+
+        if not required_models:
+            return
+
+        base_url = self._config.providers.ollama.base_url
+
+        async with httpx.AsyncClient(timeout=None) as client:
+            for model in sorted(required_models):
+                yield DownloadProgress(model=model, status="pulling")
+
+                async with client.stream(
+                    "POST", f"{base_url}/api/pull", json={"model": model}
+                ) as r:
+                    async for line in r.aiter_lines():
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                            status = data.get("status", "")
+                            digest = data.get("digest", "")
+
+                            if digest and "total" in data:
+                                yield DownloadProgress(
+                                    model=model,
+                                    status="downloading",
+                                    total=data.get("total", 0),
+                                    completed=data.get("completed", 0),
+                                    digest=digest,
+                                )
+                            elif status:
+                                yield DownloadProgress(model=model, status=status)
+                        except json.JSONDecodeError:
+                            pass
+
+                yield DownloadProgress(model=model, status="done")
+
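The Ollama branch streams newline-delimited JSON from the /api/pull endpoint and turns each line into a DownloadProgress event carrying optional total/completed byte counts. A hedged sketch of consuming the resulting generator and printing a rough percentage; the client construction mirrors the earlier example and is an assumption about the public API, while the event fields follow the DownloadProgress usage visible in the diff above:

    import asyncio

    from haiku.rag.client import HaikuRAG  # assumed import path

    async def pull_everything() -> None:
        async with HaikuRAG("./knowledge.lancedb") as client:  # assumed usage
            async for event in client.download_models():
                if event.status == "downloading" and event.total:
                    pct = 100 * event.completed / event.total
                    print(f"{event.model}: {pct:5.1f}% of {event.total} bytes")
                else:
                    print(f"{event.model}: {event.status}")

    asyncio.run(pull_everything())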
     def close(self):
         """Close the underlying store connection."""
         self.store.close()