haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of haiku.rag-slim has been flagged as possibly problematic.
- haiku/rag/app.py +430 -72
- haiku/rag/chunkers/__init__.py +31 -0
- haiku/rag/chunkers/base.py +31 -0
- haiku/rag/chunkers/docling_local.py +164 -0
- haiku/rag/chunkers/docling_serve.py +179 -0
- haiku/rag/cli.py +207 -24
- haiku/rag/cli_chat.py +489 -0
- haiku/rag/client.py +1251 -266
- haiku/rag/config/__init__.py +16 -10
- haiku/rag/config/loader.py +5 -44
- haiku/rag/config/models.py +126 -17
- haiku/rag/converters/__init__.py +31 -0
- haiku/rag/converters/base.py +63 -0
- haiku/rag/converters/docling_local.py +193 -0
- haiku/rag/converters/docling_serve.py +229 -0
- haiku/rag/converters/text_utils.py +237 -0
- haiku/rag/embeddings/__init__.py +123 -24
- haiku/rag/embeddings/voyageai.py +175 -20
- haiku/rag/graph/__init__.py +0 -11
- haiku/rag/graph/agui/__init__.py +8 -2
- haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku/rag/graph/agui/emitter.py +219 -31
- haiku/rag/graph/agui/server.py +20 -62
- haiku/rag/graph/agui/stream.py +1 -2
- haiku/rag/graph/research/__init__.py +5 -2
- haiku/rag/graph/research/dependencies.py +12 -126
- haiku/rag/graph/research/graph.py +390 -135
- haiku/rag/graph/research/models.py +91 -112
- haiku/rag/graph/research/prompts.py +99 -91
- haiku/rag/graph/research/state.py +35 -27
- haiku/rag/inspector/__init__.py +8 -0
- haiku/rag/inspector/app.py +259 -0
- haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku/rag/inspector/widgets/visual_modal.py +126 -0
- haiku/rag/mcp.py +106 -102
- haiku/rag/monitor.py +33 -9
- haiku/rag/providers/__init__.py +5 -0
- haiku/rag/providers/docling_serve.py +108 -0
- haiku/rag/qa/__init__.py +12 -10
- haiku/rag/qa/agent.py +43 -61
- haiku/rag/qa/prompts.py +35 -57
- haiku/rag/reranking/__init__.py +9 -6
- haiku/rag/reranking/base.py +1 -1
- haiku/rag/reranking/cohere.py +5 -4
- haiku/rag/reranking/mxbai.py +5 -2
- haiku/rag/reranking/vllm.py +3 -4
- haiku/rag/reranking/zeroentropy.py +6 -5
- haiku/rag/store/__init__.py +2 -1
- haiku/rag/store/engine.py +242 -42
- haiku/rag/store/exceptions.py +4 -0
- haiku/rag/store/models/__init__.py +8 -2
- haiku/rag/store/models/chunk.py +190 -0
- haiku/rag/store/models/document.py +46 -0
- haiku/rag/store/repositories/chunk.py +141 -121
- haiku/rag/store/repositories/document.py +25 -84
- haiku/rag/store/repositories/settings.py +11 -14
- haiku/rag/store/upgrades/__init__.py +19 -3
- haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku/rag/store/upgrades/v0_20_0.py +68 -0
- haiku/rag/store/upgrades/v0_23_1.py +100 -0
- haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku/rag/utils.py +371 -146
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
- haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
- haiku/rag/chunker.py +0 -65
- haiku/rag/embeddings/base.py +0 -25
- haiku/rag/embeddings/ollama.py +0 -28
- haiku/rag/embeddings/openai.py +0 -26
- haiku/rag/embeddings/vllm.py +0 -29
- haiku/rag/graph/agui/events.py +0 -254
- haiku/rag/graph/common/__init__.py +0 -5
- haiku/rag/graph/common/models.py +0 -42
- haiku/rag/graph/common/nodes.py +0 -265
- haiku/rag/graph/common/prompts.py +0 -46
- haiku/rag/graph/common/utils.py +0 -44
- haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku/rag/graph/deep_qa/models.py +0 -20
- haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku/rag/graph/deep_qa/state.py +0 -56
- haiku/rag/graph/research/common.py +0 -87
- haiku/rag/reader.py +0 -135
- haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/store/models/__init__.py CHANGED
@@ -1,4 +1,10 @@
-from .chunk import Chunk
+from .chunk import BoundingBox, Chunk, ChunkMetadata, SearchResult
 from .document import Document
 
-__all__ = [
+__all__ = [
+    "BoundingBox",
+    "Chunk",
+    "ChunkMetadata",
+    "Document",
+    "SearchResult",
+]
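With the expanded `__all__`, the new provenance-aware types can be imported straight from the models package; a trivial sketch:

```python
# The models package now re-exports the new chunk/document types shown above.
from haiku.rag.store.models import (
    BoundingBox,
    Chunk,
    ChunkMetadata,
    Document,
    SearchResult,
)
```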
haiku/rag/store/models/chunk.py CHANGED
@@ -1,5 +1,92 @@
+from typing import TYPE_CHECKING
+
 from pydantic import BaseModel
 
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DocItem, DoclingDocument
+
+
+class BoundingBox(BaseModel):
+    """Bounding box coordinates for visual grounding."""
+
+    page_no: int
+    left: float
+    top: float
+    right: float
+    bottom: float
+
+
+class ChunkMetadata(BaseModel):
+    """
+    Structured metadata for a chunk, including DoclingDocument references.
+
+    Attributes:
+        doc_item_refs: JSON pointer references to DocItems in the parent DoclingDocument
+            (e.g., ["#/texts/5", "#/texts/6", "#/tables/0"])
+        headings: Section heading hierarchy for this chunk
+            (e.g., ["Chapter 1", "Section 1.1"])
+        labels: Semantic labels for each doc_item (e.g., ["paragraph", "table"])
+        page_numbers: Page numbers where the chunk content appears
+    """
+
+    doc_item_refs: list[str] = []
+    headings: list[str] | None = None
+    labels: list[str] = []
+    page_numbers: list[int] = []
+
+    def resolve_doc_items(self, docling_document: "DoclingDocument") -> list["DocItem"]:
+        """Resolve doc_item_refs to actual DocItem objects.
+
+        Args:
+            docling_document: The parent DoclingDocument containing the items.
+
+        Returns:
+            List of resolved DocItem objects. Items that fail to resolve are skipped.
+        """
+        from docling_core.types.doc.document import RefItem
+
+        doc_items = []
+        for ref in self.doc_item_refs:
+            try:
+                ref_item = RefItem.model_validate({"$ref": ref})
+                doc_item = ref_item.resolve(docling_document)
+                doc_items.append(doc_item)
+            except Exception:
+                # Graceful degradation: skip refs that can't be resolved
+                continue
+        return doc_items
+
+    def resolve_bounding_boxes(
+        self, docling_document: "DoclingDocument"
+    ) -> list[BoundingBox]:
+        """Resolve doc_item_refs to bounding boxes for visual grounding.
+
+        Args:
+            docling_document: The parent DoclingDocument containing the items.
+
+        Returns:
+            List of BoundingBox objects from resolved DocItems' provenance.
+        """
+        bounding_boxes = []
+        for doc_item in self.resolve_doc_items(docling_document):
+            prov = getattr(doc_item, "prov", None)
+            if not prov:
+                continue
+            for prov_item in prov:
+                bbox = getattr(prov_item, "bbox", None)
+                if bbox is None:
+                    continue
+                bounding_boxes.append(
+                    BoundingBox(
+                        page_no=prov_item.page_no,
+                        left=bbox.l,
+                        top=bbox.t,
+                        right=bbox.r,
+                        bottom=bbox.b,
+                    )
+                )
+        return bounding_boxes
+
 
 class Chunk(BaseModel):
     """
@@ -15,3 +102,106 @@ class Chunk(BaseModel):
     document_title: str | None = None
     document_meta: dict = {}
     embedding: list[float] | None = None
+
+    def get_chunk_metadata(self) -> ChunkMetadata:
+        """Parse metadata dict into structured ChunkMetadata."""
+        return ChunkMetadata.model_validate(self.metadata)
+
+
+class SearchResult(BaseModel):
+    """Search result with optional provenance information for citations."""
+
+    content: str
+    score: float
+    chunk_id: str | None = None
+    document_id: str | None = None
+    document_uri: str | None = None
+    document_title: str | None = None
+    doc_item_refs: list[str] = []
+    page_numbers: list[int] = []
+    headings: list[str] | None = None
+    labels: list[str] = []
+
+    @classmethod
+    def from_chunk(
+        cls,
+        chunk: "Chunk",
+        score: float,
+    ) -> "SearchResult":
+        """Create from a Chunk."""
+        meta = chunk.get_chunk_metadata()
+        return cls(
+            content=chunk.content,
+            score=score,
+            chunk_id=chunk.id,
+            document_id=chunk.document_id,
+            document_uri=chunk.document_uri,
+            document_title=chunk.document_title,
+            doc_item_refs=meta.doc_item_refs,
+            page_numbers=meta.page_numbers,
+            headings=meta.headings,
+            labels=meta.labels,
+        )
+
+    def format_for_agent(self) -> str:
+        """Format this search result for inclusion in agent context.
+
+        Produces a structured format with metadata that helps LLMs understand
+        the source and nature of the content.
+        """
+        parts = [f"[{self.chunk_id}] (score: {self.score:.2f})"]
+
+        # Document source info
+        source_parts = []
+        if self.document_title:
+            source_parts.append(f'"{self.document_title}"')
+        if self.headings:
+            source_parts.append(" > ".join(self.headings))
+        if source_parts:
+            parts.append(f"Source: {' > '.join(source_parts)}")
+
+        # Content type (use primary label if available)
+        if self.labels:
+            primary_label = self._get_primary_label()
+            if primary_label:
+                parts.append(f"Type: {primary_label}")
+
+        # The actual content
+        parts.append(f"Content:\n{self.content}")
+
+        return "\n".join(parts)
+
+    def _get_primary_label(self) -> str | None:
+        """Get the most significant label for display.
+
+        Prioritizes structural labels over text labels.
+        """
+        if not self.labels:
+            return None
+
+        # Priority order: structural > contextual > text
+        priority = {
+            "table": 1,
+            "code": 2,
+            "form": 3,
+            "key_value_region": 4,
+            "list_item": 5,
+            "formula": 6,
+            "chart": 7,
+            "picture": 8,
+            "caption": 9,
+            "footnote": 10,
+            "section_header": 11,
+            "title": 12,
+        }
+
+        # Find highest priority label
+        best_label = None
+        best_priority = float("inf")
+        for label in self.labels:
+            if label in priority and priority[label] < best_priority:
+                best_label = label
+                best_priority = priority[label]
+
+        # Return best structural/special label, or first label if all are text
+        return best_label if best_label else self.labels[0]
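The hunk above gives `Chunk` a structured-metadata accessor and adds `SearchResult`, which packages chunk provenance for agent prompts. A minimal, hypothetical usage sketch (the field values are invented, and it assumes `Chunk` accepts the keyword arguments the repository code below uses when constructing it):

```python
# Hypothetical example of the new ChunkMetadata / SearchResult flow (values invented).
from haiku.rag.store.models.chunk import Chunk, SearchResult

chunk = Chunk(
    document_id="doc-123",
    content="Revenue grew 12% year over year.",
    order=0,
    metadata={
        "doc_item_refs": ["#/texts/5", "#/tables/0"],
        "headings": ["Annual Report", "Financials"],
        "labels": ["paragraph", "table"],
        "page_numbers": [3],
    },
    document_title="ACME 2024 Annual Report",
)

meta = chunk.get_chunk_metadata()  # raw metadata dict -> structured ChunkMetadata
assert meta.page_numbers == [3]

result = SearchResult.from_chunk(chunk, score=0.87)
print(result.format_for_agent())
# [None] (score: 0.87)              <- chunk_id is unset in this sketch
# Source: "ACME 2024 Annual Report" > Annual Report > Financials
# Type: table
# Content:
# Revenue grew 12% year over year.
```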
haiku/rag/store/models/document.py CHANGED
@@ -1,7 +1,32 @@
 from datetime import datetime
+from typing import TYPE_CHECKING
 
+from cachetools import LRUCache
 from pydantic import BaseModel, Field
 
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+
+
+_docling_document_cache: LRUCache[str, "DoclingDocument"] = LRUCache(maxsize=100)
+
+
+def _get_cached_docling_document(document_id: str, json_str: str) -> "DoclingDocument":
+    """Get or parse DoclingDocument with LRU caching by document ID."""
+    if document_id in _docling_document_cache:
+        return _docling_document_cache[document_id]
+
+    from docling_core.types.doc.document import DoclingDocument
+
+    doc = DoclingDocument.model_validate_json(json_str)
+    _docling_document_cache[document_id] = doc
+    return doc
+
+
+def invalidate_docling_document_cache(document_id: str) -> None:
+    """Remove a document from the DoclingDocument cache."""
+    _docling_document_cache.pop(document_id, None)
+
 
 class Document(BaseModel):
     """
@@ -13,5 +38,26 @@ class Document(BaseModel):
     uri: str | None = None
     title: str | None = None
     metadata: dict = {}
+    docling_document_json: str | None = None
+    docling_version: str | None = None
     created_at: datetime = Field(default_factory=datetime.now)
     updated_at: datetime = Field(default_factory=datetime.now)
+
+    def get_docling_document(self) -> "DoclingDocument | None":
+        """Parse and return the stored DoclingDocument.
+
+        Uses LRU cache (keyed by document ID) to avoid repeated parsing.
+
+        Returns:
+            The parsed DoclingDocument, or None if not stored or no ID.
+        """
+        if self.docling_document_json is None:
+            return None
+
+        # No caching for documents without ID
+        if self.id is None:
+            from docling_core.types.doc.document import DoclingDocument
+
+            return DoclingDocument.model_validate_json(self.docling_document_json)
+
+        return _get_cached_docling_document(self.id, self.docling_document_json)
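The document model now stores the serialized DoclingDocument and, for documents with an ID, parses it at most once per process. A short illustration-only sketch of that behaviour (the helper function name is invented):

```python
from haiku.rag.store.models.document import (
    Document,
    invalidate_docling_document_cache,
)


def demo_docling_cache(doc: Document) -> None:
    """Illustration only: exercise the parse-once cache added above."""
    parsed = doc.get_docling_document()  # parses docling_document_json on first use
    again = doc.get_docling_document()   # for documents with an id, served from the cache

    if doc.id is not None and parsed is not None:
        assert parsed is again  # same object, parsed only once

        # After the stored JSON is replaced (e.g. the document is re-converted),
        # the stale cached parse has to be dropped explicitly:
        invalidate_docling_document_cache(doc.id)
```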
haiku/rag/store/repositories/chunk.py CHANGED
@@ -1,21 +1,20 @@
-import inspect
 import json
 import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 from uuid import uuid4
 
 if TYPE_CHECKING:
     import pandas as pd
-    from lancedb.query import
+    from lancedb.query import (
+        LanceHybridQueryBuilder,
+        LanceQueryBuilder,
+        LanceVectorQueryBuilder,
+    )
 
 from lancedb.rerankers import RRFReranker
 
 from haiku.rag.store.engine import DocumentRecord, Store
 from haiku.rag.store.models.chunk import Chunk
-from haiku.rag.utils import load_callable
-
-if TYPE_CHECKING:
-    from docling_core.types.doc.document import DoclingDocument
 
 logger = logging.getLogger(__name__)
 
@@ -28,43 +27,87 @@ class ChunkRepository:
         self.embedder = store.embedder
 
     def _ensure_fts_index(self) -> None:
-        """Ensure FTS index exists on the
+        """Ensure FTS index exists on the content_fts column."""
         try:
             self.store.chunks_table.create_fts_index(
-                "
+                "content_fts", replace=True, with_position=True, remove_stop_words=False
             )
         except Exception as e:
             # Log the error but don't fail - FTS might already exist
             logger.debug(f"FTS index creation skipped: {e}")
 
-
-    """
-
+    def _contextualize_content(self, chunk: Chunk) -> str:
+        """Generate contextualized content for FTS by prepending headings."""
+        meta = chunk.get_chunk_metadata()
+        if meta.headings:
+            return "\n".join(meta.headings) + "\n" + chunk.content
+        return chunk.content
 
-
+    async def create(self, entity: Chunk | list[Chunk]) -> Chunk | list[Chunk]:
+        """Create one or more chunks in the database.
 
-
-
-
-
-
-
-
-
-            id=chunk_id,
-            document_id=entity.document_id,
-            content=entity.content,
-            metadata=json.dumps(
-                {k: v for k, v in entity.metadata.items() if k != "order"}
-            ),
-            order=order_val,
-            vector=embedding,
-        )
+        Chunks must have embeddings set before calling this method.
+        Use client._ensure_chunks_embedded() to embed chunks if needed.
+        """
+        self.store._assert_writable()
+        # Handle single chunk
+        if isinstance(entity, Chunk):
+            assert entity.document_id, "Chunk must have a document_id to be created"
+            assert entity.embedding is not None, "Chunk must have an embedding"
 
-
+            chunk_id = str(uuid4())
 
-
-
+            chunk_record = self.store.ChunkRecord(
+                id=chunk_id,
+                document_id=entity.document_id,
+                content=entity.content,
+                content_fts=self._contextualize_content(entity),
+                metadata=json.dumps(
+                    {k: v for k, v in entity.metadata.items() if k != "order"}
+                ),
+                order=int(entity.order),
+                vector=entity.embedding,
+            )
+
+            self.store.chunks_table.add([chunk_record])
+
+            entity.id = chunk_id
+            return entity
+
+        # Handle batch of chunks
+        chunks = entity
+        if not chunks:
+            return []
+
+        # Validate all chunks have document_id and embedding
+        for chunk in chunks:
+            assert chunk.document_id, "All chunks must have a document_id to be created"
+            assert chunk.embedding is not None, "All chunks must have embeddings"
+
+        # Prepare all chunk records
+        chunk_records = []
+        for chunk in chunks:
+            chunk_id = str(uuid4())
+
+            assert chunk.document_id is not None
+            chunk_record = self.store.ChunkRecord(
+                id=chunk_id,
+                document_id=chunk.document_id,
+                content=chunk.content,
+                content_fts=self._contextualize_content(chunk),
+                metadata=json.dumps(
+                    {k: v for k, v in chunk.metadata.items() if k != "order"}
+                ),
+                order=int(chunk.order),
+                vector=chunk.embedding,
+            )
+            chunk_records.append(chunk_record)
+            chunk.id = chunk_id
+
+        # Single batch insert for all chunks
+        self.store.chunks_table.add(chunk_records)
+
+        return chunks
 
     async def get_by_id(self, entity_id: str) -> Chunk | None:
         """Get a chunk by its ID."""
@@ -89,28 +132,32 @@ class ChunkRepository:
         )
 
     async def update(self, entity: Chunk) -> Chunk:
-        """Update an existing chunk.
-        assert entity.id, "Chunk ID is required for update"
+        """Update an existing chunk.
 
-        embedding
-
+        Chunk must have embedding set before calling this method.
+        """
+        self.store._assert_writable()
+        assert entity.id, "Chunk ID is required for update"
+        assert entity.embedding is not None, "Chunk must have an embedding"
 
         self.store.chunks_table.update(
             where=f"id = '{entity.id}'",
             values={
                 "document_id": entity.document_id,
                 "content": entity.content,
+                "content_fts": self._contextualize_content(entity),
                 "metadata": json.dumps(
                     {k: v for k, v in entity.metadata.items() if k != "order"}
                 ),
-                "order":
-                "vector": embedding,
+                "order": int(entity.order),
+                "vector": entity.embedding,
             },
         )
         return entity
 
     async def delete(self, entity_id: str) -> bool:
         """Delete a chunk by its ID."""
+        self.store._assert_writable()
         chunk = await self.get_by_id(entity_id)
         if chunk is None:
             return False
@@ -145,86 +192,22 @@ class ChunkRepository:
         )
         return chunks
 
-    async def create_chunks_for_document(
-        self, document_id: str, document: "DoclingDocument"
-    ) -> list[Chunk]:
-        """Create chunks and embeddings for a document from DoclingDocument."""
-        # Lazy imports to avoid loading docling during module import
-        from haiku.rag.chunker import chunker
-        from haiku.rag.utils import text_to_docling_document
-
-        # Optionally preprocess markdown before chunking
-        processed_document = document
-        preprocessor_path = self.store._config.processing.markdown_preprocessor
-        if preprocessor_path:
-            try:
-                pre_fn = load_callable(preprocessor_path)
-                markdown = document.export_to_markdown()
-                result = pre_fn(markdown)
-                if inspect.isawaitable(result):
-                    result = await result  # type: ignore[assignment]
-                processed_markdown = result
-                if not isinstance(processed_markdown, str):
-                    raise ValueError("Preprocessor must return a markdown string")
-                processed_document = text_to_docling_document(
-                    processed_markdown, name="content.md"
-                )
-            except Exception as e:
-                logger.error(
-                    f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
-                )
-                raise e
-
-        chunk_texts = await chunker.chunk(processed_document)
-
-        embeddings = await self.embedder.embed(chunk_texts)
-
-        # Prepare all chunk records for batch insertion
-        chunk_records = []
-        created_chunks = []
-
-        for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
-            chunk_id = str(uuid4())
-
-            chunk_record = self.store.ChunkRecord(
-                id=chunk_id,
-                document_id=document_id,
-                content=chunk_text,
-                metadata=json.dumps({}),
-                order=order,
-                vector=embedding,
-            )
-            chunk_records.append(chunk_record)
-
-            chunk = Chunk(
-                id=chunk_id,
-                document_id=document_id,
-                content=chunk_text,
-                metadata={},
-                order=order,
-            )
-            created_chunks.append(chunk)
-
-        # Batch insert all chunks at once
-        if chunk_records:
-            self.store.chunks_table.add(chunk_records)
-
-        return created_chunks
-
     async def delete_all(self) -> None:
         """Delete all chunks from the database."""
+        self.store._assert_writable()
         # Drop and recreate table to clear all data
         self.store.db.drop_table("chunks")
         self.store.chunks_table = self.store.db.create_table(
             "chunks", schema=self.store.ChunkRecord
         )
-        # Create FTS index on
+        # Create FTS index on content_fts (contextualized content) for better search
        self.store.chunks_table.create_fts_index(
-            "
+            "content_fts", replace=True, with_position=True, remove_stop_words=False
         )
 
     async def delete_by_document_id(self, document_id: str) -> bool:
         """Delete all chunks for a document."""
+        self.store._assert_writable()
         chunks = await self.get_by_document_id(document_id)
 
         if not chunks:
@@ -272,25 +255,34 @@ class ChunkRepository:
 
         # Prepare search query based on search type
         if search_type == "vector":
-            query_embedding = await self.embedder.
-
-
+            query_embedding = await self.embedder.embed_query(query)
+            vector_query = cast(
+                "LanceVectorQueryBuilder",
+                self.store.chunks_table.search(
+                    query_embedding, query_type="vector", vector_column_name="vector"
+                ),
+            )
+            results = vector_query.refine_factor(
+                self.store._config.search.vector_refine_factor
             )
 
         elif search_type == "fts":
             results = self.store.chunks_table.search(query, query_type="fts")
 
        else: # hybrid (default)
-            query_embedding = await self.embedder.
+            query_embedding = await self.embedder.embed_query(query)
             # Create RRF reranker
             reranker = RRFReranker()
             # Perform native hybrid search with RRF reranking
-
+            hybrid_query = cast(
+                "LanceHybridQueryBuilder",
                 self.store.chunks_table.search(query_type="hybrid")
                 .vector(query_embedding)
-                .text(query)
-                .rerank(reranker)
+                .text(query),
             )
+            results = hybrid_query.refine_factor(
+                self.store._config.search.vector_refine_factor
+            ).rerank(reranker)
 
         # Apply filtering if needed (common for all search types)
         if filtered_doc_ids is not None:
@@ -304,13 +296,30 @@ class ChunkRepository:
         results = results.limit(limit)
         return await self._process_search_results(results)
 
-    async def get_by_document_id(
-
-
-
-
-
-
+    async def get_by_document_id(
+        self,
+        document_id: str,
+        limit: int | None = None,
+        offset: int | None = None,
+    ) -> list[Chunk]:
+        """Get chunks for a specific document with optional pagination.
+
+        Args:
+            document_id: The document ID to get chunks for.
+            limit: Maximum number of chunks to return. None for all.
+            offset: Number of chunks to skip. None for no offset.
+
+        Returns:
+            List of chunks ordered by their order field.
+        """
+        query = self.store.chunks_table.search().where(f"document_id = '{document_id}'")
+
+        if offset is not None:
+            query = query.offset(offset)
+        if limit is not None:
+            query = query.limit(limit)
+
+        results = list(query.to_pydantic(self.store.ChunkRecord))
 
         # Get document info
         doc_results = list(
@@ -343,6 +352,16 @@ class ChunkRepository:
         chunks.sort(key=lambda c: c.order)
         return chunks
 
+    async def count_by_document_id(self, document_id: str) -> int:
+        """Count the number of chunks for a specific document."""
+        df = (
+            self.store.chunks_table.search()
+            .select(["id"])
+            .where(f"document_id = '{document_id}'")
+            .to_pandas()
+        )
+        return len(df)
+
     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
         """Get adjacent chunks before and after the given chunk within the same document."""
         assert chunk.document_id, "Document id is required for adjacent chunk finding"
@@ -400,6 +419,7 @@ class ChunkRepository:
                 id=str(row["id"]),
                 document_id=str(row["document_id"]),
                 content=str(row["content"]),
+                content_fts=str(row.get("content_fts", "")),
                 metadata=str(row["metadata"]),
                 order=int(row["order"]) if "order" in row else 0,
             )