haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94)
  1. haiku/rag/app.py +430 -72
  2. haiku/rag/chunkers/__init__.py +31 -0
  3. haiku/rag/chunkers/base.py +31 -0
  4. haiku/rag/chunkers/docling_local.py +164 -0
  5. haiku/rag/chunkers/docling_serve.py +179 -0
  6. haiku/rag/cli.py +207 -24
  7. haiku/rag/cli_chat.py +489 -0
  8. haiku/rag/client.py +1251 -266
  9. haiku/rag/config/__init__.py +16 -10
  10. haiku/rag/config/loader.py +5 -44
  11. haiku/rag/config/models.py +126 -17
  12. haiku/rag/converters/__init__.py +31 -0
  13. haiku/rag/converters/base.py +63 -0
  14. haiku/rag/converters/docling_local.py +193 -0
  15. haiku/rag/converters/docling_serve.py +229 -0
  16. haiku/rag/converters/text_utils.py +237 -0
  17. haiku/rag/embeddings/__init__.py +123 -24
  18. haiku/rag/embeddings/voyageai.py +175 -20
  19. haiku/rag/graph/__init__.py +0 -11
  20. haiku/rag/graph/agui/__init__.py +8 -2
  21. haiku/rag/graph/agui/cli_renderer.py +1 -1
  22. haiku/rag/graph/agui/emitter.py +219 -31
  23. haiku/rag/graph/agui/server.py +20 -62
  24. haiku/rag/graph/agui/stream.py +1 -2
  25. haiku/rag/graph/research/__init__.py +5 -2
  26. haiku/rag/graph/research/dependencies.py +12 -126
  27. haiku/rag/graph/research/graph.py +390 -135
  28. haiku/rag/graph/research/models.py +91 -112
  29. haiku/rag/graph/research/prompts.py +99 -91
  30. haiku/rag/graph/research/state.py +35 -27
  31. haiku/rag/inspector/__init__.py +8 -0
  32. haiku/rag/inspector/app.py +259 -0
  33. haiku/rag/inspector/widgets/__init__.py +6 -0
  34. haiku/rag/inspector/widgets/chunk_list.py +100 -0
  35. haiku/rag/inspector/widgets/context_modal.py +89 -0
  36. haiku/rag/inspector/widgets/detail_view.py +130 -0
  37. haiku/rag/inspector/widgets/document_list.py +75 -0
  38. haiku/rag/inspector/widgets/info_modal.py +209 -0
  39. haiku/rag/inspector/widgets/search_modal.py +183 -0
  40. haiku/rag/inspector/widgets/visual_modal.py +126 -0
  41. haiku/rag/mcp.py +106 -102
  42. haiku/rag/monitor.py +33 -9
  43. haiku/rag/providers/__init__.py +5 -0
  44. haiku/rag/providers/docling_serve.py +108 -0
  45. haiku/rag/qa/__init__.py +12 -10
  46. haiku/rag/qa/agent.py +43 -61
  47. haiku/rag/qa/prompts.py +35 -57
  48. haiku/rag/reranking/__init__.py +9 -6
  49. haiku/rag/reranking/base.py +1 -1
  50. haiku/rag/reranking/cohere.py +5 -4
  51. haiku/rag/reranking/mxbai.py +5 -2
  52. haiku/rag/reranking/vllm.py +3 -4
  53. haiku/rag/reranking/zeroentropy.py +6 -5
  54. haiku/rag/store/__init__.py +2 -1
  55. haiku/rag/store/engine.py +242 -42
  56. haiku/rag/store/exceptions.py +4 -0
  57. haiku/rag/store/models/__init__.py +8 -2
  58. haiku/rag/store/models/chunk.py +190 -0
  59. haiku/rag/store/models/document.py +46 -0
  60. haiku/rag/store/repositories/chunk.py +141 -121
  61. haiku/rag/store/repositories/document.py +25 -84
  62. haiku/rag/store/repositories/settings.py +11 -14
  63. haiku/rag/store/upgrades/__init__.py +19 -3
  64. haiku/rag/store/upgrades/v0_10_1.py +1 -1
  65. haiku/rag/store/upgrades/v0_19_6.py +65 -0
  66. haiku/rag/store/upgrades/v0_20_0.py +68 -0
  67. haiku/rag/store/upgrades/v0_23_1.py +100 -0
  68. haiku/rag/store/upgrades/v0_9_3.py +3 -3
  69. haiku/rag/utils.py +371 -146
  70. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
  71. haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
  72. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
  73. haiku/rag/chunker.py +0 -65
  74. haiku/rag/embeddings/base.py +0 -25
  75. haiku/rag/embeddings/ollama.py +0 -28
  76. haiku/rag/embeddings/openai.py +0 -26
  77. haiku/rag/embeddings/vllm.py +0 -29
  78. haiku/rag/graph/agui/events.py +0 -254
  79. haiku/rag/graph/common/__init__.py +0 -5
  80. haiku/rag/graph/common/models.py +0 -42
  81. haiku/rag/graph/common/nodes.py +0 -265
  82. haiku/rag/graph/common/prompts.py +0 -46
  83. haiku/rag/graph/common/utils.py +0 -44
  84. haiku/rag/graph/deep_qa/__init__.py +0 -1
  85. haiku/rag/graph/deep_qa/dependencies.py +0 -27
  86. haiku/rag/graph/deep_qa/graph.py +0 -243
  87. haiku/rag/graph/deep_qa/models.py +0 -20
  88. haiku/rag/graph/deep_qa/prompts.py +0 -59
  89. haiku/rag/graph/deep_qa/state.py +0 -56
  90. haiku/rag/graph/research/common.py +0 -87
  91. haiku/rag/reader.py +0 -135
  92. haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
  93. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
  94. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,4 @@
+ class ReadOnlyError(Exception):
+     """Raised when a write operation is attempted on a read-only store."""
+
+     pass
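
Note: the repository changes further down guard every write with self.store._assert_writable(). A minimal sketch of the pattern this implies, assuming the guard simply raises the new exception when the store was opened read-only (the actual helper lives in haiku/rag/store/engine.py and is not shown in this hunk):

from haiku.rag.store.exceptions import ReadOnlyError

class ReadOnlyGuardSketch:
    """Illustrative only: mirrors the assumed behaviour of Store._assert_writable()."""

    def __init__(self, read_only: bool) -> None:
        self.read_only = read_only

    def _assert_writable(self) -> None:
        if self.read_only:
            # Hypothetical message; the real wording is defined in the engine.
            raise ReadOnlyError("write operation attempted on a read-only store")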
@@ -1,4 +1,10 @@
- from .chunk import Chunk
+ from .chunk import BoundingBox, Chunk, ChunkMetadata, SearchResult
  from .document import Document

- __all__ = ["Chunk", "Document"]
+ __all__ = [
+     "BoundingBox",
+     "Chunk",
+     "ChunkMetadata",
+     "Document",
+     "SearchResult",
+ ]
@@ -1,5 +1,92 @@
+ from typing import TYPE_CHECKING
+
  from pydantic import BaseModel

+ if TYPE_CHECKING:
+     from docling_core.types.doc.document import DocItem, DoclingDocument
+
+
+ class BoundingBox(BaseModel):
+     """Bounding box coordinates for visual grounding."""
+
+     page_no: int
+     left: float
+     top: float
+     right: float
+     bottom: float
+
+
+ class ChunkMetadata(BaseModel):
+     """
+     Structured metadata for a chunk, including DoclingDocument references.
+
+     Attributes:
+         doc_item_refs: JSON pointer references to DocItems in the parent DoclingDocument
+             (e.g., ["#/texts/5", "#/texts/6", "#/tables/0"])
+         headings: Section heading hierarchy for this chunk
+             (e.g., ["Chapter 1", "Section 1.1"])
+         labels: Semantic labels for each doc_item (e.g., ["paragraph", "table"])
+         page_numbers: Page numbers where the chunk content appears
+     """
+
+     doc_item_refs: list[str] = []
+     headings: list[str] | None = None
+     labels: list[str] = []
+     page_numbers: list[int] = []
+
+     def resolve_doc_items(self, docling_document: "DoclingDocument") -> list["DocItem"]:
+         """Resolve doc_item_refs to actual DocItem objects.
+
+         Args:
+             docling_document: The parent DoclingDocument containing the items.
+
+         Returns:
+             List of resolved DocItem objects. Items that fail to resolve are skipped.
+         """
+         from docling_core.types.doc.document import RefItem
+
+         doc_items = []
+         for ref in self.doc_item_refs:
+             try:
+                 ref_item = RefItem.model_validate({"$ref": ref})
+                 doc_item = ref_item.resolve(docling_document)
+                 doc_items.append(doc_item)
+             except Exception:
+                 # Graceful degradation: skip refs that can't be resolved
+                 continue
+         return doc_items
+
+     def resolve_bounding_boxes(
+         self, docling_document: "DoclingDocument"
+     ) -> list[BoundingBox]:
+         """Resolve doc_item_refs to bounding boxes for visual grounding.
+
+         Args:
+             docling_document: The parent DoclingDocument containing the items.
+
+         Returns:
+             List of BoundingBox objects from resolved DocItems' provenance.
+         """
+         bounding_boxes = []
+         for doc_item in self.resolve_doc_items(docling_document):
+             prov = getattr(doc_item, "prov", None)
+             if not prov:
+                 continue
+             for prov_item in prov:
+                 bbox = getattr(prov_item, "bbox", None)
+                 if bbox is None:
+                     continue
+                 bounding_boxes.append(
+                     BoundingBox(
+                         page_no=prov_item.page_no,
+                         left=bbox.l,
+                         top=bbox.t,
+                         right=bbox.r,
+                         bottom=bbox.b,
+                     )
+                 )
+         return bounding_boxes
+

  class Chunk(BaseModel):
      """
@@ -15,3 +102,106 @@ class Chunk(BaseModel):
      document_title: str | None = None
      document_meta: dict = {}
      embedding: list[float] | None = None
+
+     def get_chunk_metadata(self) -> ChunkMetadata:
+         """Parse metadata dict into structured ChunkMetadata."""
+         return ChunkMetadata.model_validate(self.metadata)
+
+
+ class SearchResult(BaseModel):
+     """Search result with optional provenance information for citations."""
+
+     content: str
+     score: float
+     chunk_id: str | None = None
+     document_id: str | None = None
+     document_uri: str | None = None
+     document_title: str | None = None
+     doc_item_refs: list[str] = []
+     page_numbers: list[int] = []
+     headings: list[str] | None = None
+     labels: list[str] = []
+
+     @classmethod
+     def from_chunk(
+         cls,
+         chunk: "Chunk",
+         score: float,
+     ) -> "SearchResult":
+         """Create from a Chunk."""
+         meta = chunk.get_chunk_metadata()
+         return cls(
+             content=chunk.content,
+             score=score,
+             chunk_id=chunk.id,
+             document_id=chunk.document_id,
+             document_uri=chunk.document_uri,
+             document_title=chunk.document_title,
+             doc_item_refs=meta.doc_item_refs,
+             page_numbers=meta.page_numbers,
+             headings=meta.headings,
+             labels=meta.labels,
+         )
+
+     def format_for_agent(self) -> str:
+         """Format this search result for inclusion in agent context.
+
+         Produces a structured format with metadata that helps LLMs understand
+         the source and nature of the content.
+         """
+         parts = [f"[{self.chunk_id}] (score: {self.score:.2f})"]
+
+         # Document source info
+         source_parts = []
+         if self.document_title:
+             source_parts.append(f'"{self.document_title}"')
+         if self.headings:
+             source_parts.append(" > ".join(self.headings))
+         if source_parts:
+             parts.append(f"Source: {' > '.join(source_parts)}")
+
+         # Content type (use primary label if available)
+         if self.labels:
+             primary_label = self._get_primary_label()
+             if primary_label:
+                 parts.append(f"Type: {primary_label}")
+
+         # The actual content
+         parts.append(f"Content:\n{self.content}")
+
+         return "\n".join(parts)
+
+     def _get_primary_label(self) -> str | None:
+         """Get the most significant label for display.
+
+         Prioritizes structural labels over text labels.
+         """
+         if not self.labels:
+             return None
+
+         # Priority order: structural > contextual > text
+         priority = {
+             "table": 1,
+             "code": 2,
+             "form": 3,
+             "key_value_region": 4,
+             "list_item": 5,
+             "formula": 6,
+             "chart": 7,
+             "picture": 8,
+             "caption": 9,
+             "footnote": 10,
+             "section_header": 11,
+             "title": 12,
+         }
+
+         # Find highest priority label
+         best_label = None
+         best_priority = float("inf")
+         for label in self.labels:
+             if label in priority and priority[label] < best_priority:
+                 best_label = label
+                 best_priority = priority[label]
+
+         # Return best structural/special label, or first label if all are text
+         return best_label if best_label else self.labels[0]
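
For illustration, a SearchResult built with made-up values and rendered with format_for_agent(); the commented output follows directly from the code above (in practice results are produced via SearchResult.from_chunk):

from haiku.rag.store.models.chunk import SearchResult

result = SearchResult(
    content="Revenue grew 12% year over year.",  # illustrative values only
    score=0.87,
    chunk_id="c-123",
    document_title="Annual Report 2024",
    headings=["Financials", "Revenue"],
    labels=["table"],
)
print(result.format_for_agent())
# [c-123] (score: 0.87)
# Source: "Annual Report 2024" > Financials > Revenue
# Type: table
# Content:
# Revenue grew 12% year over year.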
@@ -1,7 +1,32 @@
  from datetime import datetime
+ from typing import TYPE_CHECKING

+ from cachetools import LRUCache
  from pydantic import BaseModel, Field

+ if TYPE_CHECKING:
+     from docling_core.types.doc.document import DoclingDocument
+
+
+ _docling_document_cache: LRUCache[str, "DoclingDocument"] = LRUCache(maxsize=100)
+
+
+ def _get_cached_docling_document(document_id: str, json_str: str) -> "DoclingDocument":
+     """Get or parse DoclingDocument with LRU caching by document ID."""
+     if document_id in _docling_document_cache:
+         return _docling_document_cache[document_id]
+
+     from docling_core.types.doc.document import DoclingDocument
+
+     doc = DoclingDocument.model_validate_json(json_str)
+     _docling_document_cache[document_id] = doc
+     return doc
+
+
+ def invalidate_docling_document_cache(document_id: str) -> None:
+     """Remove a document from the DoclingDocument cache."""
+     _docling_document_cache.pop(document_id, None)
+

  class Document(BaseModel):
      """
@@ -13,5 +38,26 @@ class Document(BaseModel):
      uri: str | None = None
      title: str | None = None
      metadata: dict = {}
+     docling_document_json: str | None = None
+     docling_version: str | None = None
      created_at: datetime = Field(default_factory=datetime.now)
      updated_at: datetime = Field(default_factory=datetime.now)
+
+     def get_docling_document(self) -> "DoclingDocument | None":
+         """Parse and return the stored DoclingDocument.
+
+         Uses LRU cache (keyed by document ID) to avoid repeated parsing.
+
+         Returns:
+             The parsed DoclingDocument, or None if not stored or no ID.
+         """
+         if self.docling_document_json is None:
+             return None
+
+         # No caching for documents without ID
+         if self.id is None:
+             from docling_core.types.doc.document import DoclingDocument
+
+             return DoclingDocument.model_validate_json(self.docling_document_json)
+
+         return _get_cached_docling_document(self.id, self.docling_document_json)
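
A minimal sketch of working with the new cache: after a document's stored JSON changes, invalidate_docling_document_cache is the hook that makes the next get_docling_document() call re-parse instead of serving a stale entry (the function name below is illustrative; Document and the cache helpers are from the diff above).

from haiku.rag.store.models.document import Document, invalidate_docling_document_cache

def reload_docling_document(doc: Document):
    """Drop any cached parse for this document, then re-parse from the stored JSON."""
    if doc.id is not None:
        invalidate_docling_document_cache(doc.id)
    # Returns None when no docling_document_json is stored for this document.
    return doc.get_docling_document()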
@@ -1,21 +1,20 @@
- import inspect
  import json
  import logging
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, cast
  from uuid import uuid4

  if TYPE_CHECKING:
      import pandas as pd
-     from lancedb.query import LanceQueryBuilder
+     from lancedb.query import (
+         LanceHybridQueryBuilder,
+         LanceQueryBuilder,
+         LanceVectorQueryBuilder,
+     )

  from lancedb.rerankers import RRFReranker

  from haiku.rag.store.engine import DocumentRecord, Store
  from haiku.rag.store.models.chunk import Chunk
- from haiku.rag.utils import load_callable
-
- if TYPE_CHECKING:
-     from docling_core.types.doc.document import DoclingDocument

  logger = logging.getLogger(__name__)

@@ -28,43 +27,87 @@ class ChunkRepository:
          self.embedder = store.embedder

      def _ensure_fts_index(self) -> None:
-         """Ensure FTS index exists on the content column."""
+         """Ensure FTS index exists on the content_fts column."""
          try:
              self.store.chunks_table.create_fts_index(
-                 "content", replace=True, with_position=True, remove_stop_words=False
+                 "content_fts", replace=True, with_position=True, remove_stop_words=False
              )
          except Exception as e:
              # Log the error but don't fail - FTS might already exist
              logger.debug(f"FTS index creation skipped: {e}")

-     async def create(self, entity: Chunk) -> Chunk:
-         """Create a chunk in the database."""
-         assert entity.document_id, "Chunk must have a document_id to be created"
+     def _contextualize_content(self, chunk: Chunk) -> str:
+         """Generate contextualized content for FTS by prepending headings."""
+         meta = chunk.get_chunk_metadata()
+         if meta.headings:
+             return "\n".join(meta.headings) + "\n" + chunk.content
+         return chunk.content

-         chunk_id = str(uuid4())
+     async def create(self, entity: Chunk | list[Chunk]) -> Chunk | list[Chunk]:
+         """Create one or more chunks in the database.

-         # Generate embedding if not provided
-         if entity.embedding is not None:
-             embedding = entity.embedding
-         else:
-             embedding = await self.embedder.embed(entity.content)
-         order_val = int(entity.order)
-
-         chunk_record = self.store.ChunkRecord(
-             id=chunk_id,
-             document_id=entity.document_id,
-             content=entity.content,
-             metadata=json.dumps(
-                 {k: v for k, v in entity.metadata.items() if k != "order"}
-             ),
-             order=order_val,
-             vector=embedding,
-         )
+         Chunks must have embeddings set before calling this method.
+         Use client._ensure_chunks_embedded() to embed chunks if needed.
+         """
+         self.store._assert_writable()
+         # Handle single chunk
+         if isinstance(entity, Chunk):
+             assert entity.document_id, "Chunk must have a document_id to be created"
+             assert entity.embedding is not None, "Chunk must have an embedding"

-         self.store.chunks_table.add([chunk_record])
+             chunk_id = str(uuid4())

-         entity.id = chunk_id
-         return entity
+             chunk_record = self.store.ChunkRecord(
+                 id=chunk_id,
+                 document_id=entity.document_id,
+                 content=entity.content,
+                 content_fts=self._contextualize_content(entity),
+                 metadata=json.dumps(
+                     {k: v for k, v in entity.metadata.items() if k != "order"}
+                 ),
+                 order=int(entity.order),
+                 vector=entity.embedding,
+             )
+
+             self.store.chunks_table.add([chunk_record])
+
+             entity.id = chunk_id
+             return entity
+
+         # Handle batch of chunks
+         chunks = entity
+         if not chunks:
+             return []
+
+         # Validate all chunks have document_id and embedding
+         for chunk in chunks:
+             assert chunk.document_id, "All chunks must have a document_id to be created"
+             assert chunk.embedding is not None, "All chunks must have embeddings"
+
+         # Prepare all chunk records
+         chunk_records = []
+         for chunk in chunks:
+             chunk_id = str(uuid4())
+
+             assert chunk.document_id is not None
+             chunk_record = self.store.ChunkRecord(
+                 id=chunk_id,
+                 document_id=chunk.document_id,
+                 content=chunk.content,
+                 content_fts=self._contextualize_content(chunk),
+                 metadata=json.dumps(
+                     {k: v for k, v in chunk.metadata.items() if k != "order"}
+                 ),
+                 order=int(chunk.order),
+                 vector=chunk.embedding,
+             )
+             chunk_records.append(chunk_record)
+             chunk.id = chunk_id
+
+         # Single batch insert for all chunks
+         self.store.chunks_table.add(chunk_records)
+
+         return chunks

      async def get_by_id(self, entity_id: str) -> Chunk | None:
          """Get a chunk by its ID."""
@@ -89,28 +132,32 @@ class ChunkRepository:
          )

      async def update(self, entity: Chunk) -> Chunk:
-         """Update an existing chunk."""
-         assert entity.id, "Chunk ID is required for update"
+         """Update an existing chunk.

-         embedding = await self.embedder.embed(entity.content)
-         order_val = int(entity.order)
+         Chunk must have embedding set before calling this method.
+         """
+         self.store._assert_writable()
+         assert entity.id, "Chunk ID is required for update"
+         assert entity.embedding is not None, "Chunk must have an embedding"

          self.store.chunks_table.update(
              where=f"id = '{entity.id}'",
              values={
                  "document_id": entity.document_id,
                  "content": entity.content,
+                 "content_fts": self._contextualize_content(entity),
                  "metadata": json.dumps(
                      {k: v for k, v in entity.metadata.items() if k != "order"}
                  ),
-                 "order": order_val,
-                 "vector": embedding,
+                 "order": int(entity.order),
+                 "vector": entity.embedding,
              },
          )
          return entity

      async def delete(self, entity_id: str) -> bool:
          """Delete a chunk by its ID."""
+         self.store._assert_writable()
          chunk = await self.get_by_id(entity_id)
          if chunk is None:
              return False
@@ -145,86 +192,22 @@ class ChunkRepository:
          )
          return chunks

-     async def create_chunks_for_document(
-         self, document_id: str, document: "DoclingDocument"
-     ) -> list[Chunk]:
-         """Create chunks and embeddings for a document from DoclingDocument."""
-         # Lazy imports to avoid loading docling during module import
-         from haiku.rag.chunker import chunker
-         from haiku.rag.utils import text_to_docling_document
-
-         # Optionally preprocess markdown before chunking
-         processed_document = document
-         preprocessor_path = self.store._config.processing.markdown_preprocessor
-         if preprocessor_path:
-             try:
-                 pre_fn = load_callable(preprocessor_path)
-                 markdown = document.export_to_markdown()
-                 result = pre_fn(markdown)
-                 if inspect.isawaitable(result):
-                     result = await result # type: ignore[assignment]
-                 processed_markdown = result
-                 if not isinstance(processed_markdown, str):
-                     raise ValueError("Preprocessor must return a markdown string")
-                 processed_document = text_to_docling_document(
-                     processed_markdown, name="content.md"
-                 )
-             except Exception as e:
-                 logger.error(
-                     f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
-                 )
-                 raise e
-
-         chunk_texts = await chunker.chunk(processed_document)
-
-         embeddings = await self.embedder.embed(chunk_texts)
-
-         # Prepare all chunk records for batch insertion
-         chunk_records = []
-         created_chunks = []
-
-         for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
-             chunk_id = str(uuid4())
-
-             chunk_record = self.store.ChunkRecord(
-                 id=chunk_id,
-                 document_id=document_id,
-                 content=chunk_text,
-                 metadata=json.dumps({}),
-                 order=order,
-                 vector=embedding,
-             )
-             chunk_records.append(chunk_record)
-
-             chunk = Chunk(
-                 id=chunk_id,
-                 document_id=document_id,
-                 content=chunk_text,
-                 metadata={},
-                 order=order,
-             )
-             created_chunks.append(chunk)
-
-         # Batch insert all chunks at once
-         if chunk_records:
-             self.store.chunks_table.add(chunk_records)
-
-         return created_chunks
-
      async def delete_all(self) -> None:
          """Delete all chunks from the database."""
+         self.store._assert_writable()
          # Drop and recreate table to clear all data
          self.store.db.drop_table("chunks")
          self.store.chunks_table = self.store.db.create_table(
              "chunks", schema=self.store.ChunkRecord
          )
-         # Create FTS index on the new table with phrase query support
+         # Create FTS index on content_fts (contextualized content) for better search
          self.store.chunks_table.create_fts_index(
-             "content", replace=True, with_position=True, remove_stop_words=False
+             "content_fts", replace=True, with_position=True, remove_stop_words=False
          )

      async def delete_by_document_id(self, document_id: str) -> bool:
          """Delete all chunks for a document."""
+         self.store._assert_writable()
          chunks = await self.get_by_document_id(document_id)

          if not chunks:
@@ -272,25 +255,34 @@ class ChunkRepository:

          # Prepare search query based on search type
          if search_type == "vector":
-             query_embedding = await self.embedder.embed(query)
-             results = self.store.chunks_table.search(
-                 query_embedding, query_type="vector", vector_column_name="vector"
+             query_embedding = await self.embedder.embed_query(query)
+             vector_query = cast(
+                 "LanceVectorQueryBuilder",
+                 self.store.chunks_table.search(
+                     query_embedding, query_type="vector", vector_column_name="vector"
+                 ),
+             )
+             results = vector_query.refine_factor(
+                 self.store._config.search.vector_refine_factor
              )

          elif search_type == "fts":
              results = self.store.chunks_table.search(query, query_type="fts")

          else: # hybrid (default)
-             query_embedding = await self.embedder.embed(query)
+             query_embedding = await self.embedder.embed_query(query)
              # Create RRF reranker
              reranker = RRFReranker()
              # Perform native hybrid search with RRF reranking
-             results = (
+             hybrid_query = cast(
+                 "LanceHybridQueryBuilder",
                  self.store.chunks_table.search(query_type="hybrid")
                  .vector(query_embedding)
-                 .text(query)
-                 .rerank(reranker)
+                 .text(query),
              )
+             results = hybrid_query.refine_factor(
+                 self.store._config.search.vector_refine_factor
+             ).rerank(reranker)

          # Apply filtering if needed (common for all search types)
          if filtered_doc_ids is not None:
@@ -304,13 +296,30 @@ class ChunkRepository:
          results = results.limit(limit)
          return await self._process_search_results(results)

-     async def get_by_document_id(self, document_id: str) -> list[Chunk]:
-         """Get all chunks for a specific document."""
-         results = list(
-             self.store.chunks_table.search()
-             .where(f"document_id = '{document_id}'")
-             .to_pydantic(self.store.ChunkRecord)
-         )
+     async def get_by_document_id(
+         self,
+         document_id: str,
+         limit: int | None = None,
+         offset: int | None = None,
+     ) -> list[Chunk]:
+         """Get chunks for a specific document with optional pagination.
+
+         Args:
+             document_id: The document ID to get chunks for.
+             limit: Maximum number of chunks to return. None for all.
+             offset: Number of chunks to skip. None for no offset.
+
+         Returns:
+             List of chunks ordered by their order field.
+         """
+         query = self.store.chunks_table.search().where(f"document_id = '{document_id}'")
+
+         if offset is not None:
+             query = query.offset(offset)
+         if limit is not None:
+             query = query.limit(limit)
+
+         results = list(query.to_pydantic(self.store.ChunkRecord))

          # Get document info
          doc_results = list(
@@ -343,6 +352,16 @@ class ChunkRepository:
          chunks.sort(key=lambda c: c.order)
          return chunks

+     async def count_by_document_id(self, document_id: str) -> int:
+         """Count the number of chunks for a specific document."""
+         df = (
+             self.store.chunks_table.search()
+             .select(["id"])
+             .where(f"document_id = '{document_id}'")
+             .to_pandas()
+         )
+         return len(df)
+

      async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
          """Get adjacent chunks before and after the given chunk within the same document."""
          assert chunk.document_id, "Document id is required for adjacent chunk finding"
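
A small sketch combining the new limit/offset parameters with count_by_document_id to walk a large document's chunks page by page (repo again stands for a ChunkRepository instance; the generator name and page size are illustrative):

async def iter_document_chunks(repo, document_id: str, page_size: int = 50):
    """Illustrative: yield a document's chunks in fixed-size pages."""
    total = await repo.count_by_document_id(document_id)
    for offset in range(0, total, page_size):
        page = await repo.get_by_document_id(document_id, limit=page_size, offset=offset)
        for chunk in page:
            yield chunk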
@@ -400,6 +419,7 @@ class ChunkRepository:
                  id=str(row["id"]),
                  document_id=str(row["document_id"]),
                  content=str(row["content"]),
+                 content_fts=str(row.get("content_fts", "")),
                  metadata=str(row["metadata"]),
                  order=int(row["order"]) if "order" in row else 0,
              )