haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag-slim might be problematic. Click here for more details.

Files changed (94) hide show
  1. haiku/rag/app.py +430 -72
  2. haiku/rag/chunkers/__init__.py +31 -0
  3. haiku/rag/chunkers/base.py +31 -0
  4. haiku/rag/chunkers/docling_local.py +164 -0
  5. haiku/rag/chunkers/docling_serve.py +179 -0
  6. haiku/rag/cli.py +207 -24
  7. haiku/rag/cli_chat.py +489 -0
  8. haiku/rag/client.py +1251 -266
  9. haiku/rag/config/__init__.py +16 -10
  10. haiku/rag/config/loader.py +5 -44
  11. haiku/rag/config/models.py +126 -17
  12. haiku/rag/converters/__init__.py +31 -0
  13. haiku/rag/converters/base.py +63 -0
  14. haiku/rag/converters/docling_local.py +193 -0
  15. haiku/rag/converters/docling_serve.py +229 -0
  16. haiku/rag/converters/text_utils.py +237 -0
  17. haiku/rag/embeddings/__init__.py +123 -24
  18. haiku/rag/embeddings/voyageai.py +175 -20
  19. haiku/rag/graph/__init__.py +0 -11
  20. haiku/rag/graph/agui/__init__.py +8 -2
  21. haiku/rag/graph/agui/cli_renderer.py +1 -1
  22. haiku/rag/graph/agui/emitter.py +219 -31
  23. haiku/rag/graph/agui/server.py +20 -62
  24. haiku/rag/graph/agui/stream.py +1 -2
  25. haiku/rag/graph/research/__init__.py +5 -2
  26. haiku/rag/graph/research/dependencies.py +12 -126
  27. haiku/rag/graph/research/graph.py +390 -135
  28. haiku/rag/graph/research/models.py +91 -112
  29. haiku/rag/graph/research/prompts.py +99 -91
  30. haiku/rag/graph/research/state.py +35 -27
  31. haiku/rag/inspector/__init__.py +8 -0
  32. haiku/rag/inspector/app.py +259 -0
  33. haiku/rag/inspector/widgets/__init__.py +6 -0
  34. haiku/rag/inspector/widgets/chunk_list.py +100 -0
  35. haiku/rag/inspector/widgets/context_modal.py +89 -0
  36. haiku/rag/inspector/widgets/detail_view.py +130 -0
  37. haiku/rag/inspector/widgets/document_list.py +75 -0
  38. haiku/rag/inspector/widgets/info_modal.py +209 -0
  39. haiku/rag/inspector/widgets/search_modal.py +183 -0
  40. haiku/rag/inspector/widgets/visual_modal.py +126 -0
  41. haiku/rag/mcp.py +106 -102
  42. haiku/rag/monitor.py +33 -9
  43. haiku/rag/providers/__init__.py +5 -0
  44. haiku/rag/providers/docling_serve.py +108 -0
  45. haiku/rag/qa/__init__.py +12 -10
  46. haiku/rag/qa/agent.py +43 -61
  47. haiku/rag/qa/prompts.py +35 -57
  48. haiku/rag/reranking/__init__.py +9 -6
  49. haiku/rag/reranking/base.py +1 -1
  50. haiku/rag/reranking/cohere.py +5 -4
  51. haiku/rag/reranking/mxbai.py +5 -2
  52. haiku/rag/reranking/vllm.py +3 -4
  53. haiku/rag/reranking/zeroentropy.py +6 -5
  54. haiku/rag/store/__init__.py +2 -1
  55. haiku/rag/store/engine.py +242 -42
  56. haiku/rag/store/exceptions.py +4 -0
  57. haiku/rag/store/models/__init__.py +8 -2
  58. haiku/rag/store/models/chunk.py +190 -0
  59. haiku/rag/store/models/document.py +46 -0
  60. haiku/rag/store/repositories/chunk.py +141 -121
  61. haiku/rag/store/repositories/document.py +25 -84
  62. haiku/rag/store/repositories/settings.py +11 -14
  63. haiku/rag/store/upgrades/__init__.py +19 -3
  64. haiku/rag/store/upgrades/v0_10_1.py +1 -1
  65. haiku/rag/store/upgrades/v0_19_6.py +65 -0
  66. haiku/rag/store/upgrades/v0_20_0.py +68 -0
  67. haiku/rag/store/upgrades/v0_23_1.py +100 -0
  68. haiku/rag/store/upgrades/v0_9_3.py +3 -3
  69. haiku/rag/utils.py +371 -146
  70. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
  71. haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
  72. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
  73. haiku/rag/chunker.py +0 -65
  74. haiku/rag/embeddings/base.py +0 -25
  75. haiku/rag/embeddings/ollama.py +0 -28
  76. haiku/rag/embeddings/openai.py +0 -26
  77. haiku/rag/embeddings/vllm.py +0 -29
  78. haiku/rag/graph/agui/events.py +0 -254
  79. haiku/rag/graph/common/__init__.py +0 -5
  80. haiku/rag/graph/common/models.py +0 -42
  81. haiku/rag/graph/common/nodes.py +0 -265
  82. haiku/rag/graph/common/prompts.py +0 -46
  83. haiku/rag/graph/common/utils.py +0 -44
  84. haiku/rag/graph/deep_qa/__init__.py +0 -1
  85. haiku/rag/graph/deep_qa/dependencies.py +0 -27
  86. haiku/rag/graph/deep_qa/graph.py +0 -243
  87. haiku/rag/graph/deep_qa/models.py +0 -20
  88. haiku/rag/graph/deep_qa/prompts.py +0 -59
  89. haiku/rag/graph/deep_qa/state.py +0 -56
  90. haiku/rag/graph/research/common.py +0 -87
  91. haiku/rag/reader.py +0 -135
  92. haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
  93. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
  94. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,16 +1,14 @@
1
- import asyncio
2
1
  import json
3
2
  from datetime import datetime
4
- from typing import TYPE_CHECKING
5
3
  from uuid import uuid4
6
4
 
7
5
  from haiku.rag.store.engine import DocumentRecord, Store
8
6
  from haiku.rag.store.models.document import Document
9
7
 
10
- if TYPE_CHECKING:
11
- from docling_core.types.doc.document import DoclingDocument
12
8
 
13
- from haiku.rag.store.models.chunk import Chunk
9
+ def _escape_sql_string(value: str) -> str:
10
+ """Escape single quotes in SQL string literals."""
11
+ return value.replace("'", "''")
14
12
 
15
13
 
16
14
  class DocumentRepository:
@@ -37,6 +35,8 @@ class DocumentRepository:
37
35
  uri=record.uri,
38
36
  title=record.title,
39
37
  metadata=json.loads(record.metadata),
38
+ docling_document_json=record.docling_document_json,
39
+ docling_version=record.docling_version,
40
40
  created_at=datetime.fromisoformat(record.created_at)
41
41
  if record.created_at
42
42
  else datetime.now(),
@@ -47,6 +47,7 @@ class DocumentRepository:
47
47
 
48
48
  async def create(self, entity: Document) -> Document:
49
49
  """Create a document in the database."""
50
+ self.store._assert_writable()
50
51
  # Generate new UUID
51
52
  doc_id = str(uuid4())
52
53
 
@@ -60,6 +61,8 @@ class DocumentRepository:
60
61
  uri=entity.uri,
61
62
  title=entity.title,
62
63
  metadata=json.dumps(entity.metadata),
64
+ docling_document_json=entity.docling_document_json,
65
+ docling_version=entity.docling_version,
63
66
  created_at=now,
64
67
  updated_at=now,
65
68
  )
@@ -88,8 +91,14 @@ class DocumentRepository:
88
91
 
89
92
  async def update(self, entity: Document) -> Document:
90
93
  """Update an existing document."""
94
+ self.store._assert_writable()
95
+ from haiku.rag.store.models.document import invalidate_docling_document_cache
96
+
91
97
  assert entity.id, "Document ID is required for update"
92
98
 
99
+ # Invalidate cache before update
100
+ invalidate_docling_document_cache(entity.id)
101
+
93
102
  # Update timestamp
94
103
  now = datetime.now().isoformat()
95
104
  entity.updated_at = datetime.fromisoformat(now)
@@ -102,6 +111,8 @@ class DocumentRepository:
102
111
  "uri": entity.uri,
103
112
  "title": entity.title,
104
113
  "metadata": json.dumps(entity.metadata),
114
+ "docling_document_json": entity.docling_document_json,
115
+ "docling_version": entity.docling_version,
105
116
  "updated_at": now,
106
117
  },
107
118
  )
@@ -110,11 +121,17 @@ class DocumentRepository:
110
121
 
111
122
  async def delete(self, entity_id: str) -> bool:
112
123
  """Delete a document by its ID."""
124
+ self.store._assert_writable()
125
+ from haiku.rag.store.models.document import invalidate_docling_document_cache
126
+
113
127
  # Check if document exists
114
128
  doc = await self.get_by_id(entity_id)
115
129
  if doc is None:
116
130
  return False
117
131
 
132
+ # Invalidate cache before delete
133
+ invalidate_docling_document_cache(entity_id)
134
+
118
135
  # Delete associated chunks first
119
136
  await self.chunk_repository.delete_by_document_id(entity_id)
120
137
 
@@ -152,9 +169,10 @@ class DocumentRepository:
152
169
 
153
170
  async def get_by_uri(self, uri: str) -> Document | None:
154
171
  """Get a document by its URI."""
172
+ escaped_uri = _escape_sql_string(uri)
155
173
  results = list(
156
174
  self.store.documents_table.search()
157
- .where(f"uri = '{uri}'")
175
+ .where(f"uri = '{escaped_uri}'")
158
176
  .limit(1)
159
177
  .to_pydantic(DocumentRecord)
160
178
  )
@@ -166,6 +184,7 @@ class DocumentRepository:
166
184
 
167
185
  async def delete_all(self) -> None:
168
186
  """Delete all documents from the database."""
187
+ self.store._assert_writable()
169
188
  # Delete all chunks first
170
189
  await self.chunk_repository.delete_all()
171
190
 
@@ -181,81 +200,3 @@ class DocumentRepository:
181
200
  self.store.documents_table = self.store.db.create_table(
182
201
  "documents", schema=DocumentRecord
183
202
  )
184
-
185
- async def _create_and_chunk(
186
- self,
187
- entity: Document,
188
- docling_document: "DoclingDocument | None",
189
- chunks: list["Chunk"] | None = None,
190
- ) -> Document:
191
- """Create a document with its chunks and embeddings."""
192
- # Snapshot table versions for versioned rollback (if supported)
193
- versions = self.store.current_table_versions()
194
-
195
- # Create the document
196
- created_doc = await self.create(entity)
197
-
198
- # Attempt to create chunks; on failure, prefer version rollback
199
- try:
200
- # Create chunks if not provided
201
- if chunks is None:
202
- assert docling_document is not None, (
203
- "docling_document is required when chunks are not provided"
204
- )
205
- assert created_doc.id is not None, (
206
- "Document ID should not be None after creation"
207
- )
208
- await self.chunk_repository.create_chunks_for_document(
209
- created_doc.id, docling_document
210
- )
211
- else:
212
- # Use provided chunks, set order from list position
213
- assert created_doc.id is not None, (
214
- "Document ID should not be None after creation"
215
- )
216
- for order, chunk in enumerate(chunks):
217
- chunk.document_id = created_doc.id
218
- chunk.order = order
219
- await self.chunk_repository.create(chunk)
220
-
221
- # Vacuum old versions in background (non-blocking)
222
- asyncio.create_task(self.store.vacuum())
223
-
224
- return created_doc
225
- except Exception:
226
- # Roll back to the captured versions and re-raise
227
- self.store.restore_table_versions(versions)
228
- raise
229
-
230
- async def _update_and_rechunk(
231
- self, entity: Document, docling_document: "DoclingDocument"
232
- ) -> Document:
233
- """Update a document and regenerate its chunks."""
234
- assert entity.id is not None, "Document ID is required for update"
235
-
236
- # Snapshot table versions for versioned rollback
237
- versions = self.store.current_table_versions()
238
-
239
- # Delete existing chunks before writing new ones
240
- await self.chunk_repository.delete_by_document_id(entity.id)
241
-
242
- try:
243
- # Update the document
244
- updated_doc = await self.update(entity)
245
-
246
- # Create new chunks
247
- assert updated_doc.id is not None, (
248
- "Document ID should not be None after update"
249
- )
250
- await self.chunk_repository.create_chunks_for_document(
251
- updated_doc.id, docling_document
252
- )
253
-
254
- # Vacuum old versions in background (non-blocking)
255
- asyncio.create_task(self.store.vacuum())
256
-
257
- return updated_doc
258
- except Exception:
259
- # Roll back to the captured versions and re-raise
260
- self.store.restore_table_versions(versions)
261
- raise
@@ -72,6 +72,7 @@ class SettingsRepository:
72
72
 
73
73
  def save_current_settings(self) -> None:
74
74
  """Save the current configuration to the database."""
75
+ self.store._assert_writable()
75
76
  current_config = self.store._config.model_dump(mode="json")
76
77
 
77
78
  # Check if settings exist
@@ -118,25 +119,21 @@ class SettingsRepository:
118
119
  current_config = self.store._config.model_dump(mode="json")
119
120
 
120
121
  # Check if embedding provider or model has changed
121
- # Support both old flat structure and new nested structure for backward compatibility
122
+ # Both stored and current use nested structure: embeddings.model.{provider,name,vector_dim}
122
123
  stored_embeddings = stored_settings.get("embeddings", {})
123
124
  current_embeddings = current_config.get("embeddings", {})
124
125
 
125
- # Try nested structure first, fall back to flat for old databases
126
- stored_provider = stored_embeddings.get("provider") or stored_settings.get(
127
- "EMBEDDINGS_PROVIDER"
128
- )
129
- current_provider = current_embeddings.get("provider")
126
+ stored_model_obj = stored_embeddings.get("model", {})
127
+ current_model_obj = current_embeddings.get("model", {})
130
128
 
131
- stored_model = stored_embeddings.get("model") or stored_settings.get(
132
- "EMBEDDINGS_MODEL"
133
- )
134
- current_model = current_embeddings.get("model")
129
+ stored_provider = stored_model_obj.get("provider")
130
+ current_provider = current_model_obj.get("provider")
135
131
 
136
- stored_vector_dim = stored_embeddings.get("vector_dim") or stored_settings.get(
137
- "EMBEDDINGS_VECTOR_DIM"
138
- )
139
- current_vector_dim = current_embeddings.get("vector_dim")
132
+ stored_model = stored_model_obj.get("name")
133
+ current_model = current_model_obj.get("name")
134
+
135
+ stored_vector_dim = stored_model_obj.get("vector_dim")
136
+ current_vector_dim = current_model_obj.get("vector_dim")
140
137
 
141
138
  # Check for incompatible changes
142
139
  incompatible_changes = []
@@ -53,10 +53,26 @@ def run_pending_upgrades(store: Store, from_version: str, to_version: str) -> No
53
53
  logger.info("Completed upgrade %s", step.version)
54
54
 
55
55
 
56
- from .v0_9_3 import upgrade_fts_phrase as upgrade_0_9_3_fts # noqa: E402
57
- from .v0_9_3 import upgrade_order as upgrade_0_9_3_order # noqa: E402
58
- from .v0_10_1 import upgrade_add_title as upgrade_0_10_1_add_title # noqa: E402
56
+ # Import upgrade modules AFTER Upgrade class is defined to avoid circular imports
57
+ # ruff: noqa: E402, I001
58
+ from haiku.rag.store.upgrades.v0_9_3 import upgrade_fts_phrase as upgrade_0_9_3_fts
59
+ from haiku.rag.store.upgrades.v0_9_3 import upgrade_order as upgrade_0_9_3_order
60
+ from haiku.rag.store.upgrades.v0_10_1 import (
61
+ upgrade_add_title as upgrade_0_10_1_add_title,
62
+ )
63
+ from haiku.rag.store.upgrades.v0_19_6 import (
64
+ upgrade_embeddings_model_config as upgrade_0_19_6_embeddings,
65
+ )
66
+ from haiku.rag.store.upgrades.v0_20_0 import (
67
+ upgrade_add_docling_document as upgrade_0_20_0_docling,
68
+ )
69
+ from haiku.rag.store.upgrades.v0_23_1 import (
70
+ upgrade_contextualize_chunks as upgrade_0_23_1_contextualize,
71
+ )
59
72
 
60
73
  upgrades.append(upgrade_0_9_3_order)
61
74
  upgrades.append(upgrade_0_9_3_fts)
62
75
  upgrades.append(upgrade_0_10_1_add_title)
76
+ upgrades.append(upgrade_0_19_6_embeddings)
77
+ upgrades.append(upgrade_0_20_0_docling)
78
+ upgrades.append(upgrade_0_23_1_contextualize)
@@ -7,7 +7,7 @@ from haiku.rag.store.engine import Store
7
7
  from haiku.rag.store.upgrades import Upgrade
8
8
 
9
9
 
10
- def _apply_add_document_title(store: Store) -> None:
10
+ def _apply_add_document_title(store: Store) -> None: # pragma: no cover
11
11
  """Add a nullable 'title' column to the documents table."""
12
12
 
13
13
  # Read existing rows using Arrow for schema-agnostic access
@@ -0,0 +1,65 @@
1
+ import json
2
+ import logging
3
+
4
+ from haiku.rag.store.engine import SettingsRecord, Store
5
+ from haiku.rag.store.upgrades import Upgrade
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ def _apply_embeddings_model_config(store: Store) -> None: # pragma: no cover
11
+ """Migrate embeddings config from flat to nested EmbeddingModelConfig structure."""
12
+ results = list(
13
+ store.settings_table.search()
14
+ .where("id = 'settings'")
15
+ .limit(1)
16
+ .to_pydantic(SettingsRecord)
17
+ )
18
+
19
+ if not results or not results[0].settings:
20
+ return
21
+
22
+ settings = json.loads(results[0].settings)
23
+ embeddings = settings.get("embeddings", {})
24
+
25
+ # Check if already migrated (model is a dict with nested structure)
26
+ if isinstance(embeddings.get("model"), dict):
27
+ return
28
+
29
+ # Migrate from flat structure to nested EmbeddingModelConfig
30
+ old_provider = embeddings.get("provider", "ollama")
31
+ old_model = embeddings.get("model", "qwen3-embedding:4b")
32
+ old_vector_dim = embeddings.get("vector_dim", 2560)
33
+
34
+ logger.info(
35
+ "Migrating embeddings config to new nested structure: "
36
+ "embeddings.{provider,model,vector_dim} -> embeddings.model.{provider,name,vector_dim}"
37
+ )
38
+
39
+ # Create new nested structure
40
+ settings["embeddings"] = {
41
+ "model": {
42
+ "provider": old_provider,
43
+ "name": old_model,
44
+ "vector_dim": old_vector_dim,
45
+ }
46
+ }
47
+
48
+ store.settings_table.update(
49
+ where="id = 'settings'",
50
+ values={"settings": json.dumps(settings)},
51
+ )
52
+
53
+ logger.info(
54
+ "Embeddings config migrated: provider=%s, name=%s, vector_dim=%d",
55
+ old_provider,
56
+ old_model,
57
+ old_vector_dim,
58
+ )
59
+
60
+
61
+ upgrade_embeddings_model_config = Upgrade(
62
+ version="0.19.6",
63
+ apply=_apply_embeddings_model_config,
64
+ description="Migrate embeddings config to nested EmbeddingModelConfig structure",
65
+ )
@@ -0,0 +1,68 @@
1
+ import json
2
+
3
+ from lancedb.pydantic import LanceModel
4
+ from pydantic import Field
5
+
6
+ from haiku.rag.store.engine import Store
7
+ from haiku.rag.store.upgrades import Upgrade
8
+
9
+
10
+ def _apply_add_docling_document_columns(store: Store) -> None: # pragma: no cover
11
+ """Add 'docling_document_json' and 'docling_version' columns to documents table."""
12
+
13
+ # Read existing rows using Arrow for schema-agnostic access
14
+ try:
15
+ docs_arrow = store.documents_table.search().to_arrow()
16
+ rows = docs_arrow.to_pylist()
17
+ except Exception:
18
+ rows = []
19
+
20
+ class DocumentRecordV3(LanceModel):
21
+ id: str
22
+ content: str
23
+ uri: str | None = None
24
+ title: str | None = None
25
+ metadata: str = Field(default="{}")
26
+ docling_document_json: str | None = None
27
+ docling_version: str | None = None
28
+ created_at: str = Field(default_factory=lambda: "")
29
+ updated_at: str = Field(default_factory=lambda: "")
30
+
31
+ # Drop and recreate documents table with the new schema
32
+ try:
33
+ store.db.drop_table("documents")
34
+ except Exception:
35
+ pass
36
+
37
+ store.documents_table = store.db.create_table("documents", schema=DocumentRecordV3)
38
+
39
+ # Reinsert previous rows with new columns as None
40
+ if rows:
41
+ backfilled = []
42
+ for row in rows:
43
+ backfilled.append(
44
+ DocumentRecordV3(
45
+ id=row.get("id"),
46
+ content=row.get("content", ""),
47
+ uri=row.get("uri"),
48
+ title=row.get("title"),
49
+ metadata=(
50
+ row.get("metadata")
51
+ if isinstance(row.get("metadata"), str)
52
+ else json.dumps(row.get("metadata") or {})
53
+ ),
54
+ docling_document_json=None,
55
+ docling_version=None,
56
+ created_at=row.get("created_at", ""),
57
+ updated_at=row.get("updated_at", ""),
58
+ )
59
+ )
60
+
61
+ store.documents_table.add(backfilled)
62
+
63
+
64
+ upgrade_add_docling_document = Upgrade(
65
+ version="0.20.0",
66
+ apply=_apply_add_docling_document_columns,
67
+ description="Add 'docling_document_json' and 'docling_version' columns to documents table",
68
+ )
@@ -0,0 +1,100 @@
1
+ import json
2
+
3
+ from lancedb.pydantic import LanceModel, Vector
4
+ from pydantic import Field
5
+
6
+ from haiku.rag.store.engine import Store
7
+ from haiku.rag.store.upgrades import Upgrade
8
+
9
+
10
+ def _apply_add_content_fts(store: Store) -> None: # pragma: no cover
11
+ """Add content_fts column with contextualized content for better FTS."""
12
+ # Read existing chunks
13
+ try:
14
+ chunks_arrow = store.chunks_table.search().to_arrow()
15
+ rows = chunks_arrow.to_pylist()
16
+ except Exception:
17
+ return
18
+
19
+ if not rows:
20
+ return
21
+
22
+ # Infer vector dimensions from first row
23
+ vec = rows[0].get("vector")
24
+ if not isinstance(vec, list) or not vec:
25
+ return
26
+ vector_dim = len(vec)
27
+
28
+ class ChunkRecord(LanceModel):
29
+ id: str
30
+ document_id: str
31
+ content: str
32
+ content_fts: str = Field(default="")
33
+ metadata: str = Field(default="{}")
34
+ order: int = Field(default=0)
35
+ vector: Vector(vector_dim) = Field( # type: ignore
36
+ default_factory=lambda: [0.0] * vector_dim
37
+ )
38
+
39
+ # Drop and recreate table with new schema
40
+ try:
41
+ store.db.drop_table("chunks")
42
+ except Exception:
43
+ pass
44
+
45
+ store.chunks_table = store.db.create_table("chunks", schema=ChunkRecord)
46
+
47
+ # Populate content_fts with contextualized content
48
+ new_records: list[ChunkRecord] = []
49
+ for row in rows:
50
+ metadata_raw = row.get("metadata") or "{}"
51
+ try:
52
+ metadata = (
53
+ json.loads(metadata_raw)
54
+ if isinstance(metadata_raw, str)
55
+ else metadata_raw
56
+ )
57
+ except Exception:
58
+ metadata = {}
59
+
60
+ headings = metadata.get("headings") if isinstance(metadata, dict) else None
61
+ content = row.get("content", "")
62
+
63
+ # Build contextualized content for FTS
64
+ if headings:
65
+ content_fts = "\n".join(headings) + "\n" + content
66
+ else:
67
+ content_fts = content
68
+
69
+ new_records.append(
70
+ ChunkRecord(
71
+ id=row.get("id"),
72
+ document_id=row.get("document_id"),
73
+ content=content,
74
+ content_fts=content_fts,
75
+ metadata=metadata_raw,
76
+ order=row.get("order", 0),
77
+ vector=row.get("vector") or [0.0] * vector_dim,
78
+ )
79
+ )
80
+
81
+ if new_records:
82
+ store.chunks_table.add(new_records)
83
+
84
+ # Drop old FTS index on content column if it exists
85
+ try:
86
+ store.chunks_table.drop_index("content_idx")
87
+ except Exception:
88
+ pass
89
+
90
+ # Create FTS index on content_fts
91
+ store.chunks_table.create_fts_index(
92
+ "content_fts", replace=True, with_position=True, remove_stop_words=False
93
+ )
94
+
95
+
96
+ upgrade_contextualize_chunks = Upgrade(
97
+ version="0.23.1",
98
+ apply=_apply_add_content_fts,
99
+ description="Add content_fts column for contextualized FTS search",
100
+ )
@@ -7,7 +7,7 @@ from haiku.rag.store.engine import Store
7
7
  from haiku.rag.store.upgrades import Upgrade
8
8
 
9
9
 
10
- def _infer_vector_dim(store: Store) -> int:
10
+ def _infer_vector_dim(store: Store) -> int: # pragma: no cover
11
11
  """Infer vector dimension from existing data; fallback to embedder config."""
12
12
  try:
13
13
  arrow = store.chunks_table.search().limit(1).to_arrow()
@@ -22,7 +22,7 @@ def _infer_vector_dim(store: Store) -> int:
22
22
  return getattr(store.embedder, "_vector_dim", 1024)
23
23
 
24
24
 
25
- def _apply_chunk_order(store: Store) -> None:
25
+ def _apply_chunk_order(store: Store) -> None: # pragma: no cover
26
26
  """Add integer 'order' column to chunks and backfill from metadata."""
27
27
 
28
28
  vector_dim = _infer_vector_dim(store)
@@ -95,7 +95,7 @@ upgrade_order = Upgrade(
95
95
  )
96
96
 
97
97
 
98
- def _apply_fts_phrase_support(store: Store) -> None:
98
+ def _apply_fts_phrase_support(store: Store) -> None: # pragma: no cover
99
99
  """Recreate FTS index with phrase query support and no stop-word removal."""
100
100
  try:
101
101
  store.chunks_table.create_fts_index(