kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kiln-ai might be problematic.

Files changed (133)
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,1841 @@
+ import asyncio
+ import os
+ import random
+ import uuid
+ from pathlib import Path
+ from typing import Callable, List
+ from unittest.mock import patch
+
+ import pytest
+ from llama_index.core.schema import MetadataMode, NodeRelationship
+ from llama_index.core.vector_stores.types import VectorStoreQueryResult
+ from llama_index.vector_stores.lancedb.base import TableNotFoundError
+
+ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
+     DocumentWithChunksAndEmbeddings,
+     SearchResult,
+     VectorStoreQuery,
+ )
+ from kiln_ai.adapters.vector_store.lancedb_adapter import LanceDBAdapter
+ from kiln_ai.adapters.vector_store.vector_store_registry import (
+     vector_store_adapter_for_config,
+ )
+ from kiln_ai.datamodel.basemodel import KilnAttachmentModel
+ from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
+ from kiln_ai.datamodel.datamodel_enums import ModelProviderName
+ from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding, EmbeddingConfig
+ from kiln_ai.datamodel.rag import RagConfig
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig, VectorStoreType
+ from kiln_ai.utils.config import Config
+
+
+ def get_all_nodes(adapter: LanceDBAdapter) -> List[SearchResult]:
+     nodes = adapter.lancedb_vector_store.get_nodes()
+     return [
+         SearchResult(
+             document_id=node.metadata["kiln_doc_id"],
+             chunk_idx=node.metadata["kiln_chunk_idx"],
+             chunk_text=node.get_content(MetadataMode.NONE),
+             similarity=None,
+         )
+         for node in nodes
+     ]
+
+
+ @pytest.fixture(autouse=True)
+ def patch_settings_dir(tmp_path):
+     with patch("kiln_ai.utils.config.Config.settings_dir", return_value=tmp_path):
+         yield
+
+
+ @pytest.fixture
+ def hybrid_vector_store_config():
+     """Create a vector store config for testing."""
+     return VectorStoreConfig(
+         name="test_config",
+         store_type=VectorStoreType.LANCE_DB_HYBRID,
+         properties={
+             "similarity_top_k": 10,
+             "nprobes": 10,
+             "overfetch_factor": 10,
+             "vector_column_name": "vector",
+             "text_key": "text",
+             "doc_id_key": "doc_id",
+         },
+     )
+
+
+ @pytest.fixture
+ def fts_vector_store_config():
+     """Create a vector store config for testing."""
+     return VectorStoreConfig(
+         name="test_config",
+         store_type=VectorStoreType.LANCE_DB_FTS,
+         properties={
+             "similarity_top_k": 10,
+             "overfetch_factor": 10,
+             "vector_column_name": "vector",
+             "text_key": "text",
+             "doc_id_key": "doc_id",
+         },
+     )
+
+
+ @pytest.fixture
+ def knn_vector_store_config():
+     """Create a vector store config for testing."""
+     return VectorStoreConfig(
+         name="test_config",
+         store_type=VectorStoreType.LANCE_DB_VECTOR,
+         properties={
+             "similarity_top_k": 10,
+             "nprobes": 10,
+             "overfetch_factor": 10,
+             "vector_column_name": "vector",
+             "text_key": "text",
+             "doc_id_key": "doc_id",
+         },
+     )
+
+
+ @pytest.fixture
+ def embedding_config():
+     """Create an embedding config for testing."""
+     return EmbeddingConfig(
+         name="test_embedding",
+         model_provider_name=ModelProviderName.openai,
+         model_name="text-embedding-ada-002",
+         properties={},
+     )
+
+
+ @pytest.fixture
+ def create_rag_config_factory() -> Callable[
+     [VectorStoreConfig, EmbeddingConfig], RagConfig
+ ]:
+     def create_rag_config(
+         vector_store_config: VectorStoreConfig, embedding_config: EmbeddingConfig
+     ) -> RagConfig:
+         return RagConfig(
+             name="test_rag",
+             tool_name="test_rag_tool",
+             tool_description="A test RAG tool for vector search",
+             extractor_config_id="test_extractor",
+             chunker_config_id="test_chunker",
+             embedding_config_id=embedding_config.id,
+             vector_store_config_id=vector_store_config.id,
+         )
+
+     return create_rag_config
+
+
+ def dicts_to_indexable_docs(
+     docs: dict[str, list[dict[str, str | list[float]]]], tmp_path: Path
+ ) -> list[DocumentWithChunksAndEmbeddings]:
+     results = []
+     for doc_id, doc in docs.items():
+         chunked_documents = ChunkedDocument(
+             chunker_config_id="test_chunker",
+             chunks=[],
+             path=tmp_path / "chunked_document.kiln",
+         )
+         chunk_embeddings = ChunkEmbeddings(
+             embedding_config_id="test_embedding",
+             embeddings=[],
+             path=tmp_path / "chunk_embeddings.kiln",
+         )
+         for part in doc:
+             # Ensure vector is a list of floats
+             vector = part["vector"]
+             if isinstance(vector, list):
+                 vector = [float(x) for x in vector]
+             else:
+                 vector = [float(vector)]
+
+             chunk_embeddings.embeddings.append(Embedding(vector=vector))
+             chunked_documents.chunks.append(
+                 Chunk(
+                     content=KilnAttachmentModel.from_data(
+                         str(part["text"]),
+                         "text/plain",
+                     )
+                 )
+             )
+         results.append(
+             DocumentWithChunksAndEmbeddings(
+                 document_id=doc_id,
+                 chunked_document=chunked_documents,
+                 chunk_embeddings=chunk_embeddings,
+             )
+         )
+
+     return results
+
+
+ @pytest.fixture
+ def mock_chunked_documents(tmp_path):
+     """Create sample chunks for testing."""
+     docs: dict[str, list[dict[str, str | list[float]]]] = {
+         "doc_001": [
+             {
+                 "vector": [1.1, 1.2],
+                 "text": "The population of Tokyo, Japan is approximately 37 million people",
+             },
+             {
+                 "vector": [0.2, 1.8],
+                 "text": "New York City, USA has a population of about 8.8 million residents",
+             },
+             {
+                 "vector": [0.45452, 51.8],
+                 "text": "London, UK has a population of roughly 9 million people",
+             },
+             {
+                 "vector": [0.7, 0.8],
+                 "text": "Rio de Janeiro, Brazil has a population of about 6.7 million residents",
+             },
+         ],
+         "doc_002": [
+             {
+                 "vector": [50.0, 50.0],
+                 "text": "The area of Tokyo, Japan is approximately 2,191 square kilometers",
+             },
+             {
+                 "vector": [55.0, 55.0],
+                 "text": "The area of New York City, USA is approximately 783.8 square kilometers",
+             },
+             {
+                 "vector": [60.0, 60.0],
+                 "text": "The area of London, UK is approximately 1,572 square kilometers",
+             },
+             {
+                 "vector": [65.0, 65.0],
+                 "text": "The area of Rio de Janeiro, Brazil is approximately 1,256 square kilometers",
+             },
+         ],
+     }
+
+     return dicts_to_indexable_docs(docs, tmp_path)
+
+
+ @pytest.mark.asyncio
+ async def test_add_chunks_with_embeddings_and_similarity_search(
+     knn_vector_store_config,
+     mock_chunked_documents,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test adding chunks and similarity search."""
+
+     rag_config = create_rag_config_factory(knn_vector_store_config, embedding_config)
+
+     # Create adapter using the registry
+     adapter = await vector_store_adapter_for_config(rag_config, knn_vector_store_config)
+
+     # Add chunks to the vector store
+     await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+     # Test similarity search - search for a vector close to [55.0, 55.0] (NYC area chunk)
+     query_vector = [55.0, 55.0]
+
+     results = await adapter.search(VectorStoreQuery(query_embedding=query_vector))
+
+     # The closest should be NYC area chunk with vector [55.0, 55.0]
+     assert len(results) > 0
+     assert "New York City" in results[0].chunk_text
+     assert "783.8 square kilometers" in results[0].chunk_text
+
+
+ @pytest.mark.asyncio
+ async def test_fts_search(
+     fts_vector_store_config,
+     mock_chunked_documents,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test full-text search functionality."""
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)
+
+     await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+     assert isinstance(adapter, LanceDBAdapter)
+
+     # Test FTS search for "London"
+     query_text = "london"
+
+     results = await adapter.search(VectorStoreQuery(query_string=query_text))
+
+     # Should find both London chunks
+     assert len(results) >= 2
+     london_texts = [result.chunk_text for result in results]
+     assert any("London, UK has a population" in text for text in london_texts)
+     assert any("The area of London, UK" in text for text in london_texts)
+
+
+ @pytest.mark.asyncio
+ async def test_hybrid_search(
+     hybrid_vector_store_config,
+     mock_chunked_documents,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test hybrid search combining vector and text search."""
+     rag_config = create_rag_config_factory(hybrid_vector_store_config, embedding_config)
+
+     adapter = await vector_store_adapter_for_config(
+         rag_config, hybrid_vector_store_config
+     )
+
+     await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+     # Test hybrid search - combine text "Tokyo" with vector close to Tokyo population vector [1.1, 1.2]
+     query_text = "Tokyo"
+     query_vector = [1.1, 1.2]
+
+     results = await adapter.search(
+         VectorStoreQuery(query_string=query_text, query_embedding=query_vector)
+     )
+
+     # Should find Tokyo-related chunks, with population chunk being highly ranked
+     assert len(results) > 0
+     tokyo_results = [result for result in results if "Tokyo" in result.chunk_text]
+     assert len(tokyo_results) >= 2  # Both Tokyo chunks should be found
+
+
+ @pytest.mark.asyncio
+ async def test_upsert_behavior(
+     fts_vector_store_config,
+     mock_chunked_documents,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test that adding the same chunks multiple times works (upsert behavior)."""
+
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)
+
+     # Extract first document only
+     first_doc = [mock_chunked_documents[0]]
+
+     await adapter.add_chunks_with_embeddings(first_doc)
+
+     # Search to verify it's there
+     results1 = await adapter.search(VectorStoreQuery(query_string="Tokyo"))
+
+     # Add the same document again
+     await adapter.add_chunks_with_embeddings(first_doc)
+
+     # Search again - should still find the same chunks (not duplicated)
+     results2 = await adapter.search(VectorStoreQuery(query_string="Tokyo"))
+
+     # Should find Tokyo chunks but behavior may vary based on LanceDB implementation
+     assert len(results2) == len(results1)
+
+     # Add all documents
+     await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+     # Final search
+     results3 = await adapter.search(VectorStoreQuery(query_string="population"))
+
+     assert len(results3) > 0
+
+
+ @pytest.mark.asyncio
+ async def test_count_records_empty_store(
+     fts_vector_store_config, embedding_config, create_rag_config_factory
+ ):
+     """Test counting records in an empty vector store."""
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)
+
+     assert await adapter.count_records() == 0
+
+
+ @pytest.mark.asyncio
+ async def test_count_records_with_data(
+     fts_vector_store_config,
+     mock_chunked_documents,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test counting records after adding data."""
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)
+
+     # Add chunks first to create the table
+     await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+     # Should now have records (8 chunks total across both documents)
+     final_count = await adapter.count_records()
+     assert final_count == 8
+
+
+ @pytest.mark.asyncio
+ async def test_get_all_chunks(
+     fts_vector_store_config,
+     mock_chunked_documents,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test getting all chunks from the vector store."""
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+     # Add chunks first to create the table
+     await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+     # Get all chunks
+     all_chunks = get_all_nodes(adapter)
+     assert len(all_chunks) == 8  # 8 chunks total
+
+     # Verify structure
+     for chunk in all_chunks:
+         assert chunk.document_id in ["doc_001", "doc_002"]
+         assert len(chunk.chunk_text) > 0
+         assert chunk.similarity is None  # get_all_chunks doesn't include similarity
+
+
+ def test_format_query_result_error_conditions(
+     fts_vector_store_config, embedding_config, create_rag_config_factory
+ ):
+     """Test error handling in format_query_result method."""
+
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     # Create adapter with minimal setup
+     adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+     # Test with None ids - should return empty list instead of raising error
+     query_result = VectorStoreQueryResult(ids=None, nodes=[], similarities=[])
+     result = adapter.format_query_result(query_result)
+     assert result == []
+
+     # Test with None nodes - should return empty list instead of raising error
+     query_result = VectorStoreQueryResult(ids=[], nodes=None, similarities=[])
+     result = adapter.format_query_result(query_result)
+     assert result == []
+
+     # Test with None similarities - should return empty list instead of raising error
+     query_result = VectorStoreQueryResult(ids=[], nodes=[], similarities=None)
+     result = adapter.format_query_result(query_result)
+     assert result == []
+
+     # Test with empty lists - should return empty list (valid empty result)
+     query_result = VectorStoreQueryResult(ids=[], nodes=[], similarities=[])
+     result = adapter.format_query_result(query_result)
+     assert result == []
+
+     # Test with mismatched lengths where some arrays are empty - should raise ValueError
+     query_result = VectorStoreQueryResult(ids=["1", "2"], nodes=[], similarities=[])
+     with pytest.raises(
+         ValueError, match="ids, nodes, and similarities must have the same length"
+     ):
+         adapter.format_query_result(query_result)
+
+     # Test with mismatched lengths where all arrays are non-empty - should raise ValueError
+     from llama_index.core.schema import TextNode
+
+     node1 = TextNode(text="test1")
+     query_result = VectorStoreQueryResult(
+         ids=["1", "2"], nodes=[node1], similarities=[0.5, 0.3]
+     )
+     with pytest.raises(
+         ValueError, match="ids, nodes, and similarities must have the same length"
+     ):
+         adapter.format_query_result(query_result)
+
+
+ def test_build_kwargs_for_query_validation_errors(
+     create_rag_config_factory,
+     hybrid_vector_store_config,
+     fts_vector_store_config,
+     knn_vector_store_config,
+     embedding_config,
+ ):
+     """Test error handling in build_kwargs_for_query method."""
+
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+     # Test FTS search without query_string
+     query = VectorStoreQuery(query_string=None, query_embedding=None)
+     with pytest.raises(
+         ValueError, match="query_string must be provided for fts search"
+     ):
+         adapter.build_kwargs_for_query(query)
+
+     # Test HYBRID search without required parameters
+     adapter = LanceDBAdapter(rag_config, hybrid_vector_store_config)
+
+     query = VectorStoreQuery(query_string=None, query_embedding=[1.0, 2.0])
+     with pytest.raises(
+         ValueError,
+         match="query_string and query_embedding must be provided for hybrid search",
+     ):
+         adapter.build_kwargs_for_query(query)
+
+     query = VectorStoreQuery(query_string="test", query_embedding=None)
+     with pytest.raises(
+         ValueError,
+         match="query_string and query_embedding must be provided for hybrid search",
+     ):
+         adapter.build_kwargs_for_query(query)
+
+     # Test VECTOR search without embedding
+     adapter = LanceDBAdapter(rag_config, knn_vector_store_config)
+
+     query = VectorStoreQuery(query_string=None, query_embedding=None)
+     with pytest.raises(
+         ValueError, match="query_embedding must be provided for vector search"
+     ):
+         adapter.build_kwargs_for_query(query)
+
+
+ @pytest.mark.asyncio
+ async def test_search_with_table_not_found_error(
+     fts_vector_store_config, embedding_config, create_rag_config_factory
+ ):
+     """Test that search handles TableNotFoundError gracefully"""
+
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     # Create the adapter normally
+     adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+     # Mock the aquery method directly on the LanceDBVectorStore class
+     with patch.object(adapter.lancedb_vector_store.__class__, "aquery") as mock_aquery:
+         mock_aquery.side_effect = TableNotFoundError("Table vectors is not initialized")
+
+         # Search should return empty list instead of raising error
+         query = VectorStoreQuery(query_string="test query")
+         results = await adapter.search(query)
+
+         assert results == []
+
+
+ @pytest.mark.asyncio
+ async def test_search_with_empty_results_error(
+     fts_vector_store_config,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test that search handles 'query results are empty' error gracefully"""
+
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     # Create the adapter normally
+     adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+     # Search should return empty list instead of raising error
+     query = VectorStoreQuery(query_string="test query")
+     results = await adapter.search(query)
+
+     assert results == []
+
+
+ @pytest.mark.asyncio
+ async def test_destroy(
+     fts_vector_store_config,
+     mock_chunked_documents,
+     embedding_config,
+     create_rag_config_factory,
+ ):
+     """Test the destroy method removes the database directory."""
+     rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+
+     adapter = LanceDBAdapter(
+         rag_config,
+         fts_vector_store_config,
+     )
+
+     # Add some data to create the database
+     await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+     # Verify data exists
+     count = await adapter.count_records()
+     assert count == 8
+
+     # Get the database path
+     db_path = LanceDBAdapter.lancedb_path_for_config(rag_config)
+     assert os.path.exists(db_path)
+
+     # Destroy the database
+     await adapter.destroy()
+
+     # Verify the database directory is gone
+     assert not os.path.exists(db_path)
+
+
573
+ def test_lancedb_path_for_config():
574
+ """Test the lancedb_path_for_config static method."""
575
+ # Test with valid rag_config
576
+ rag_config = RagConfig(
577
+ name="test_rag",
578
+ tool_name="test_rag_tool",
579
+ tool_description="A test RAG tool for path testing",
580
+ extractor_config_id="test_extractor",
581
+ chunker_config_id="test_chunker",
582
+ embedding_config_id="test_embedding",
583
+ vector_store_config_id="test_vector_store",
584
+ )
585
+
586
+ expected_path = str(
587
+ Path(Config.settings_dir()) / "rag_indexes" / "lancedb" / str(rag_config.id)
588
+ )
589
+ actual_path = LanceDBAdapter.lancedb_path_for_config(rag_config)
590
+
591
+ assert actual_path == expected_path
592
+
593
+ # Test with rag_config with no ID (should raise ValueError)
594
+ rag_config_no_id = RagConfig(
595
+ name="test_rag",
596
+ tool_name="test_rag_tool",
597
+ tool_description="A test RAG tool with no ID",
598
+ extractor_config_id="test_extractor",
599
+ chunker_config_id="test_chunker",
600
+ embedding_config_id="test_embedding",
601
+ vector_store_config_id="test_vector_store",
602
+ )
603
+ rag_config_no_id.id = None
604
+
605
+ with pytest.raises(ValueError, match="Vector store config ID is required"):
606
+ LanceDBAdapter.lancedb_path_for_config(rag_config_no_id)
607
+
608
+
609
+ def test_query_type_property(
610
+ embedding_config,
611
+ create_rag_config_factory,
612
+ ):
613
+ """Test the query_type property returns correct values for different store types."""
614
+
615
+ # Test FTS query type
616
+ fts_config = VectorStoreConfig(
617
+ name="fts_test",
618
+ store_type=VectorStoreType.LANCE_DB_FTS,
619
+ properties={
620
+ "similarity_top_k": 10,
621
+ "overfetch_factor": 10,
622
+ "vector_column_name": "vector",
623
+ "text_key": "text",
624
+ "doc_id_key": "doc_id",
625
+ },
626
+ )
627
+ rag_config = create_rag_config_factory(fts_config, embedding_config)
628
+
629
+ adapter = LanceDBAdapter(rag_config, fts_config)
630
+ assert adapter.query_type == "fts"
631
+
632
+ # Test Hybrid query type
633
+ hybrid_config = VectorStoreConfig(
634
+ name="hybrid_test",
635
+ store_type=VectorStoreType.LANCE_DB_HYBRID,
636
+ properties={
637
+ "similarity_top_k": 10,
638
+ "nprobes": 10,
639
+ "overfetch_factor": 10,
640
+ "vector_column_name": "vector",
641
+ "text_key": "text",
642
+ "doc_id_key": "doc_id",
643
+ },
644
+ )
645
+ rag_config = create_rag_config_factory(hybrid_config, embedding_config)
646
+
647
+ adapter = LanceDBAdapter(rag_config, hybrid_config)
648
+ assert adapter.query_type == "hybrid"
649
+
650
+ # Test Vector query type
651
+ vector_config = VectorStoreConfig(
652
+ name="vector_test",
653
+ store_type=VectorStoreType.LANCE_DB_VECTOR,
654
+ properties={
655
+ "similarity_top_k": 10,
656
+ "nprobes": 10,
657
+ "overfetch_factor": 10,
658
+ "vector_column_name": "vector",
659
+ "text_key": "text",
660
+ "doc_id_key": "doc_id",
661
+ },
662
+ )
663
+ rag_config = create_rag_config_factory(vector_config, embedding_config)
664
+
665
+ adapter = LanceDBAdapter(rag_config, vector_config)
666
+ assert adapter.query_type == "vector"
667
+
668
+
669
+ @pytest.mark.asyncio
670
+ async def test_adapter_reuse_preserves_data(
671
+ fts_vector_store_config,
672
+ mock_chunked_documents,
673
+ embedding_config,
674
+ create_rag_config_factory,
675
+ ):
676
+ """Test that creating the same LanceDBAdapter twice doesn't destroy/empty the db."""
677
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
678
+
679
+ # Create first adapter and add data
680
+ adapter1 = LanceDBAdapter(rag_config, fts_vector_store_config)
681
+ await adapter1.add_chunks_with_embeddings([mock_chunked_documents[0]])
682
+
683
+ # Verify data exists
684
+ count1 = await adapter1.count_records()
685
+ assert count1 == 4
686
+
687
+ # Create second adapter with same config
688
+ adapter2 = LanceDBAdapter(rag_config, fts_vector_store_config)
689
+ await adapter2.add_chunks_with_embeddings([mock_chunked_documents[1]])
690
+
691
+ # Verify data still exists and wasn't destroyed by second instantiation
692
+ count2 = await adapter2.count_records()
693
+ assert count2 == 8
694
+
695
+ # interesting: adapter1 is no longer usable after creating adapter2
696
+ # with pytest.raises(
697
+ # Exception,
698
+ # match="lance error: Retryable commit conflict for version 4: This CreateIndex transaction was preempted by concurrent transaction Rewrite at version 4. Please retry.",
699
+ # ):
700
+ await adapter1.search(VectorStoreQuery(query_string="Tokyo"))
701
+
702
+ # but we can query adapter2
703
+ results2 = await adapter2.search(VectorStoreQuery(query_string="Tokyo"))
704
+ assert len(results2) > 0
705
+
706
+
707
+ @pytest.mark.asyncio
708
+ async def test_skip_existing_chunks_when_count_matches(
709
+ fts_vector_store_config,
710
+ mock_chunked_documents,
711
+ embedding_config,
712
+ create_rag_config_factory,
713
+ ):
714
+ """Test that chunks already in DB are skipped when they match incoming chunks count."""
715
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
716
+
717
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
718
+
719
+ # Add first document
720
+ first_doc = [mock_chunked_documents[0]] # doc_001 with 4 chunks
721
+ await adapter.add_chunks_with_embeddings(first_doc)
722
+
723
+ # Verify it was added
724
+ count_after_first = await adapter.count_records()
725
+ assert count_after_first == 4
726
+
727
+ # Try to add the same document again - should be skipped
728
+ await adapter.add_chunks_with_embeddings(first_doc)
729
+
730
+ # Count should remain the same (chunks were skipped)
731
+ count_after_second = await adapter.count_records()
732
+ assert count_after_second == 4
733
+
734
+ # Verify the chunks are still there and retrievable
735
+ results = await adapter.search(VectorStoreQuery(query_string="Tokyo"))
736
+ assert len(results) > 0
737
+ assert "Tokyo" in results[0].chunk_text
738
+
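
For clarity, the skip-or-reindex behavior exercised above (and in the chunk-replacement tests further down) can be read as roughly the following sketch. This is illustrative only, not code from this release: should_reindex is a hypothetical helper, and the adapter's actual logic may differ.

    async def should_reindex(adapter, document_id: str, incoming_chunk_count: int) -> bool:
        # IDs are deterministic, so we can ask the store for exactly the chunks about to be inserted.
        expected_ids = [
            adapter.compute_deterministic_chunk_id(document_id, i)
            for i in range(incoming_chunk_count)
        ]
        existing = await adapter.get_nodes_by_ids(expected_ids)
        if len(existing) == incoming_chunk_count:
            return False  # the same chunks are already indexed: skip the re-insert
        await adapter.delete_nodes_by_document_id(document_id)  # drop stale rows first
        return True  # caller re-inserts the new chunks
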
739
+
740
+ @pytest.mark.asyncio
741
+ async def test_batching_functionality(
742
+ fts_vector_store_config,
743
+ embedding_config,
744
+ create_rag_config_factory,
745
+ tmp_path,
746
+ ):
747
+ """Test basic batching functionality in add_chunks_with_embeddings."""
748
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
749
+
750
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
751
+
752
+ # Create a document with many chunks to test batching
753
+ large_doc_data = {
754
+ "large_doc": [
755
+ {"vector": [i * 0.1, i * 0.2], "text": f"Chunk {i} content"}
756
+ for i in range(15) # 15 chunks to test batching
757
+ ]
758
+ }
759
+
760
+ large_doc_records = dicts_to_indexable_docs(large_doc_data, tmp_path)
761
+
762
+ # Track batch sizes by patching the insert method
763
+ batch_sizes = []
764
+
765
+ async def mock_async_add(self, nodes, **kwargs):
766
+ batch_sizes.append(len(nodes))
767
+ return self.add(nodes, **kwargs)
768
+
769
+ # Patch the async_add method at the class level
770
+ with patch.object(
771
+ adapter.lancedb_vector_store.__class__, "async_add", mock_async_add
772
+ ):
773
+ # Add with small batch size to force batching
774
+ await adapter.add_chunks_with_embeddings(large_doc_records, nodes_batch_size=5)
775
+
776
+ # Verify batching behavior
777
+ # With 15 chunks and batch_size=5, we expect 3 batches of 5 chunks each
778
+ expected_batch_sizes = [5, 5, 5]
779
+ assert batch_sizes == expected_batch_sizes, (
780
+ f"Expected batch sizes {expected_batch_sizes}, got {batch_sizes}"
781
+ )
782
+
783
+ # Verify all chunks were added
784
+ count = await adapter.count_records()
785
+ assert count == 15
786
+
787
+ # Verify we can search and find chunks
788
+ results = await adapter.search(VectorStoreQuery(query_string="Chunk"))
789
+ assert len(results) > 0 # Should find chunks containing "Chunk"
790
+ assert len(results) <= 15 # Should not exceed total number of chunks
791
+
792
+
793
+ @pytest.mark.asyncio
794
+ async def test_batching_functionality_with_remainder(
795
+ fts_vector_store_config,
796
+ embedding_config,
797
+ create_rag_config_factory,
798
+ tmp_path,
799
+ ):
800
+ """Test batching functionality with a remainder (not evenly divisible)."""
801
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
802
+
803
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
804
+
805
+ # Create a document with 17 chunks to test batching with remainder
806
+ large_doc_data = {
807
+ "large_doc": [
808
+ {"vector": [i * 0.1, i * 0.2], "text": f"Chunk {i} content"}
809
+ for i in range(17) # 17 chunks to test batching with remainder
810
+ ]
811
+ }
812
+
813
+ large_doc_records = dicts_to_indexable_docs(large_doc_data, tmp_path)
814
+
815
+ # Track batch sizes by patching the insert method
816
+ batch_sizes = []
817
+
818
+ async def mock_async_add(self, nodes, **kwargs):
819
+ batch_sizes.append(len(nodes))
820
+ return self.add(nodes, **kwargs)
821
+
822
+ # Patch the async_add method at the class level
823
+ with patch.object(
824
+ adapter.lancedb_vector_store.__class__, "async_add", mock_async_add
825
+ ):
826
+ # Add with batch_size=7 to get 2 full batches + 1 remainder batch
827
+ await adapter.add_chunks_with_embeddings(large_doc_records, nodes_batch_size=7)
828
+
829
+ # Verify batching behavior
830
+ # With 17 chunks and batch_size=7, we expect 2 batches of 7 and 1 batch of 3
831
+ expected_batch_sizes = [7, 7, 3]
832
+ assert batch_sizes == expected_batch_sizes, (
833
+ f"Expected batch sizes {expected_batch_sizes}, got {batch_sizes}"
834
+ )
835
+
836
+ # Verify all chunks were added
837
+ count = await adapter.count_records()
838
+ assert count == 17
839
+
840
+
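
The batch sizes asserted in these batching tests follow from plain slicing of the node list. A minimal sketch of that splitting, for illustration only (not the adapter's actual implementation):

    def iter_batches(nodes: list, batch_size: int):
        # 15 nodes with batch_size=5 -> batches of 5, 5, 5; 17 nodes with batch_size=7 -> 7, 7, 3.
        for start in range(0, len(nodes), batch_size):
            yield nodes[start : start + batch_size]
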
841
+ @pytest.mark.asyncio
842
+ async def test_batching_functionality_edge_cases(
843
+ fts_vector_store_config,
844
+ embedding_config,
845
+ create_rag_config_factory,
846
+ tmp_path,
847
+ ):
848
+ """Test batching functionality edge cases (small batches, single batch)."""
849
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
850
+
851
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
852
+
853
+ # Test 1: Single batch (3 chunks with batch_size=10)
854
+ small_doc_data = {
855
+ "small_doc": [
856
+ {"vector": [i * 0.1, i * 0.2], "text": f"Small chunk {i} content"}
857
+ for i in range(3)
858
+ ]
859
+ }
860
+
861
+ small_doc_records = dicts_to_indexable_docs(small_doc_data, tmp_path)
862
+
863
+ # Track batch sizes by patching the insert method
864
+ batch_sizes = []
865
+
866
+ async def mock_async_add(self, nodes, **kwargs):
867
+ batch_sizes.append(len(nodes))
868
+ return self.add(nodes, **kwargs)
869
+
870
+ # Test single batch scenario
871
+ with patch.object(
872
+ adapter.lancedb_vector_store.__class__, "async_add", mock_async_add
873
+ ):
874
+ await adapter.add_chunks_with_embeddings(small_doc_records, nodes_batch_size=10)
875
+
876
+ # With 3 chunks and batch_size=10, we expect 1 batch of 3 chunks
877
+ expected_batch_sizes = [3]
878
+ assert batch_sizes == expected_batch_sizes, (
879
+ f"Expected batch sizes {expected_batch_sizes}, got {batch_sizes}"
880
+ )
881
+
882
+ # Verify all chunks were added
883
+ count = await adapter.count_records()
884
+ assert count == 3
885
+
886
+ # Test 2: Very small batches (batch_size=1)
887
+ batch_sizes.clear() # Reset for next test
888
+
889
+ # Create new rag_config to get a fresh database
890
+ rag_config2 = create_rag_config_factory(fts_vector_store_config, embedding_config)
891
+ adapter2 = LanceDBAdapter(rag_config2, fts_vector_store_config)
892
+
893
+ with patch.object(
894
+ adapter2.lancedb_vector_store.__class__, "async_add", mock_async_add
895
+ ):
896
+ await adapter2.add_chunks_with_embeddings(small_doc_records, nodes_batch_size=1)
897
+
898
+ # With 3 chunks and batch_size=1, we expect 3 batches of 1 chunk each
899
+ expected_batch_sizes = [1, 1, 1]
900
+ assert batch_sizes == expected_batch_sizes, (
901
+ f"Expected batch sizes {expected_batch_sizes}, got {batch_sizes}"
902
+ )
903
+
904
+
905
+ @pytest.mark.asyncio
906
+ async def test_get_nodes_by_ids_functionality(
907
+ fts_vector_store_config,
908
+ mock_chunked_documents,
909
+ embedding_config,
910
+ create_rag_config_factory,
911
+ tmp_path,
912
+ ):
913
+ """Test get_nodes_by_ids method functionality."""
914
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
915
+
916
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
917
+
918
+ # before inserting data, we should simply return an empty list
919
+ retrieved_nodes_before_any_insert = await adapter.get_nodes_by_ids(
920
+ [str(uuid.uuid4()), str(uuid.uuid4())]
921
+ )
922
+ assert len(retrieved_nodes_before_any_insert) == 0
923
+
924
+ # Add some data
925
+ await adapter.add_chunks_with_embeddings([mock_chunked_documents[0]]) # doc_001
926
+
927
+ # Test getting nodes by IDs - compute expected IDs
928
+ expected_ids = [
929
+ adapter.compute_deterministic_chunk_id("doc_001", i) for i in range(4)
930
+ ]
931
+
932
+ # Get nodes by IDs
933
+ retrieved_nodes = await adapter.get_nodes_by_ids(expected_ids)
934
+
935
+ # Should retrieve all 4 nodes
936
+ assert len(retrieved_nodes) == 4
937
+
938
+ # Verify node properties
939
+ for i, node in enumerate(retrieved_nodes):
940
+ assert node.id_ == expected_ids[i]
941
+ assert node.metadata["kiln_doc_id"] == "doc_001"
942
+ assert node.metadata["kiln_chunk_idx"] == i
943
+ assert len(node.get_content()) > 0
944
+
945
+ # Test with non-existent IDs
946
+ fake_ids = [adapter.compute_deterministic_chunk_id("fake_doc", i) for i in range(2)]
947
+ retrieved_fake = await adapter.get_nodes_by_ids(fake_ids)
948
+ assert len(retrieved_fake) == 0
949
+
950
+ # Test with empty table (no table exists yet)
951
+ empty_rag_config = create_rag_config_factory(
952
+ fts_vector_store_config, embedding_config
953
+ )
954
+ empty_adapter = LanceDBAdapter(empty_rag_config, fts_vector_store_config)
955
+ empty_result = await empty_adapter.get_nodes_by_ids(expected_ids)
956
+ assert len(empty_result) == 0
957
+
958
+
959
+ @pytest.mark.asyncio
960
+ async def test_delete_nodes_by_document_id(
961
+ fts_vector_store_config,
962
+ mock_chunked_documents,
963
+ embedding_config,
964
+ create_rag_config_factory,
965
+ ):
966
+ """Test delete_nodes_by_document_id method."""
967
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
968
+
969
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
970
+
971
+ # Add both documents
972
+ await adapter.add_chunks_with_embeddings(mock_chunked_documents)
973
+
974
+ # Verify both documents are there
975
+ count_before = await adapter.count_records()
976
+ assert count_before == 8 # 4 chunks per document
977
+
978
+ # Delete nodes for doc_001
979
+ await adapter.delete_nodes_by_document_id("doc_001")
980
+
981
+ # Verify doc_001 chunks are gone
982
+ count_after = await adapter.count_records()
983
+ assert count_after == 4 # Only doc_002 chunks remain
984
+
985
+ # Verify we can still find doc_002 chunks but not doc_001
986
+ results_doc2 = await adapter.search(VectorStoreQuery(query_string="area"))
987
+ assert len(results_doc2) > 0
988
+
989
+ # Try to search for population (which was in doc_001) - should find no results
990
+ # LanceDB raises a Warning when no results are found, so we catch it
991
+ try:
992
+ results_doc1 = await adapter.search(VectorStoreQuery(query_string="population"))
993
+ assert len(results_doc1) == 0
994
+ except Warning as w:
995
+ # This is expected - LanceDB raises a Warning for empty results
996
+ assert "query results are empty" in str(w)
997
+
998
+ # Try to delete non-existent document (should not error)
999
+ await adapter.delete_nodes_by_document_id("non_existent_doc")
1000
+ final_count = await adapter.count_records()
1001
+ assert final_count == 4 # Count unchanged
1002
+
1003
+
1004
+ @pytest.mark.asyncio
1005
+ async def test_uuid_scheme_retrieval_and_node_properties(
1006
+ fts_vector_store_config,
1007
+ mock_chunked_documents,
1008
+ embedding_config,
1009
+ create_rag_config_factory,
1010
+ ):
1011
+ """Test UUID scheme retrieval and that inserted nodes have correct ID and ref_doc_id."""
1012
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
1013
+
1014
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
1015
+
1016
+ # Add first document
1017
+ await adapter.add_chunks_with_embeddings([mock_chunked_documents[0]]) # doc_001
1018
+
1019
+ # Test the UUID scheme: document_id::chunk_idx
1020
+ for chunk_idx in range(4):
1021
+ # Compute expected ID using the same scheme as the adapter
1022
+ expected_id = adapter.compute_deterministic_chunk_id("doc_001", chunk_idx)
1023
+
1024
+ # Retrieve the specific node by ID
1025
+ retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
1026
+ assert len(retrieved_nodes) == 1
1027
+
1028
+ node = retrieved_nodes[0]
1029
+
1030
+ # Test that inserted nodes have the expected ID we set
1031
+ assert node.id_ == expected_id
1032
+
1033
+ # Test that inserted nodes have ref_doc_id set correctly
1034
+ # The ref_doc_id should be set through the SOURCE relationship
1035
+ source_relationship = node.relationships.get(NodeRelationship.SOURCE)
1036
+ assert source_relationship is not None
1037
+ # Handle both single RelatedNodeInfo and list of RelatedNodeInfo
1038
+ if isinstance(source_relationship, list):
1039
+ assert len(source_relationship) > 0
1040
+ assert source_relationship[0].node_id == "doc_001"
1041
+ else:
1042
+ assert source_relationship.node_id == "doc_001"
1043
+
1044
+ # Verify other node properties
1045
+ assert node.metadata["kiln_doc_id"] == "doc_001"
1046
+ assert node.metadata["kiln_chunk_idx"] == chunk_idx
1047
+ assert len(node.get_content()) > 0
1048
+ assert node.embedding is not None
1049
+ assert len(node.embedding) == 2 # Our test embeddings are 2D
1050
+
1051
+ # Test with a different document to ensure the scheme works consistently
1052
+ await adapter.add_chunks_with_embeddings([mock_chunked_documents[1]]) # doc_002
1053
+
1054
+ # Test retrieval of doc_002 chunks
1055
+ for chunk_idx in range(4):
1056
+ expected_id = adapter.compute_deterministic_chunk_id("doc_002", chunk_idx)
1057
+ retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
1058
+ assert len(retrieved_nodes) == 1
1059
+
1060
+ node = retrieved_nodes[0]
1061
+ assert node.id_ == expected_id
1062
+ assert node.metadata["kiln_doc_id"] == "doc_002"
1063
+ assert node.metadata["kiln_chunk_idx"] == chunk_idx
1064
+
1065
+ # Check ref_doc_id relationship
1066
+ source_relationship = node.relationships.get(NodeRelationship.SOURCE)
1067
+ assert source_relationship is not None
1068
+ # Handle both single RelatedNodeInfo and list of RelatedNodeInfo
1069
+ if isinstance(source_relationship, list):
1070
+ assert len(source_relationship) > 0
1071
+ assert source_relationship[0].node_id == "doc_002"
1072
+ else:
1073
+ assert source_relationship.node_id == "doc_002"
1074
+
1075
+
1076
+ @pytest.mark.asyncio
1077
+ async def test_deterministic_chunk_id_consistency(
1078
+ fts_vector_store_config,
1079
+ embedding_config,
1080
+ create_rag_config_factory,
1081
+ ):
1082
+ """Test that the deterministic chunk ID generation is consistent."""
1083
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
1084
+
1085
+ adapter = LanceDBAdapter(
1086
+ rag_config,
1087
+ fts_vector_store_config,
1088
+ )
1089
+
1090
+ # Test that the same document_id and chunk_idx always produce the same UUID
1091
+ doc_id = "test_doc_123"
1092
+ chunk_idx = 5
1093
+
1094
+ id1 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
1095
+ id2 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
1096
+
1097
+ assert id1 == id2
1098
+
1099
+ # Test that different inputs produce different UUIDs
1100
+ id3 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx + 1)
1101
+ id4 = adapter.compute_deterministic_chunk_id(doc_id + "_different", chunk_idx)
1102
+
1103
+ assert id1 != id3
1104
+ assert id1 != id4
1105
+ assert id3 != id4
1106
+
1107
+ # Verify the format is a valid UUID string
1108
+ import uuid
1109
+
1110
+ try:
1111
+ uuid.UUID(id1) # Should not raise an exception
1112
+ uuid.UUID(id3)
1113
+ uuid.UUID(id4)
1114
+ except ValueError:
1115
+ pytest.fail("Generated IDs are not valid UUIDs")
1116
+
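
One way to obtain IDs with the properties asserted above (stable for the same inputs, distinct for different inputs, valid UUID strings) is a UUIDv5 over the document id and chunk index. The sketch below assumes that scheme; the namespace and separator actually used by LanceDBAdapter may differ.

    import uuid

    CHUNK_ID_NAMESPACE = uuid.uuid5(uuid.NAMESPACE_DNS, "kiln.chunk")  # assumed namespace

    def deterministic_chunk_id(document_id: str, chunk_idx: int) -> str:
        # The same (document_id, chunk_idx) pair always maps to the same UUID string.
        return str(uuid.uuid5(CHUNK_ID_NAMESPACE, f"{document_id}::{chunk_idx}"))
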
1117
+
1118
+ @pytest.mark.asyncio
1119
+ async def test_chunk_replacement_triggers_deletion(
1120
+ fts_vector_store_config,
1121
+ embedding_config,
1122
+ create_rag_config_factory,
1123
+ tmp_path,
1124
+ ):
1125
+ """Test that adding different chunks for the same document triggers deletion of old chunks."""
1126
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
1127
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
1128
+
1129
+ # Create initial document with 2 chunks
1130
+ initial_doc_data = {
1131
+ "test_doc": [
1132
+ {"vector": [1.0, 1.0], "text": "Initial chunk 1"},
1133
+ {"vector": [2.0, 2.0], "text": "Initial chunk 2"},
1134
+ ]
1135
+ }
1136
+ initial_records = dicts_to_indexable_docs(initial_doc_data, tmp_path)
1137
+
1138
+ # Add initial chunks
1139
+ await adapter.add_chunks_with_embeddings(initial_records)
1140
+
1141
+ # Verify initial chunks are there
1142
+ initial_count = await adapter.count_records()
1143
+ assert initial_count == 2
1144
+
1145
+ # Create modified document with 3 different chunks (more chunks than original)
1146
+ # This will trigger deletion because len(chunk_ids_in_database) != chunk_count_for_document (2 != 3)
1147
+ modified_doc_data = {
1148
+ "test_doc": [
1149
+ {"vector": [10.0, 10.0], "text": "Modified chunk 1"},
1150
+ {"vector": [20.0, 20.0], "text": "Modified chunk 2"},
1151
+ {"vector": [30.0, 30.0], "text": "Modified chunk 3"},
1152
+ ]
1153
+ }
1154
+ modified_records = dicts_to_indexable_docs(modified_doc_data, tmp_path)
1155
+
1156
+ # Mock the delete_nodes_by_document_id method to verify it gets called
1157
+ delete_called = []
1158
+ original_delete = adapter.delete_nodes_by_document_id
1159
+
1160
+ async def mock_delete(document_id: str):
1161
+ delete_called.append(document_id)
1162
+ return await original_delete(document_id)
1163
+
1164
+ adapter.delete_nodes_by_document_id = mock_delete
1165
+
1166
+ # Add modified chunks - this should trigger deletion of old chunks
1167
+ await adapter.add_chunks_with_embeddings(modified_records)
1168
+
1169
+ # Verify delete was called for the document
1170
+ assert "test_doc" in delete_called
1171
+
1172
+ # Verify final count is correct (only the 3 new chunks remain)
1173
+ final_count = await adapter.count_records()
1174
+ assert final_count == 3
1175
+
1176
+ # Verify the chunks are the new ones, not the old ones
1177
+ results = await adapter.search(VectorStoreQuery(query_string="Modified"))
1178
+ assert len(results) == 3
1179
+ assert all("Modified" in result.chunk_text for result in results)
1180
+
1181
+ # Verify old chunks are gone - LanceDB raises a Warning for empty results
1182
+ try:
1183
+ old_results = await adapter.search(VectorStoreQuery(query_string="Initial"))
1184
+ assert len(old_results) == 0
1185
+ except Warning as w:
1186
+ # This is expected - LanceDB raises a Warning for empty results
1187
+ assert "query results are empty" in str(w)
1188
+
1189
+
1190
+ @pytest.mark.asyncio
1191
+ async def test_chunk_deletion_ensures_complete_cleanup_and_other_docs_unaffected(
1192
+ fts_vector_store_config,
1193
+ embedding_config,
1194
+ create_rag_config_factory,
1195
+ tmp_path,
1196
+ ):
1197
+ """Test that deletion completely cleans up all old chunks and other documents are unaffected."""
1198
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
1199
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
1200
+
1201
+ # Create initial document with 5 chunks
1202
+ initial_doc_data = {
1203
+ "target_doc": [
1204
+ {"vector": [1.0, 1.0], "text": "Original chunk 1"},
1205
+ {"vector": [2.0, 2.0], "text": "Original chunk 2"},
1206
+ {"vector": [3.0, 3.0], "text": "Original chunk 3"},
1207
+ {"vector": [4.0, 4.0], "text": "Original chunk 4"},
1208
+ {"vector": [5.0, 5.0], "text": "Original chunk 5"},
1209
+ ]
1210
+ }
1211
+ initial_records = dicts_to_indexable_docs(initial_doc_data, tmp_path)
1212
+
1213
+ # Create another document that should remain unaffected
1214
+ other_doc_data = {
1215
+ "other_doc": [
1216
+ {"vector": [10.0, 10.0], "text": "Other doc chunk 1"},
1217
+ {"vector": [20.0, 20.0], "text": "Other doc chunk 2"},
1218
+ {"vector": [30.0, 30.0], "text": "Other doc chunk 3"},
1219
+ ]
1220
+ }
1221
+ other_records = dicts_to_indexable_docs(other_doc_data, tmp_path)
1222
+
1223
+ # Add both documents
1224
+ await adapter.add_chunks_with_embeddings(initial_records)
1225
+ await adapter.add_chunks_with_embeddings(other_records)
1226
+
1227
+ # Verify both documents are there (5 + 3 = 8 chunks)
1228
+ initial_count = await adapter.count_records()
1229
+ assert initial_count == 8
1230
+
1231
+ # Verify we can find chunks from both documents
1232
+ target_results = await adapter.search(VectorStoreQuery(query_string="Original"))
1233
+ assert len(target_results) == 5
1234
+
1235
+ other_results = await adapter.search(VectorStoreQuery(query_string="Other"))
1236
+ assert len(other_results) == 3
1237
+
1238
+ # Create modified target document with 7 chunks (more than the original 5)
1239
+ # This will trigger deletion because len(chunk_ids_in_database) != chunk_count_for_document (5 != 7)
1240
+ # After deletion, we'll have 7 new chunks, demonstrating that the old 5 chunks were completely removed
1241
+ modified_doc_data = {
1242
+ "target_doc": [
1243
+ {"vector": [100.0, 100.0], "text": "New target chunk 1"},
1244
+ {"vector": [200.0, 200.0], "text": "New target chunk 2"},
1245
+ {"vector": [300.0, 300.0], "text": "New target chunk 3"},
1246
+ {"vector": [400.0, 400.0], "text": "New target chunk 4"},
1247
+ {"vector": [500.0, 500.0], "text": "New target chunk 5"},
1248
+ {"vector": [600.0, 600.0], "text": "New target chunk 6"},
1249
+ {"vector": [700.0, 700.0], "text": "New target chunk 7"},
1250
+ ]
1251
+ }
1252
+ modified_records = dicts_to_indexable_docs(modified_doc_data, tmp_path)
1253
+
1254
+ # Mock the delete_nodes_by_document_id method to verify it gets called
1255
+ delete_called = []
1256
+ original_delete = adapter.delete_nodes_by_document_id
1257
+
1258
+ async def mock_delete(document_id: str):
1259
+ delete_called.append(document_id)
1260
+ return await original_delete(document_id)
1261
+
1262
+ adapter.delete_nodes_by_document_id = mock_delete
1263
+
1264
+ # Add modified chunks - this should trigger deletion of old target_doc chunks only
1265
+ await adapter.add_chunks_with_embeddings(modified_records)
1266
+
1267
+ # Verify delete was called for the target document only
1268
+ assert "target_doc" in delete_called
1269
+ assert "other_doc" not in delete_called
1270
+
1271
+ # Verify final count: 7 new target chunks + 3 other chunks = 10 total
1272
+ final_count = await adapter.count_records()
1273
+ assert final_count == 10
1274
+
1275
+ # Verify the target document now has the new chunks
1276
+ new_target_results = await adapter.search(
1277
+ VectorStoreQuery(query_string="New target")
1278
+ )
1279
+ assert len(new_target_results) == 7
1280
+ assert all("New target" in result.chunk_text for result in new_target_results)
1281
+
1282
+ # Verify old target chunks are completely gone
1283
+ try:
1284
+ old_target_results = await adapter.search(
1285
+ VectorStoreQuery(query_string="Original")
1286
+ )
1287
+ # Should find no results since "Original" was only in the old chunks
1288
+ assert len(old_target_results) == 0
1289
+ except Warning as w:
1290
+ # This is expected - LanceDB raises a Warning for empty results
1291
+ assert "query results are empty" in str(w)
1292
+
1293
+ # Verify other document is completely unaffected
1294
+ final_other_results = await adapter.search(VectorStoreQuery(query_string="Other"))
1295
+ assert len(final_other_results) == 3
1296
+ assert all("Other doc" in result.chunk_text for result in final_other_results)
1297
+
1298
+ # Verify all other document chunks still have the same content
1299
+ other_texts = [result.chunk_text for result in final_other_results]
1300
+ expected_other_texts = [
1301
+ "Other doc chunk 1",
1302
+ "Other doc chunk 2",
1303
+ "Other doc chunk 3",
1304
+ ]
1305
+ for expected_text in expected_other_texts:
1306
+ assert any(expected_text in text for text in other_texts)
1307
+
1308
+
1309
+ @pytest.mark.asyncio
+ async def test_delete_nodes_by_document_id_direct(
+ fts_vector_store_config,
+ embedding_config,
+ create_rag_config_factory,
+ tmp_path,
+ ):
+ """Test delete_nodes_by_document_id method directly."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Create two documents with multiple chunks each
+ doc1_data = {
+ "document_1": [
+ {"vector": [1.0, 1.0], "text": "Alpha content part 1"},
+ {"vector": [2.0, 2.0], "text": "Alpha content part 2"},
+ {"vector": [3.0, 3.0], "text": "Alpha content part 3"},
+ ]
+ }
+ doc1_records = dicts_to_indexable_docs(doc1_data, tmp_path)
+
+ doc2_data = {
+ "document_2": [
+ {"vector": [10.0, 10.0], "text": "Beta content section 1"},
+ {"vector": [20.0, 20.0], "text": "Beta content section 2"},
+ ]
+ }
+ doc2_records = dicts_to_indexable_docs(doc2_data, tmp_path)
+
+ # Add both documents
+ await adapter.add_chunks_with_embeddings(doc1_records)
+ await adapter.add_chunks_with_embeddings(doc2_records)
+
+ # Verify both documents are in the database (3 + 2 = 5 chunks)
+ initial_count = await adapter.count_records()
+ assert initial_count == 5
+
+ # Verify we can find chunks from both documents
+ doc1_results = await adapter.search(VectorStoreQuery(query_string="Alpha"))
+ assert len(doc1_results) == 3
+
+ doc2_results = await adapter.search(VectorStoreQuery(query_string="Beta"))
+ assert len(doc2_results) == 2
+
+ # Test deleting document_1 chunks using delete_nodes_by_document_id
+ await adapter.delete_nodes_by_document_id("document_1")
+
+ # Verify document_1 chunks are gone
+ count_after_delete = await adapter.count_records()
+ assert count_after_delete == 2 # Only document_2 chunks remain
+
+ # Verify document_1 chunks are no longer searchable
+ try:
+ doc1_results_after = await adapter.search(
+ VectorStoreQuery(query_string="Alpha")
+ )
+ assert len(doc1_results_after) == 0
+ except Warning as w:
+ # LanceDB raises a Warning for empty results
+ assert "query results are empty" in str(w)
+
+ # Verify document_2 chunks are still there and unaffected
+ doc2_results_after = await adapter.search(VectorStoreQuery(query_string="Beta"))
+ assert len(doc2_results_after) == 2
+ assert all("Beta" in result.chunk_text for result in doc2_results_after)
+
+ # Test deleting the remaining document
+ await adapter.delete_nodes_by_document_id("document_2")
+
+ # Verify all chunks are gone
+ final_count = await adapter.count_records()
+ assert final_count == 0
+
+ # Test deleting from non-existent document (should not error)
+ await adapter.delete_nodes_by_document_id("non_existent_document")
+
+ # Count should still be 0
+ count_after_non_existent = await adapter.count_records()
+ assert count_after_non_existent == 0
+
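
A short sketch of the direct-deletion API the test above exercises end to end (illustrative only; assumes an adapter built and populated as in these tests):

    # Inside an async context, with `adapter` set up as in the tests above.
    await adapter.delete_nodes_by_document_id("document_1")   # removes every chunk for that document id
    remaining = await adapter.count_records()                 # chunks from other documents are unaffected
    await adapter.delete_nodes_by_document_id("missing_doc")  # unknown ids are a no-op, not an error
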
+
+ @pytest.mark.asyncio
+ async def test_delete_nodes_by_document_id_empty_table(
+ fts_vector_store_config,
+ embedding_config,
+ create_rag_config_factory,
+ ):
+ """Test delete_nodes_by_document_id on empty/non-existent table."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Test deleting from empty table (should not error due to TableNotFoundError handling)
+ await adapter.delete_nodes_by_document_id("some_document_id")
+
+ # Verify count is still 0
+ count = await adapter.count_records()
+ assert count == 0
+
+
+ def generate_benchmark_data(
+ doc_count: int,
+ chunks_per_doc: int,
+ vector_size: int,
+ word_count: int,
+ tmp_path: Path,
+ ) -> list[DocumentWithChunksAndEmbeddings]:
+ """Generate random data for benchmarking."""
+
+ def generate_word_pool(target_size: int) -> list[str]:
+ """Generate a pool of random words using common prefixes, roots, and suffixes."""
+ prefixes = [
+ "pre",
+ "un",
+ "re",
+ "in",
+ "dis",
+ "en",
+ "non",
+ "over",
+ "mis",
+ "sub",
+ "inter",
+ "super",
+ "anti",
+ "semi",
+ "multi",
+ "auto",
+ "co",
+ "de",
+ "ex",
+ "pro",
+ ]
+ roots = [
+ "act",
+ "form",
+ "port",
+ "dict",
+ "ject",
+ "rupt",
+ "scrib",
+ "struct",
+ "tract",
+ "vert",
+ "vis",
+ "spect",
+ "mit",
+ "duc",
+ "fac",
+ "cap",
+ "cred",
+ "grad",
+ "loc",
+ "mov",
+ "ped",
+ "pend",
+ "pos",
+ "sect",
+ "sent",
+ "serv",
+ "sign",
+ "sist",
+ "spec",
+ "tain",
+ "temp",
+ "tend",
+ "terr",
+ "test",
+ "text",
+ "tort",
+ "typ",
+ "urb",
+ "vac",
+ "val",
+ "ven",
+ "vers",
+ "vid",
+ "voc",
+ "volv",
+ ]
+ suffixes = [
+ "tion",
+ "sion",
+ "ness",
+ "ment",
+ "able",
+ "ible",
+ "ful",
+ "less",
+ "ing",
+ "ed",
+ "er",
+ "est",
+ "ly",
+ "ity",
+ "ous",
+ "ive",
+ "al",
+ "ic",
+ "ical",
+ "ary",
+ "ory",
+ "ure",
+ "ade",
+ "age",
+ "ance",
+ "ence",
+ "dom",
+ "hood",
+ "ship",
+ "ward",
+ "wise",
+ "like",
+ "some",
+ "teen",
+ "ty",
+ "th",
+ "ish",
+ "esque",
+ ]
+
+ words = set()
+
+ # Generate combinations
+ while len(words) < target_size:
+ # Simple root words
+ if random.random() < 0.3:
+ words.add(random.choice(roots))
+ # Prefix + root
+ elif random.random() < 0.6:
+ words.add(random.choice(prefixes) + random.choice(roots))
+ # Root + suffix
+ elif random.random() < 0.8:
+ words.add(random.choice(roots) + random.choice(suffixes))
+ # Prefix + root + suffix
+ else:
+ words.add(
+ random.choice(prefixes)
+ + random.choice(roots)
+ + random.choice(suffixes)
+ )
+
+ return list(words)
+
+ # Generate word pool that's ~25x the word_count for variety
+ target_pool_size = max(
+ word_count * 25, 100
+ ) # At least 100 words, scale dictionary with word_count*25
+ words = generate_word_pool(target_pool_size)
+
+ results = []
+ for i in range(doc_count):
+ doc_id = f"doc_{i:05d}"
+
+ # Generate random text (word_count words) - allow repetition for variety
+ selected_words = random.choices(words, k=word_count)
+ text_content = " ".join(selected_words)
+
+ # Generate random vector_size-dimensional vector
+ vector = [random.uniform(-1.0, 1.0) for _ in range(vector_size)]
+
+ # Create chunked document with chunks_per_doc identical chunks
+ chunked_document = ChunkedDocument(
+ chunker_config_id="test_chunker",
+ chunks=[
+ Chunk(content=KilnAttachmentModel.from_data(text_content, "text/plain"))
+ for _ in range(chunks_per_doc)
+ ],
+ path=tmp_path / f"chunked_document_{i}.kiln",
+ )
+
+ # Create chunk embeddings
+ chunk_embeddings = ChunkEmbeddings(
+ embedding_config_id="test_embedding",
+ embeddings=[Embedding(vector=vector) for _ in range(chunks_per_doc)],
+ path=tmp_path / f"chunk_embeddings_{i}.kiln",
+ )
+
+ results.append(
+ DocumentWithChunksAndEmbeddings(
+ document_id=doc_id,
+ chunked_document=chunked_document,
+ chunk_embeddings=chunk_embeddings,
+ )
+ )
+
+ return results
+
+
+ @pytest.mark.benchmark
+ # Not actually paid, but we want the "must be run manually" feature of the paid marker as this is very slow
+ @pytest.mark.paid
+ def test_benchmark_add_chunks(
+ benchmark,
+ hybrid_vector_store_config,
+ embedding_config,
+ create_rag_config_factory,
+ tmp_path,
+ ):
+ """Benchmark adding chunks with embeddings to LanceDB."""
+
+ doc_count = 1000
+ chunks_per_doc = 50
+ vector_size = 1024
+ word_count = 200
+
+ # Set random seed for reproducible results
+ random.seed(42)
+
+ # Generate random data items (this is not benchmarked)
+ benchmark_data = generate_benchmark_data(
+ doc_count, chunks_per_doc, vector_size, word_count, tmp_path
+ )
+
+ # Create RAG config and adapter (not benchmarked)
+ rag_config = create_rag_config_factory(hybrid_vector_store_config, embedding_config)
+ adapter = asyncio.run(
+ vector_store_adapter_for_config(rag_config, hybrid_vector_store_config)
+ )
+
+ # Benchmark only the chunk insertion
+ def add_chunks():
+ return asyncio.run(adapter.add_chunks_with_embeddings(benchmark_data))
+
+ # one iteration
+ benchmark.pedantic(add_chunks, rounds=1, iterations=1)
+ stats = benchmark.stats.stats
+
+ # Verify that data was actually added
+ async def verify_count():
+ final_count = await adapter.count_records()
+ return final_count
+
+ final_count = asyncio.run(verify_count())
+ assert final_count == doc_count * chunks_per_doc, (
+ f"Expected {doc_count * chunks_per_doc} records, got {final_count}"
+ )
+
+ # Expect at least 2500 records inserted per second
+ max_time = (doc_count * chunks_per_doc) / 2500
+ if stats.max > max_time:
+ pytest.fail(
+ f"Max time per iteration: {stats.max:.4f}s, expected less than {max_time:.4f}s"
+ )
+
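
For context on the threshold above, the back-of-the-envelope numbers implied by the test constants (plain arithmetic, not additional package code):

    doc_count = 1000
    chunks_per_doc = 50
    total_records = doc_count * chunks_per_doc  # 50,000 rows inserted in a single call
    max_time = total_records / 2500             # 20.0 seconds allowed at >= 2,500 records/second
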
+
+ @pytest.mark.asyncio
+ async def test_delete_nodes_not_in_set_basic_functionality(
+ fts_vector_store_config,
+ mock_chunked_documents,
+ embedding_config,
+ create_rag_config_factory,
+ ):
+ """Test basic functionality of delete_nodes_not_in_set - keep some docs, delete others."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Add both documents (doc_001 and doc_002)
+ await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+ # Verify both documents are there (4 chunks each = 8 total)
+ initial_count = await adapter.count_records()
+ assert initial_count == 8
+
+ # Keep only doc_001, delete doc_002
+ keep_set = {"doc_001"}
+ await adapter.delete_nodes_not_in_set(keep_set)
+
+ # Verify only doc_001 chunks remain
+ final_count = await adapter.count_records()
+ assert final_count == 4
+
+ # Verify doc_001 chunks are still searchable
+ doc1_results = await adapter.search(VectorStoreQuery(query_string="population"))
+ assert len(doc1_results) > 0
+ assert all("doc_001" == result.document_id for result in doc1_results)
+
+ # Verify doc_002 chunks are gone
+ doc2_results = await adapter.search(VectorStoreQuery(query_string="area"))
+ assert len(doc2_results) == 0
+
+
+ @pytest.mark.asyncio
+ async def test_delete_nodes_not_in_set_empty_set(
+ fts_vector_store_config,
+ mock_chunked_documents,
+ embedding_config,
+ create_rag_config_factory,
+ ):
+ """Test delete_nodes_not_in_set with empty set - should delete all nodes."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Add both documents
+ await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+ # Verify documents are there
+ initial_count = await adapter.count_records()
+ assert initial_count == 8
+
+ # Delete all nodes (empty keep set)
+ empty_set = set()
+ await adapter.delete_nodes_not_in_set(empty_set)
+
+ # Verify all nodes are deleted
+ final_count = await adapter.count_records()
+ assert final_count == 0
+
+
+ @pytest.mark.asyncio
+ async def test_delete_nodes_not_in_set_complete_set(
+ fts_vector_store_config,
+ mock_chunked_documents,
+ embedding_config,
+ create_rag_config_factory,
+ ):
+ """Test delete_nodes_not_in_set with complete set - should delete no nodes."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Add both documents
+ await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+ # Verify documents are there
+ initial_count = await adapter.count_records()
+ assert initial_count == 8
+
+ # Keep all documents
+ complete_set = {"doc_001", "doc_002"}
+ await adapter.delete_nodes_not_in_set(complete_set)
+
+ # Verify no nodes are deleted
+ final_count = await adapter.count_records()
+ assert final_count == 8
+
+ # Verify both documents are still searchable
+ doc1_results = await adapter.search(VectorStoreQuery(query_string="population"))
+ assert len(doc1_results) > 0
+
+ doc2_results = await adapter.search(VectorStoreQuery(query_string="area"))
+ assert len(doc2_results) > 0
+
+
+ @pytest.mark.asyncio
+ async def test_delete_nodes_not_in_set_partial_set(
+ fts_vector_store_config,
+ embedding_config,
+ create_rag_config_factory,
+ tmp_path,
+ ):
+ """Test delete_nodes_not_in_set with partial set - keep some, delete others."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Create three documents for more complex testing
+ three_docs_data = {
+ "keep_doc_1": [{"vector": [1.0, 1.0], "text": "Keep document one content"}],
+ "delete_doc_2": [{"vector": [2.0, 2.0], "text": "Delete document two content"}],
+ "keep_doc_3": [{"vector": [3.0, 3.0], "text": "Keep document three content"}],
+ }
+ three_docs = dicts_to_indexable_docs(three_docs_data, tmp_path)
+
+ # Add all three documents
+ await adapter.add_chunks_with_embeddings(three_docs)
+
+ # Verify all documents are there
+ initial_count = await adapter.count_records()
+ assert initial_count == 3
+
+ # Keep documents 1 and 3, delete document 2
+ keep_set = {"keep_doc_1", "keep_doc_3"}
+ await adapter.delete_nodes_not_in_set(keep_set)
+
+ # Verify only 2 documents remain
+ final_count = await adapter.count_records()
+ assert final_count == 2
+
+ # Verify kept documents are still searchable using more specific terms
+ keep1_results = await adapter.search(VectorStoreQuery(query_string="one"))
+ assert len(keep1_results) == 1
+ assert keep1_results[0].document_id == "keep_doc_1"
+
+ keep3_results = await adapter.search(VectorStoreQuery(query_string="three"))
+ assert len(keep3_results) == 1
+ assert keep3_results[0].document_id == "keep_doc_3"
+
+ # Verify deleted document is gone
+ delete_results = await adapter.search(VectorStoreQuery(query_string="two"))
+ assert len(delete_results) == 0
+
+
+ @pytest.mark.asyncio
+ async def test_delete_nodes_not_in_set_uninitialized_table(
+ fts_vector_store_config,
+ embedding_config,
+ create_rag_config_factory,
+ ):
+ """Test delete_nodes_not_in_set with uninitialized table - should raise TableNotFoundError."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Don't add any data, so table remains uninitialized
+ # The table property will raise TableNotFoundError when accessed
+ with pytest.raises(TableNotFoundError, match="Table vectors is not initialized"):
+ await adapter.delete_nodes_not_in_set({"doc_001"})
+
+
+ @pytest.mark.asyncio
+ async def test_delete_nodes_not_in_set_empty_table(
+ fts_vector_store_config,
+ mock_chunked_documents,
+ embedding_config,
+ create_rag_config_factory,
+ ):
+ """Test delete_nodes_not_in_set with empty table - should handle gracefully."""
+ rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
+ adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
+
+ # Create table by adding data, then delete all to make it empty
+ await adapter.add_chunks_with_embeddings(mock_chunked_documents)
+
+ # Delete all documents to make table empty but initialized
+ await adapter.delete_nodes_not_in_set(set()) # Empty set deletes everything
+
+ # Verify table is empty
+ initial_count = await adapter.count_records()
+ assert initial_count == 0
+
+ # Try to delete from empty table - should not error
+ await adapter.delete_nodes_not_in_set({"doc_001"})
+
+ # Verify count is still 0
+ final_count = await adapter.count_records()
+ assert final_count == 0
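
Taken together, the delete_nodes_not_in_set tests describe a sync-style cleanup: pass the set of document ids that should remain, and every chunk belonging to any other document is removed. A minimal sketch of that pattern, assuming an adapter populated as in these tests (ids are illustrative):

    # Inside an async context, with `adapter` built and populated as in the tests above.
    current_doc_ids = {"doc_001", "doc_002"}                # documents still present in the corpus
    await adapter.delete_nodes_not_in_set(current_doc_ids)  # chunks from any other document are dropped
    # Passing an empty set clears the table; calling this before the table exists raises TableNotFoundError.
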