kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +8 -2
- kiln_ai/adapters/adapter_registry.py +43 -208
- kiln_ai/adapters/chat/chat_formatter.py +8 -12
- kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/docker_model_runner_tools.py +119 -0
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/base_eval.py +2 -2
- kiln_ai/adapters/eval/eval_runner.py +9 -3
- kiln_ai/adapters/eval/g_eval.py +2 -2
- kiln_ai/adapters/eval/test_base_eval.py +2 -4
- kiln_ai/adapters/eval/test_g_eval.py +4 -5
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
- kiln_ai/adapters/fine_tune/__init__.py +1 -1
- kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +192 -0
- kiln_ai/adapters/ml_model_list.py +761 -37
- kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
- kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
- kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
- kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/parsers/__init__.py +1 -1
- kiln_ai/adapters/provider_tools.py +205 -47
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/repair/test_repair_task.py +12 -9
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +657 -85
- kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
- kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
- kiln_ai/adapters/test_ml_model_list.py +251 -1
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_adaptors.py +13 -6
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +254 -8
- kiln_ai/adapters/test_remote_config.py +651 -58
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +39 -34
- kiln_ai/datamodel/basemodel.py +170 -1
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +28 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/eval.py +1 -1
- kiln_ai/datamodel/external_tool_server.py +298 -0
- kiln_ai/datamodel/extraction.py +303 -0
- kiln_ai/datamodel/json_schema.py +25 -10
- kiln_ai/datamodel/project.py +40 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/registry.py +0 -15
- kiln_ai/datamodel/run_config.py +62 -0
- kiln_ai/datamodel/task.py +2 -77
- kiln_ai/datamodel/task_output.py +6 -1
- kiln_ai/datamodel/task_run.py +41 -0
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +4 -4
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_example_models.py +175 -0
- kiln_ai/datamodel/test_external_tool_server.py +691 -0
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +470 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_registry.py +8 -3
- kiln_ai/datamodel/test_task.py +15 -47
- kiln_ai/datamodel/test_tool_id.py +320 -0
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +105 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/__init__.py +8 -0
- kiln_ai/tools/base_tool.py +82 -0
- kiln_ai/tools/built_in_tools/__init__.py +13 -0
- kiln_ai/tools/built_in_tools/math_tools.py +124 -0
- kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
- kiln_ai/tools/mcp_server_tool.py +95 -0
- kiln_ai/tools/mcp_session_manager.py +246 -0
- kiln_ai/tools/rag_tools.py +157 -0
- kiln_ai/tools/test_base_tools.py +199 -0
- kiln_ai/tools/test_mcp_server_tool.py +457 -0
- kiln_ai/tools/test_mcp_session_manager.py +1585 -0
- kiln_ai/tools/test_rag_tools.py +848 -0
- kiln_ai/tools/test_tool_registry.py +562 -0
- kiln_ai/tools/tool_registry.py +85 -0
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +24 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +94 -0
- kiln_ai/utils/pdf_utils.py +38 -0
- kiln_ai/utils/project_utils.py +17 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_config.py +138 -1
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +131 -0
- kiln_ai/utils/test_pdf_utils.py +73 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
- kiln_ai-0.21.0.dist-info/RECORD +211 -0
- kiln_ai-0.19.0.dist-info/RECORD +0 -115
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -0,0 +1,1841 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
import random
|
|
4
|
+
import uuid
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, List
|
|
7
|
+
from unittest.mock import patch
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
from llama_index.core.schema import MetadataMode, NodeRelationship
|
|
11
|
+
from llama_index.core.vector_stores.types import VectorStoreQueryResult
|
|
12
|
+
from llama_index.vector_stores.lancedb.base import TableNotFoundError
|
|
13
|
+
|
|
14
|
+
from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
|
|
15
|
+
DocumentWithChunksAndEmbeddings,
|
|
16
|
+
SearchResult,
|
|
17
|
+
VectorStoreQuery,
|
|
18
|
+
)
|
|
19
|
+
from kiln_ai.adapters.vector_store.lancedb_adapter import LanceDBAdapter
|
|
20
|
+
from kiln_ai.adapters.vector_store.vector_store_registry import (
|
|
21
|
+
vector_store_adapter_for_config,
|
|
22
|
+
)
|
|
23
|
+
from kiln_ai.datamodel.basemodel import KilnAttachmentModel
|
|
24
|
+
from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
|
|
25
|
+
from kiln_ai.datamodel.datamodel_enums import ModelProviderName
|
|
26
|
+
from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding, EmbeddingConfig
|
|
27
|
+
from kiln_ai.datamodel.rag import RagConfig
|
|
28
|
+
from kiln_ai.datamodel.vector_store import VectorStoreConfig, VectorStoreType
|
|
29
|
+
from kiln_ai.utils.config import Config
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_all_nodes(adapter: LanceDBAdapter) -> List[SearchResult]:
    """Return every node held by the adapter's LanceDB store as a SearchResult.

    The document id and chunk index are read from the node metadata keys
    "kiln_doc_id" and "kiln_chunk_idx". No query is involved, so the
    similarity of every returned result is None.
    """
    search_results: List[SearchResult] = []
    for stored_node in adapter.lancedb_vector_store.get_nodes():
        search_results.append(
            SearchResult(
                document_id=stored_node.metadata["kiln_doc_id"],
                chunk_idx=stored_node.metadata["kiln_chunk_idx"],
                # MetadataMode.NONE: raw chunk text only, no metadata mixed in.
                chunk_text=stored_node.get_content(MetadataMode.NONE),
                similarity=None,
            )
        )
    return search_results
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.fixture(autouse=True)
def patch_settings_dir(tmp_path):
    """Redirect Config.settings_dir to the per-test tmp_path for every test."""
    settings_dir_patch = patch(
        "kiln_ai.utils.config.Config.settings_dir", return_value=tmp_path
    )
    # The patch stays active for the duration of the test body.
    with settings_dir_patch:
        yield
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@pytest.fixture
def hybrid_vector_store_config():
    """Provide a LANCE_DB_HYBRID vector store config for testing."""
    hybrid_properties = {
        "similarity_top_k": 10,
        "nprobes": 10,
        "overfetch_factor": 10,
        "vector_column_name": "vector",
        "text_key": "text",
        "doc_id_key": "doc_id",
    }
    return VectorStoreConfig(
        name="test_config",
        store_type=VectorStoreType.LANCE_DB_HYBRID,
        properties=hybrid_properties,
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@pytest.fixture
def fts_vector_store_config():
    """Provide a LANCE_DB_FTS (full-text search) vector store config for testing."""
    # Unlike the hybrid and vector configs in this file, no "nprobes" is set here.
    fts_properties = {
        "similarity_top_k": 10,
        "overfetch_factor": 10,
        "vector_column_name": "vector",
        "text_key": "text",
        "doc_id_key": "doc_id",
    }
    return VectorStoreConfig(
        name="test_config",
        store_type=VectorStoreType.LANCE_DB_FTS,
        properties=fts_properties,
    )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@pytest.fixture
def knn_vector_store_config():
    """Provide a LANCE_DB_VECTOR vector store config for testing."""
    knn_properties = {
        "similarity_top_k": 10,
        "nprobes": 10,
        "overfetch_factor": 10,
        "vector_column_name": "vector",
        "text_key": "text",
        "doc_id_key": "doc_id",
    }
    return VectorStoreConfig(
        name="test_config",
        store_type=VectorStoreType.LANCE_DB_VECTOR,
        properties=knn_properties,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@pytest.fixture
def embedding_config():
    """Provide an OpenAI "text-embedding-ada-002" embedding config for testing."""
    config = EmbeddingConfig(
        name="test_embedding",
        model_provider_name=ModelProviderName.openai,
        model_name="text-embedding-ada-002",
        properties={},
    )
    return config
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@pytest.fixture
def create_rag_config_factory() -> Callable[
    [VectorStoreConfig, EmbeddingConfig], RagConfig
]:
    """Provide a factory that builds a RagConfig from a vector store and embedding config.

    The returned callable links the RagConfig to the ids of the two configs
    passed in; the extractor and chunker ids are fixed placeholder strings.
    """

    def create_rag_config(
        vector_store_config: VectorStoreConfig, embedding_config: EmbeddingConfig
    ) -> RagConfig:
        rag_fields = {
            "name": "test_rag",
            "tool_name": "test_rag_tool",
            "tool_description": "A test RAG tool for vector search",
            "extractor_config_id": "test_extractor",
            "chunker_config_id": "test_chunker",
            "embedding_config_id": embedding_config.id,
            "vector_store_config_id": vector_store_config.id,
        }
        return RagConfig(**rag_fields)

    return create_rag_config
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def dicts_to_indexable_docs(
    docs: dict[str, list[dict[str, str | list[float]]]], tmp_path: Path
) -> list[DocumentWithChunksAndEmbeddings]:
    """Convert plain dict fixtures into DocumentWithChunksAndEmbeddings objects.

    Args:
        docs: Maps a document id to its parts; each part is a dict with a
            "vector" entry (embedding) and a "text" entry (chunk content).
        tmp_path: Directory used to build the ``path`` of the generated
            ChunkedDocument and ChunkEmbeddings models.

    Returns:
        One DocumentWithChunksAndEmbeddings per entry of ``docs``, in
        iteration order, with one chunk and one embedding per part.
    """
    indexable: list[DocumentWithChunksAndEmbeddings] = []
    for doc_id, parts in docs.items():
        # Both containers start empty and are filled part by part below.
        chunked_doc = ChunkedDocument(
            chunker_config_id="test_chunker",
            chunks=[],
            path=tmp_path / "chunked_document.kiln",
        )
        doc_embeddings = ChunkEmbeddings(
            embedding_config_id="test_embedding",
            embeddings=[],
            path=tmp_path / "chunk_embeddings.kiln",
        )

        for part in parts:
            raw_vector = part["vector"]
            # A non-list value becomes a one-element vector; every component
            # is coerced to float.
            components = raw_vector if isinstance(raw_vector, list) else [raw_vector]
            float_vector = [float(component) for component in components]

            doc_embeddings.embeddings.append(Embedding(vector=float_vector))

            text_attachment = KilnAttachmentModel.from_data(
                str(part["text"]),
                "text/plain",
            )
            chunked_doc.chunks.append(Chunk(content=text_attachment))

        indexable.append(
            DocumentWithChunksAndEmbeddings(
                document_id=doc_id,
                chunked_document=chunked_doc,
                chunk_embeddings=doc_embeddings,
            )
        )

    return indexable
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@pytest.fixture
def mock_chunked_documents(tmp_path):
    """Provide two sample documents (population facts and area facts), four chunks each."""
    # (embedding vector, chunk text) pairs for "doc_001".
    population_parts: list[tuple[list[float], str]] = [
        (
            [1.1, 1.2],
            "The population of Tokyo, Japan is approximately 37 million people",
        ),
        (
            [0.2, 1.8],
            "New York City, USA has a population of about 8.8 million residents",
        ),
        (
            [0.45452, 51.8],
            "London, UK has a population of roughly 9 million people",
        ),
        (
            [0.7, 0.8],
            "Rio de Janeiro, Brazil has a population of about 6.7 million residents",
        ),
    ]
    # (embedding vector, chunk text) pairs for "doc_002".
    area_parts: list[tuple[list[float], str]] = [
        (
            [50.0, 50.0],
            "The area of Tokyo, Japan is approximately 2,191 square kilometers",
        ),
        (
            [55.0, 55.0],
            "The area of New York City, USA is approximately 783.8 square kilometers",
        ),
        (
            [60.0, 60.0],
            "The area of London, UK is approximately 1,572 square kilometers",
        ),
        (
            [65.0, 65.0],
            "The area of Rio de Janeiro, Brazil is approximately 1,256 square kilometers",
        ),
    ]

    docs: dict[str, list[dict[str, str | list[float]]]] = {
        "doc_001": [
            {"vector": vector, "text": text} for vector, text in population_parts
        ],
        "doc_002": [{"vector": vector, "text": text} for vector, text in area_parts],
    }

    return dicts_to_indexable_docs(docs, tmp_path)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@pytest.mark.asyncio
async def test_add_chunks_with_embeddings_and_similarity_search(
    knn_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Index the sample documents and check the nearest neighbour of a known vector."""
    rag_config = create_rag_config_factory(knn_vector_store_config, embedding_config)

    # The adapter is obtained through the registry rather than built directly.
    adapter = await vector_store_adapter_for_config(rag_config, knn_vector_store_config)
    await adapter.add_chunks_with_embeddings(mock_chunked_documents)

    # [55.0, 55.0] is exactly the embedding of the "area of New York City" chunk.
    nyc_area_query = VectorStoreQuery(query_embedding=[55.0, 55.0])
    results = await adapter.search(nyc_area_query)

    assert len(results) > 0
    best_match_text = results[0].chunk_text
    assert "New York City" in best_match_text
    assert "783.8 square kilometers" in best_match_text
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@pytest.mark.asyncio
async def test_fts_search(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Full-text search for "london" returns both London chunks."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)

    await adapter.add_chunks_with_embeddings(mock_chunked_documents)

    assert isinstance(adapter, LanceDBAdapter)

    # The query is lower-case while the stored text says "London".
    results = await adapter.search(VectorStoreQuery(query_string="london"))

    # One London chunk lives in each sample document.
    assert len(results) >= 2
    matched_texts = [hit.chunk_text for hit in results]
    assert any("London, UK has a population" in text for text in matched_texts)
    assert any("The area of London, UK" in text for text in matched_texts)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
@pytest.mark.asyncio
async def test_hybrid_search(
    hybrid_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Hybrid search with text "Tokyo" plus a matching vector finds both Tokyo chunks."""
    rag_config = create_rag_config_factory(hybrid_vector_store_config, embedding_config)
    adapter = await vector_store_adapter_for_config(
        rag_config, hybrid_vector_store_config
    )

    await adapter.add_chunks_with_embeddings(mock_chunked_documents)

    # [1.1, 1.2] is the embedding of the Tokyo population chunk.
    tokyo_query = VectorStoreQuery(query_string="Tokyo", query_embedding=[1.1, 1.2])
    results = await adapter.search(tokyo_query)

    assert len(results) > 0
    tokyo_hits = [hit for hit in results if "Tokyo" in hit.chunk_text]
    # Both the population chunk and the area chunk mention Tokyo.
    assert len(tokyo_hits) >= 2
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
@pytest.mark.asyncio
async def test_upsert_behavior(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Test that adding the same chunks multiple times works (upsert behavior).

    Re-adding an already indexed document must not change the number of
    search hits, and adding the full document set afterwards must still
    leave the store searchable.
    """
    # NOTE: the asyncio marker above matches the other async tests in this
    # file; without it the coroutine is not run under pytest-asyncio strict mode.
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)

    adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)

    # Extract first document only
    first_doc = [mock_chunked_documents[0]]

    await adapter.add_chunks_with_embeddings(first_doc)

    # Search to verify it's there
    results1 = await adapter.search(VectorStoreQuery(query_string="Tokyo"))

    # Add the same document again
    await adapter.add_chunks_with_embeddings(first_doc)

    # Search again - should still find the same chunks (not duplicated)
    results2 = await adapter.search(VectorStoreQuery(query_string="Tokyo"))

    # Re-adding identical chunks must not change the number of hits.
    assert len(results2) == len(results1)

    # Add all documents
    await adapter.add_chunks_with_embeddings(mock_chunked_documents)

    # Final search
    results3 = await adapter.search(VectorStoreQuery(query_string="population"))

    assert len(results3) > 0
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
@pytest.mark.asyncio
async def test_count_records_empty_store(
    fts_vector_store_config, embedding_config, create_rag_config_factory
):
    """A store that never received any chunks reports zero records."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)

    record_count = await adapter.count_records()
    assert record_count == 0
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
@pytest.mark.asyncio
async def test_count_records_with_data(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """After indexing the sample documents the store reports one record per chunk."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = await vector_store_adapter_for_config(rag_config, fts_vector_store_config)

    await adapter.add_chunks_with_embeddings(mock_chunked_documents)

    # Two sample documents with four chunks each.
    assert await adapter.count_records() == 8
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
@pytest.mark.asyncio
async def test_get_all_chunks(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """All indexed chunks can be read back through the get_all_nodes helper."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    await adapter.add_chunks_with_embeddings(mock_chunked_documents)

    stored_chunks = get_all_nodes(adapter)
    assert len(stored_chunks) == 8  # two documents, four chunks each

    known_document_ids = ["doc_001", "doc_002"]
    for stored in stored_chunks:
        assert stored.document_id in known_document_ids
        assert len(stored.chunk_text) > 0
        # get_all_nodes builds its results without a query, so no score exists.
        assert stored.similarity is None
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def test_format_query_result_error_conditions(
    fts_vector_store_config, embedding_config, create_rag_config_factory
):
    """Check format_query_result on missing, empty and mismatched query results.

    A None or empty ids/nodes/similarities field yields an empty list;
    fields of differing lengths raise ValueError.
    """
    from llama_index.core.schema import TextNode

    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # Each of these is treated as "no results" rather than as an error.
    empty_result_fields = [
        {"ids": None, "nodes": [], "similarities": []},
        {"ids": [], "nodes": None, "similarities": []},
        {"ids": [], "nodes": [], "similarities": None},
        {"ids": [], "nodes": [], "similarities": []},
    ]
    for fields in empty_result_fields:
        formatted = adapter.format_query_result(VectorStoreQueryResult(**fields))
        assert formatted == []

    # Length mismatches are rejected, whether the other fields are empty or not.
    mismatched_results = [
        VectorStoreQueryResult(ids=["1", "2"], nodes=[], similarities=[]),
        VectorStoreQueryResult(
            ids=["1", "2"], nodes=[TextNode(text="test1")], similarities=[0.5, 0.3]
        ),
    ]
    for mismatched in mismatched_results:
        with pytest.raises(
            ValueError, match="ids, nodes, and similarities must have the same length"
        ):
            adapter.format_query_result(mismatched)
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def test_build_kwargs_for_query_validation_errors(
    create_rag_config_factory,
    hybrid_vector_store_config,
    fts_vector_store_config,
    knn_vector_store_config,
    embedding_config,
):
    """build_kwargs_for_query rejects queries missing what the store type needs."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)

    hybrid_message = (
        "query_string and query_embedding must be provided for hybrid search"
    )
    # One entry per store config: the invalid queries and the expected message.
    scenarios = [
        (
            fts_vector_store_config,
            [
                (
                    VectorStoreQuery(query_string=None, query_embedding=None),
                    "query_string must be provided for fts search",
                ),
            ],
        ),
        (
            hybrid_vector_store_config,
            [
                # Hybrid needs both parts; each query below lacks one of them.
                (
                    VectorStoreQuery(query_string=None, query_embedding=[1.0, 2.0]),
                    hybrid_message,
                ),
                (
                    VectorStoreQuery(query_string="test", query_embedding=None),
                    hybrid_message,
                ),
            ],
        ),
        (
            knn_vector_store_config,
            [
                (
                    VectorStoreQuery(query_string=None, query_embedding=None),
                    "query_embedding must be provided for vector search",
                ),
            ],
        ),
    ]

    for store_config, invalid_queries in scenarios:
        adapter = LanceDBAdapter(rag_config, store_config)
        for invalid_query, expected_message in invalid_queries:
            with pytest.raises(ValueError, match=expected_message):
                adapter.build_kwargs_for_query(invalid_query)
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
@pytest.mark.asyncio
async def test_search_with_table_not_found_error(
    fts_vector_store_config, embedding_config, create_rag_config_factory
):
    """search returns an empty list when the store's aquery raises TableNotFoundError."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # Patch aquery on the store's class (not the instance) so the call fails.
    store_class = adapter.lancedb_vector_store.__class__
    table_missing = TableNotFoundError("Table vectors is not initialized")
    with patch.object(store_class, "aquery") as mock_aquery:
        mock_aquery.side_effect = table_missing

        results = await adapter.search(VectorStoreQuery(query_string="test query"))

        assert results == []
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
@pytest.mark.asyncio
async def test_search_with_empty_results_error(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
):
    """Searching an adapter that never had chunks added returns an empty list."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)

    # Nothing is mocked and nothing is indexed before searching.
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    empty_store_query = VectorStoreQuery(query_string="test query")
    results = await adapter.search(empty_store_query)

    assert results == []
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
@pytest.mark.asyncio
async def test_destroy(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Test the destroy method removes the database directory."""
    # NOTE: the asyncio marker above matches the other async tests in this
    # file; without it the coroutine is not run under pytest-asyncio strict mode.
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)

    adapter = LanceDBAdapter(
        rag_config,
        fts_vector_store_config,
    )

    # Add some data to create the database
    await adapter.add_chunks_with_embeddings(mock_chunked_documents)

    # Verify data exists (two sample documents, four chunks each)
    count = await adapter.count_records()
    assert count == 8

    # Get the database path
    db_path = LanceDBAdapter.lancedb_path_for_config(rag_config)
    assert os.path.exists(db_path)

    # Destroy the database
    await adapter.destroy()

    # Verify the database directory is gone
    assert not os.path.exists(db_path)
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def test_lancedb_path_for_config():
    """Check lancedb_path_for_config for a config with an ID and one without."""
    # Both configs share every field except the tool description.
    common_fields = {
        "name": "test_rag",
        "tool_name": "test_rag_tool",
        "extractor_config_id": "test_extractor",
        "chunker_config_id": "test_chunker",
        "embedding_config_id": "test_embedding",
        "vector_store_config_id": "test_vector_store",
    }

    # A config with an ID maps to <settings_dir>/rag_indexes/lancedb/<id>.
    config_with_id = RagConfig(
        tool_description="A test RAG tool for path testing",
        **common_fields,
    )
    index_root = Path(Config.settings_dir()) / "rag_indexes" / "lancedb"
    wanted_path = str(index_root / str(config_with_id.id))

    assert LanceDBAdapter.lancedb_path_for_config(config_with_id) == wanted_path

    # A config whose ID has been cleared must be rejected with a ValueError.
    config_without_id = RagConfig(
        tool_description="A test RAG tool with no ID",
        **common_fields,
    )
    config_without_id.id = None

    with pytest.raises(ValueError, match="Vector store config ID is required"):
        LanceDBAdapter.lancedb_path_for_config(config_without_id)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def test_query_type_property(
    embedding_config,
    create_rag_config_factory,
):
    """Check that query_type reflects the store type of the vector store config."""
    # Properties that follow the optional "nprobes" entry in every config.
    trailing_properties = {
        "overfetch_factor": 10,
        "vector_column_name": "vector",
        "text_key": "text",
        "doc_id_key": "doc_id",
    }

    # (config name, store type, extra properties, expected query type)
    # The FTS config is the only one that carries no "nprobes" property.
    scenarios = [
        ("fts_test", VectorStoreType.LANCE_DB_FTS, {}, "fts"),
        ("hybrid_test", VectorStoreType.LANCE_DB_HYBRID, {"nprobes": 10}, "hybrid"),
        ("vector_test", VectorStoreType.LANCE_DB_VECTOR, {"nprobes": 10}, "vector"),
    ]

    for config_name, store_type, extra_properties, expected_query_type in scenarios:
        store_config = VectorStoreConfig(
            name=config_name,
            store_type=store_type,
            properties={
                "similarity_top_k": 10,
                **extra_properties,
                **trailing_properties,
            },
        )
        rag_config = create_rag_config_factory(store_config, embedding_config)

        adapter = LanceDBAdapter(rag_config, store_config)
        assert adapter.query_type == expected_query_type
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
@pytest.mark.asyncio
async def test_adapter_reuse_preserves_data(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """A second adapter built from the same config must not wipe existing data."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)

    # The first adapter stores the first document (expected: 4 records).
    first_adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
    await first_adapter.add_chunks_with_embeddings([mock_chunked_documents[0]])
    assert await first_adapter.count_records() == 4

    # A second adapter over the same config stores the second document. Seeing
    # 8 records shows that the first document survived the re-instantiation.
    second_adapter = LanceDBAdapter(rag_config, fts_vector_store_config)
    await second_adapter.add_chunks_with_embeddings([mock_chunked_documents[1]])
    assert await second_adapter.count_records() == 8

    # NOTE(review): this call was once expected to raise a lance "Retryable
    # commit conflict" error once the second adapter existed. It is executed
    # unguarded here, so it is presumably expected to succeed now - confirm.
    # Only completion of the call is checked; its result is not inspected.
    await first_adapter.search(VectorStoreQuery(query_string="Tokyo"))

    # The second adapter must be able to find data.
    second_hits = await second_adapter.search(VectorStoreQuery(query_string="Tokyo"))
    assert len(second_hits) > 0
|
|
705
|
+
|
|
706
|
+
|
|
707
|
+
@pytest.mark.asyncio
async def test_skip_existing_chunks_when_count_matches(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Re-adding an already stored document must not change the record count."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # doc_001, which contributes 4 chunks.
    doc_batch = [mock_chunked_documents[0]]

    # First insertion stores all 4 chunks.
    await adapter.add_chunks_with_embeddings(doc_batch)
    assert await adapter.count_records() == 4

    # Second insertion of the very same document: the count must stay at 4.
    await adapter.add_chunks_with_embeddings(doc_batch)
    assert await adapter.count_records() == 4

    # The stored chunks are still searchable afterwards.
    hits = await adapter.search(VectorStoreQuery(query_string="Tokyo"))
    assert len(hits) > 0
    assert "Tokyo" in hits[0].chunk_text
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
@pytest.mark.asyncio
async def test_batching_functionality(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """15 chunks inserted with nodes_batch_size=5 arrive as three batches of 5."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # One document with 15 chunks, enough to need several batches.
    chunk_dicts = [
        {"vector": [i * 0.1, i * 0.2], "text": f"Chunk {i} content"}
        for i in range(15)
    ]
    records = dicts_to_indexable_docs({"large_doc": chunk_dicts}, tmp_path)

    observed_batch_sizes = []

    async def recording_async_add(self, nodes, **kwargs):
        # Remember how many nodes each call received, then store them through
        # the store's synchronous add().
        observed_batch_sizes.append(len(nodes))
        return self.add(nodes, **kwargs)

    # async_add is replaced on the store's class for the duration of the insert.
    store_class = adapter.lancedb_vector_store.__class__
    with patch.object(store_class, "async_add", recording_async_add):
        await adapter.add_chunks_with_embeddings(records, nodes_batch_size=5)

    # 15 chunks / batch size 5 -> three full batches.
    expected_batch_sizes = [5, 5, 5]
    assert observed_batch_sizes == expected_batch_sizes, (
        f"Expected batch sizes {expected_batch_sizes}, got {observed_batch_sizes}"
    )

    # Every chunk made it into the store.
    assert await adapter.count_records() == 15

    # The chunks are searchable; the hit count cannot exceed the chunk count.
    hits = await adapter.search(VectorStoreQuery(query_string="Chunk"))
    assert len(hits) > 0
    assert len(hits) <= 15
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
@pytest.mark.asyncio
async def test_batching_functionality_with_remainder(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """17 chunks inserted with nodes_batch_size=7 arrive as batches of 7, 7 and 3."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # 17 is not a multiple of the batch size, so the last batch is partial.
    chunk_dicts = [
        {"vector": [i * 0.1, i * 0.2], "text": f"Chunk {i} content"}
        for i in range(17)
    ]
    records = dicts_to_indexable_docs({"large_doc": chunk_dicts}, tmp_path)

    observed_batch_sizes = []

    async def recording_async_add(self, nodes, **kwargs):
        # Remember how many nodes each call received, then store them through
        # the store's synchronous add().
        observed_batch_sizes.append(len(nodes))
        return self.add(nodes, **kwargs)

    # async_add is replaced on the store's class for the duration of the insert.
    store_class = adapter.lancedb_vector_store.__class__
    with patch.object(store_class, "async_add", recording_async_add):
        await adapter.add_chunks_with_embeddings(records, nodes_batch_size=7)

    # Two full batches of 7 plus a remainder batch of 3.
    expected_batch_sizes = [7, 7, 3]
    assert observed_batch_sizes == expected_batch_sizes, (
        f"Expected batch sizes {expected_batch_sizes}, got {observed_batch_sizes}"
    )

    # Every chunk made it into the store.
    assert await adapter.count_records() == 17
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
@pytest.mark.asyncio
async def test_batching_functionality_edge_cases(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """Batching with a batch larger than the input and with a batch size of 1."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # A small document with 3 chunks, reused by both scenarios.
    chunk_dicts = [
        {"vector": [i * 0.1, i * 0.2], "text": f"Small chunk {i} content"}
        for i in range(3)
    ]
    records = dicts_to_indexable_docs({"small_doc": chunk_dicts}, tmp_path)

    observed_batch_sizes = []

    async def recording_async_add(self, nodes, **kwargs):
        # Remember how many nodes each call received, then store them through
        # the store's synchronous add().
        observed_batch_sizes.append(len(nodes))
        return self.add(nodes, **kwargs)

    # Scenario 1: batch size (10) exceeds the chunk count -> a single batch of 3.
    with patch.object(
        adapter.lancedb_vector_store.__class__, "async_add", recording_async_add
    ):
        await adapter.add_chunks_with_embeddings(records, nodes_batch_size=10)

    expected_batch_sizes = [3]
    assert observed_batch_sizes == expected_batch_sizes, (
        f"Expected batch sizes {expected_batch_sizes}, got {observed_batch_sizes}"
    )
    assert await adapter.count_records() == 3

    # Scenario 2: batch size 1 -> one batch per chunk.
    observed_batch_sizes.clear()

    # A new rag_config is created so this scenario gets a fresh database.
    fresh_rag_config = create_rag_config_factory(
        fts_vector_store_config, embedding_config
    )
    fresh_adapter = LanceDBAdapter(fresh_rag_config, fts_vector_store_config)

    with patch.object(
        fresh_adapter.lancedb_vector_store.__class__,
        "async_add",
        recording_async_add,
    ):
        await fresh_adapter.add_chunks_with_embeddings(records, nodes_batch_size=1)

    expected_batch_sizes = [1, 1, 1]
    assert observed_batch_sizes == expected_batch_sizes, (
        f"Expected batch sizes {expected_batch_sizes}, got {observed_batch_sizes}"
    )
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
@pytest.mark.asyncio
async def test_get_nodes_by_ids_functionality(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """Exercise get_nodes_by_ids before and after data has been inserted."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # Before anything was inserted, a lookup simply comes back empty.
    random_ids = [str(uuid.uuid4()) for _ in range(2)]
    nodes_before_insert = await adapter.get_nodes_by_ids(random_ids)
    assert len(nodes_before_insert) == 0

    # Insert doc_001.
    await adapter.add_chunks_with_embeddings([mock_chunked_documents[0]])

    # The chunk IDs are deterministic, so they can be recomputed for lookup.
    expected_ids = [
        adapter.compute_deterministic_chunk_id("doc_001", chunk_idx)
        for chunk_idx in range(4)
    ]
    found_nodes = await adapter.get_nodes_by_ids(expected_ids)

    # All 4 nodes come back, in the order in which their IDs were requested.
    assert len(found_nodes) == 4
    for position, (node, expected_id) in enumerate(zip(found_nodes, expected_ids)):
        assert node.id_ == expected_id
        assert node.metadata["kiln_doc_id"] == "doc_001"
        assert node.metadata["kiln_chunk_idx"] == position
        assert len(node.get_content()) > 0

    # IDs of a document that was never inserted match nothing.
    unknown_ids = [
        adapter.compute_deterministic_chunk_id("fake_doc", chunk_idx)
        for chunk_idx in range(2)
    ]
    assert len(await adapter.get_nodes_by_ids(unknown_ids)) == 0

    # An adapter for a brand-new config, which has received no data, finds
    # nothing either, even for IDs that exist in the other adapter's store.
    untouched_rag_config = create_rag_config_factory(
        fts_vector_store_config, embedding_config
    )
    untouched_adapter = LanceDBAdapter(untouched_rag_config, fts_vector_store_config)
    assert len(await untouched_adapter.get_nodes_by_ids(expected_ids)) == 0
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
@pytest.mark.asyncio
async def test_delete_nodes_by_document_id(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Deleting by document ID removes that document's chunks and no others."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # Both fixture documents go in: 4 chunks each, 8 records in total.
    await adapter.add_chunks_with_embeddings(mock_chunked_documents)
    assert await adapter.count_records() == 8

    # Removing doc_001 leaves only the 4 chunks of doc_002.
    await adapter.delete_nodes_by_document_id("doc_001")
    assert await adapter.count_records() == 4

    # "area" is expected to still match, via the remaining doc_002 chunks.
    surviving_hits = await adapter.search(VectorStoreQuery(query_string="area"))
    assert len(surviving_hits) > 0

    # "population" only occurred in doc_001, so it must not match any more.
    # A Warning carrying "query results are empty" is tolerated as an
    # alternative to an empty list (LanceDB reportedly raises one when a query
    # has no results).
    population_query = VectorStoreQuery(query_string="population")
    try:
        removed_hits = await adapter.search(population_query)
        assert len(removed_hits) == 0
    except Warning as empty_warning:
        assert "query results are empty" in str(empty_warning)

    # Deleting an unknown document neither fails nor changes the count.
    await adapter.delete_nodes_by_document_id("non_existent_doc")
    assert await adapter.count_records() == 4
|
|
1002
|
+
|
|
1003
|
+
|
|
1004
|
+
@pytest.mark.asyncio
async def test_uuid_scheme_retrieval_and_node_properties(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """Stored nodes are retrievable by deterministic ID and point at their document."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    def check_source_document(node, document_id):
        # The ref_doc_id is carried by the SOURCE relationship, which may be a
        # single RelatedNodeInfo or a list of them.
        source_relationship = node.relationships.get(NodeRelationship.SOURCE)
        assert source_relationship is not None
        if isinstance(source_relationship, list):
            assert len(source_relationship) > 0
            assert source_relationship[0].node_id == document_id
        else:
            assert source_relationship.node_id == document_id

    # Insert doc_001 and look up each of its 4 chunks individually.
    await adapter.add_chunks_with_embeddings([mock_chunked_documents[0]])

    for chunk_idx in range(4):
        # Same ID computation as the adapter uses when inserting.
        expected_id = adapter.compute_deterministic_chunk_id("doc_001", chunk_idx)

        matches = await adapter.get_nodes_by_ids([expected_id])
        assert len(matches) == 1
        node = matches[0]

        # The node carries the ID it was looked up by ...
        assert node.id_ == expected_id

        # ... references its source document ...
        check_source_document(node, "doc_001")

        # ... and has the expected metadata, content and embedding.
        assert node.metadata["kiln_doc_id"] == "doc_001"
        assert node.metadata["kiln_chunk_idx"] == chunk_idx
        assert len(node.get_content()) > 0
        assert node.embedding is not None
        assert len(node.embedding) == 2  # the test embeddings are 2D

    # Repeat with doc_002 to show the scheme is consistent across documents.
    await adapter.add_chunks_with_embeddings([mock_chunked_documents[1]])

    for chunk_idx in range(4):
        expected_id = adapter.compute_deterministic_chunk_id("doc_002", chunk_idx)

        matches = await adapter.get_nodes_by_ids([expected_id])
        assert len(matches) == 1
        node = matches[0]

        assert node.id_ == expected_id
        assert node.metadata["kiln_doc_id"] == "doc_002"
        assert node.metadata["kiln_chunk_idx"] == chunk_idx

        check_source_document(node, "doc_002")
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
@pytest.mark.asyncio
async def test_deterministic_chunk_id_consistency(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
):
    """Chunk IDs are stable for equal inputs, distinct otherwise, and valid UUIDs."""
    adapter = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    doc_id = "test_doc_123"
    chunk_idx = 5

    # Identical inputs give identical IDs.
    id1 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
    id2 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
    assert id1 == id2

    # Changing either the chunk index or the document ID changes the result.
    id3 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx + 1)
    id4 = adapter.compute_deterministic_chunk_id(doc_id + "_different", chunk_idx)
    assert id1 != id3
    assert id1 != id4
    assert id3 != id4

    # Each generated ID must parse as a UUID string.
    import uuid

    try:
        for generated_id in (id1, id3, id4):
            uuid.UUID(generated_id)  # raises ValueError when malformed
    except ValueError:
        pytest.fail("Generated IDs are not valid UUIDs")
|
|
1116
|
+
|
|
1117
|
+
|
|
1118
|
+
@pytest.mark.asyncio
async def test_chunk_replacement_triggers_deletion(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """Re-adding a document with a different chunk count deletes its old chunks."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # First version of the document: 2 chunks.
    original_chunks = [
        {"vector": [i * 1.0, i * 1.0], "text": f"Initial chunk {i}"}
        for i in range(1, 3)
    ]
    original_records = dicts_to_indexable_docs({"test_doc": original_chunks}, tmp_path)

    await adapter.add_chunks_with_embeddings(original_records)
    assert await adapter.count_records() == 2

    # Second version of the same document: 3 chunks. The stored chunk count
    # (2) no longer matches the incoming one (3), which is expected to make
    # the adapter delete the old chunks first.
    replacement_chunks = [
        {"vector": [i * 10.0, i * 10.0], "text": f"Modified chunk {i}"}
        for i in range(1, 4)
    ]
    replacement_records = dicts_to_indexable_docs(
        {"test_doc": replacement_chunks}, tmp_path
    )

    # Wrap the real delete method so that its invocations can be observed.
    deleted_document_ids = []
    real_delete = adapter.delete_nodes_by_document_id

    async def recording_delete(document_id: str):
        deleted_document_ids.append(document_id)
        return await real_delete(document_id)

    adapter.delete_nodes_by_document_id = recording_delete

    await adapter.add_chunks_with_embeddings(replacement_records)

    # The deletion was triggered for this document.
    assert "test_doc" in deleted_document_ids

    # Only the 3 new chunks remain.
    assert await adapter.count_records() == 3

    # All stored chunks are the new ones.
    new_hits = await adapter.search(VectorStoreQuery(query_string="Modified"))
    assert len(new_hits) == 3
    assert all("Modified" in hit.chunk_text for hit in new_hits)

    # The old chunks are gone. A Warning carrying "query results are empty" is
    # tolerated as an alternative to an empty list (LanceDB reportedly raises
    # one when a query has no results).
    try:
        stale_hits = await adapter.search(VectorStoreQuery(query_string="Initial"))
        assert len(stale_hits) == 0
    except Warning as empty_warning:
        assert "query results are empty" in str(empty_warning)
|
|
1188
|
+
|
|
1189
|
+
|
|
1190
|
+
@pytest.mark.asyncio
async def test_chunk_deletion_ensures_complete_cleanup_and_other_docs_unaffected(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """Replacing one document removes all its old chunks and leaves others alone."""
    rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
    adapter = LanceDBAdapter(rag_config, fts_vector_store_config)

    # The document that will be replaced later: 5 chunks.
    target_chunks = [
        {"vector": [i * 1.0, i * 1.0], "text": f"Original chunk {i}"}
        for i in range(1, 6)
    ]
    target_records = dicts_to_indexable_docs({"target_doc": target_chunks}, tmp_path)

    # A bystander document that must stay untouched: 3 chunks.
    bystander_chunks = [
        {"vector": [i * 10.0, i * 10.0], "text": f"Other doc chunk {i}"}
        for i in range(1, 4)
    ]
    bystander_records = dicts_to_indexable_docs(
        {"other_doc": bystander_chunks}, tmp_path
    )

    await adapter.add_chunks_with_embeddings(target_records)
    await adapter.add_chunks_with_embeddings(bystander_records)

    # 5 + 3 = 8 chunks stored, and both documents are searchable.
    assert await adapter.count_records() == 8

    target_hits = await adapter.search(VectorStoreQuery(query_string="Original"))
    assert len(target_hits) == 5

    bystander_hits = await adapter.search(VectorStoreQuery(query_string="Other"))
    assert len(bystander_hits) == 3

    # New version of the target document: 7 chunks. The stored chunk count (5)
    # differs from the incoming one (7), which is expected to trigger deletion
    # of the old chunks before the new ones are written.
    replacement_chunks = [
        {"vector": [i * 100.0, i * 100.0], "text": f"New target chunk {i}"}
        for i in range(1, 8)
    ]
    replacement_records = dicts_to_indexable_docs(
        {"target_doc": replacement_chunks}, tmp_path
    )

    # Wrap the real delete method so that its invocations can be observed.
    deleted_document_ids = []
    real_delete = adapter.delete_nodes_by_document_id

    async def recording_delete(document_id: str):
        deleted_document_ids.append(document_id)
        return await real_delete(document_id)

    adapter.delete_nodes_by_document_id = recording_delete

    await adapter.add_chunks_with_embeddings(replacement_records)

    # Deletion happened for the target document only.
    assert "target_doc" in deleted_document_ids
    assert "other_doc" not in deleted_document_ids

    # 7 new target chunks + 3 bystander chunks = 10 records.
    assert await adapter.count_records() == 10

    # The target document now consists of exactly the new chunks.
    new_target_hits = await adapter.search(VectorStoreQuery(query_string="New target"))
    assert len(new_target_hits) == 7
    assert all("New target" in hit.chunk_text for hit in new_target_hits)

    # "Original" only occurred in the old chunks, so it must not match any
    # more. A Warning carrying "query results are empty" is tolerated as an
    # alternative to an empty list (LanceDB reportedly raises one when a query
    # has no results).
    try:
        stale_hits = await adapter.search(VectorStoreQuery(query_string="Original"))
        assert len(stale_hits) == 0
    except Warning as empty_warning:
        assert "query results are empty" in str(empty_warning)

    # The bystander document still has all 3 chunks ...
    remaining_hits = await adapter.search(VectorStoreQuery(query_string="Other"))
    assert len(remaining_hits) == 3
    assert all("Other doc" in hit.chunk_text for hit in remaining_hits)

    # ... and each of its original texts is still present.
    remaining_texts = [hit.chunk_text for hit in remaining_hits]
    for chunk_number in range(1, 4):
        wanted_text = f"Other doc chunk {chunk_number}"
        assert any(wanted_text in text for text in remaining_texts)
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
@pytest.mark.asyncio
async def test_delete_nodes_by_document_id_direct(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """Call delete_nodes_by_document_id directly and check what is left behind.

    Two documents are indexed, each is deleted in turn, and finally an unknown
    document id is deleted to confirm that the call is harmless.
    """
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    async def find(term):
        # Small helper: run a search for a single query string.
        return await store.search(VectorStoreQuery(query_string=term))

    # First document: three "Alpha" chunks.
    alpha_records = dicts_to_indexable_docs(
        {
            "document_1": [
                {"vector": [1.0, 1.0], "text": "Alpha content part 1"},
                {"vector": [2.0, 2.0], "text": "Alpha content part 2"},
                {"vector": [3.0, 3.0], "text": "Alpha content part 3"},
            ]
        },
        tmp_path,
    )

    # Second document: two "Beta" chunks.
    beta_records = dicts_to_indexable_docs(
        {
            "document_2": [
                {"vector": [10.0, 10.0], "text": "Beta content section 1"},
                {"vector": [20.0, 20.0], "text": "Beta content section 2"},
            ]
        },
        tmp_path,
    )

    await store.add_chunks_with_embeddings(alpha_records)
    await store.add_chunks_with_embeddings(beta_records)

    # 3 + 2 chunks are stored and each document is searchable.
    assert await store.count_records() == 5

    alpha_hits = await find("Alpha")
    assert len(alpha_hits) == 3

    beta_hits = await find("Beta")
    assert len(beta_hits) == 2

    # Remove the first document; only the two Beta chunks should remain.
    await store.delete_nodes_by_document_id("document_1")
    assert await store.count_records() == 2

    try:
        alpha_hits_after = await find("Alpha")
        assert len(alpha_hits_after) == 0
    except Warning as empty_warning:
        # LanceDB raises a Warning for empty results
        assert "query results are empty" in str(empty_warning)

    # The second document must be untouched.
    beta_hits_after = await find("Beta")
    assert len(beta_hits_after) == 2
    assert all("Beta" in hit.chunk_text for hit in beta_hits_after)

    # Remove the second document; the store is now empty.
    await store.delete_nodes_by_document_id("document_2")
    assert await store.count_records() == 0

    # Deleting an id that was never stored must not raise or change anything.
    await store.delete_nodes_by_document_id("non_existent_document")
    assert await store.count_records() == 0
|
|
1388
|
+
|
|
1389
|
+
|
|
1390
|
+
@pytest.mark.asyncio
async def test_delete_nodes_by_document_id_empty_table(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
):
    """Deleting by document id before anything was indexed must not fail."""
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    # Nothing has been added yet; the call is expected to complete without raising.
    await store.delete_nodes_by_document_id("some_document_id")

    # The store still reports zero records afterwards.
    assert await store.count_records() == 0
|
|
1406
|
+
|
|
1407
|
+
|
|
1408
|
+
def generate_benchmark_data(
    doc_count: int,
    chunks_per_doc: int,
    vector_size: int,
    word_count: int,
    tmp_path: Path,
) -> list[DocumentWithChunksAndEmbeddings]:
    """Generate random documents with chunks and embeddings for benchmarking.

    For every document one random text of ``word_count`` words and one random
    ``vector_size``-dimensional vector are drawn; that same text and vector are
    reused for each of the document's ``chunks_per_doc`` chunks.  All randomness
    comes from the ``random`` module, so seeding it before the call yields the
    same data on every run.

    Args:
        doc_count: Number of documents to generate.
        chunks_per_doc: Number of chunks (and embeddings) per document.
        vector_size: Dimension of each embedding vector.
        word_count: Number of words in each chunk's text.
        tmp_path: Directory used to build the ``path`` of the generated models.

    Returns:
        One ``DocumentWithChunksAndEmbeddings`` per document, with ids
        ``doc_00000``, ``doc_00001``, ...
    """

    def generate_word_pool(target_size: int) -> list[str]:
        """Generate a pool of random words using common prefixes, roots, and suffixes."""
        prefixes = [
            "pre",
            "un",
            "re",
            "in",
            "dis",
            "en",
            "non",
            "over",
            "mis",
            "sub",
            "inter",
            "super",
            "anti",
            "semi",
            "multi",
            "auto",
            "co",
            "de",
            "ex",
            "pro",
        ]
        roots = [
            "act",
            "form",
            "port",
            "dict",
            "ject",
            "rupt",
            "scrib",
            "struct",
            "tract",
            "vert",
            "vis",
            "spect",
            "mit",
            "duc",
            "fac",
            "cap",
            "cred",
            "grad",
            "loc",
            "mov",
            "ped",
            "pend",
            "pos",
            "sect",
            "sent",
            "serv",
            "sign",
            "sist",
            "spec",
            "tain",
            "temp",
            "tend",
            "terr",
            "test",
            "text",
            "tort",
            "typ",
            "urb",
            "vac",
            "val",
            "ven",
            "vers",
            "vid",
            "voc",
            "volv",
        ]
        suffixes = [
            "tion",
            "sion",
            "ness",
            "ment",
            "able",
            "ible",
            "ful",
            "less",
            "ing",
            "ed",
            "er",
            "est",
            "ly",
            "ity",
            "ous",
            "ive",
            "al",
            "ic",
            "ical",
            "ary",
            "ory",
            "ure",
            "ade",
            "age",
            "ance",
            "ence",
            "dom",
            "hood",
            "ship",
            "ward",
            "wise",
            "like",
            "some",
            "teen",
            "ty",
            "th",
            "ish",
            "esque",
        ]

        # Every word the sampling loop below can ever produce.  The set of
        # combinations is finite, so a target at or above its size could never
        # be reached by sampling; hand back the whole universe instead of
        # looping forever.
        universe = {
            prefix + root + suffix
            for prefix in ("", *prefixes)
            for root in roots
            for suffix in ("", *suffixes)
        }
        if target_size >= len(universe):
            return sorted(universe)

        words: set[str] = set()

        # Generate combinations
        while len(words) < target_size:
            # Draw once per word so the thresholds below are real cumulative
            # probabilities (30% / 30% / 20% / 20%).
            roll = random.random()
            if roll < 0.3:
                # Simple root words
                words.add(random.choice(roots))
            elif roll < 0.6:
                # Prefix + root
                words.add(random.choice(prefixes) + random.choice(roots))
            elif roll < 0.8:
                # Root + suffix
                words.add(random.choice(roots) + random.choice(suffixes))
            else:
                # Prefix + root + suffix
                words.add(
                    random.choice(prefixes)
                    + random.choice(roots)
                    + random.choice(suffixes)
                )

        # Set iteration order for strings depends on hash randomization, so
        # sort to keep the pool (and everything sampled from it) reproducible
        # for a given random seed.
        return sorted(words)

    # Generate word pool that's ~25x the word_count for variety
    target_pool_size = max(
        word_count * 25, 100
    )  # At least 100 words, scale dictionary with word_count*25
    words = generate_word_pool(target_pool_size)

    results = []
    for i in range(doc_count):
        doc_id = f"doc_{i:05d}"

        # Generate random text (word_count words) - allow repetition for variety
        selected_words = random.choices(words, k=word_count)
        text_content = " ".join(selected_words)

        # Generate random vector_size-dimensional vector
        vector = [random.uniform(-1.0, 1.0) for _ in range(vector_size)]

        # Create chunked document with chunks_per_doc chunks, all sharing the
        # same text content
        chunked_document = ChunkedDocument(
            chunker_config_id="test_chunker",
            chunks=[
                Chunk(content=KilnAttachmentModel.from_data(text_content, "text/plain"))
                for _ in range(chunks_per_doc)
            ],
            path=tmp_path / f"chunked_document_{i}.kiln",
        )

        # Create chunk embeddings (one per chunk, all using the same vector)
        chunk_embeddings = ChunkEmbeddings(
            embedding_config_id="test_embedding",
            embeddings=[Embedding(vector=vector) for _ in range(chunks_per_doc)],
            path=tmp_path / f"chunk_embeddings_{i}.kiln",
        )

        results.append(
            DocumentWithChunksAndEmbeddings(
                document_id=doc_id,
                chunked_document=chunked_document,
                chunk_embeddings=chunk_embeddings,
            )
        )

    return results
|
|
1595
|
+
|
|
1596
|
+
|
|
1597
|
+
@pytest.mark.benchmark
# Not actually paid, but we want the "must be run manually" feature of the paid marker as this is very slow
@pytest.mark.paid
def test_benchmark_add_chunks(
    benchmark,
    hybrid_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """Benchmark adding chunks with embeddings to LanceDB.

    Generates doc_count * chunks_per_doc chunks, times a single call to
    ``add_chunks_with_embeddings``, checks that every chunk was stored, and
    fails if the throughput is below 2500 chunks per second.
    """

    doc_count = 1000
    chunks_per_doc = 50
    vector_size = 1024
    word_count = 200
    expected_records = doc_count * chunks_per_doc

    # Set random seed for reproducible results
    random.seed(42)

    # Generate random data items (this is not benchmarked)
    benchmark_data = generate_benchmark_data(
        doc_count, chunks_per_doc, vector_size, word_count, tmp_path
    )

    # Create RAG config and adapter (not benchmarked)
    rag_config = create_rag_config_factory(hybrid_vector_store_config, embedding_config)
    adapter = asyncio.run(
        vector_store_adapter_for_config(rag_config, hybrid_vector_store_config)
    )

    # Benchmark only the insertion of the chunks
    def add_chunks():
        return asyncio.run(adapter.add_chunks_with_embeddings(benchmark_data))

    # one iteration
    benchmark.pedantic(add_chunks, rounds=1, iterations=1)
    stats = benchmark.stats.stats

    # Verify that data was actually added
    final_count = asyncio.run(adapter.count_records())
    assert final_count == expected_records, (
        f"Expected {expected_records} records, got {final_count}"
    )

    # Expect min 2500 ops per second
    max_time = expected_records / 2500
    if stats.max > max_time:
        # Report the same statistic the check above is based on.
        pytest.fail(
            f"Max time per iteration: {stats.max:.4f}s, expected less than {max_time:.4f}s"
        )
|
|
1652
|
+
|
|
1653
|
+
|
|
1654
|
+
@pytest.mark.asyncio
async def test_delete_nodes_not_in_set_basic_functionality(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """delete_nodes_not_in_set keeps the listed document and removes the other one."""
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    # The fixture supplies doc_001 and doc_002, 4 chunks each.
    await store.add_chunks_with_embeddings(mock_chunked_documents)
    assert await store.count_records() == 8

    # Retain doc_001 only.
    await store.delete_nodes_not_in_set({"doc_001"})
    assert await store.count_records() == 4

    # Whatever is still found for "population" must belong to doc_001.
    kept_hits = await store.search(VectorStoreQuery(query_string="population"))
    assert len(kept_hits) > 0
    assert all(hit.document_id == "doc_001" for hit in kept_hits)

    # Nothing from doc_002 should be searchable any more.
    removed_hits = await store.search(VectorStoreQuery(query_string="area"))
    assert len(removed_hits) == 0
|
|
1688
|
+
|
|
1689
|
+
|
|
1690
|
+
@pytest.mark.asyncio
async def test_delete_nodes_not_in_set_empty_set(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """An empty keep-set passed to delete_nodes_not_in_set wipes every node."""
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    # Index both fixture documents and confirm they were stored.
    await store.add_chunks_with_embeddings(mock_chunked_documents)
    assert await store.count_records() == 8

    # Keeping nothing means deleting everything.
    await store.delete_nodes_not_in_set(set())

    assert await store.count_records() == 0
|
|
1715
|
+
|
|
1716
|
+
|
|
1717
|
+
@pytest.mark.asyncio
async def test_delete_nodes_not_in_set_complete_set(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """A keep-set naming every stored document leaves the store unchanged."""
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    # Index both fixture documents and confirm they were stored.
    await store.add_chunks_with_embeddings(mock_chunked_documents)
    assert await store.count_records() == 8

    # Every document id is in the keep-set, so nothing should be removed.
    await store.delete_nodes_not_in_set({"doc_001", "doc_002"})
    assert await store.count_records() == 8

    # Both documents must still produce search hits.
    population_hits = await store.search(VectorStoreQuery(query_string="population"))
    assert len(population_hits) > 0

    area_hits = await store.search(VectorStoreQuery(query_string="area"))
    assert len(area_hits) > 0
|
|
1749
|
+
|
|
1750
|
+
|
|
1751
|
+
@pytest.mark.asyncio
async def test_delete_nodes_not_in_set_partial_set(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
    tmp_path,
):
    """With three documents stored, keep two of them and drop the third."""
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    async def find(term):
        # Small helper: run a search for a single query string.
        return await store.search(VectorStoreQuery(query_string=term))

    # One single-chunk document per id; each text contains a unique number word.
    records = dicts_to_indexable_docs(
        {
            "keep_doc_1": [{"vector": [1.0, 1.0], "text": "Keep document one content"}],
            "delete_doc_2": [{"vector": [2.0, 2.0], "text": "Delete document two content"}],
            "keep_doc_3": [{"vector": [3.0, 3.0], "text": "Keep document three content"}],
        },
        tmp_path,
    )

    await store.add_chunks_with_embeddings(records)
    assert await store.count_records() == 3

    # Documents 1 and 3 survive, document 2 is removed.
    await store.delete_nodes_not_in_set({"keep_doc_1", "keep_doc_3"})
    assert await store.count_records() == 2

    # Each kept document is found exactly once via its unique word.
    hits_one = await find("one")
    assert len(hits_one) == 1
    assert hits_one[0].document_id == "keep_doc_1"

    hits_three = await find("three")
    assert len(hits_three) == 1
    assert hits_three[0].document_id == "keep_doc_3"

    # The dropped document no longer shows up.
    hits_two = await find("two")
    assert len(hits_two) == 0
|
|
1797
|
+
|
|
1798
|
+
|
|
1799
|
+
@pytest.mark.asyncio
async def test_delete_nodes_not_in_set_uninitialized_table(
    fts_vector_store_config,
    embedding_config,
    create_rag_config_factory,
):
    """delete_nodes_not_in_set raises TableNotFoundError when no data was ever added."""
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    # No chunks are added on purpose, so the call below is expected to fail
    # with the "not initialized" error.
    with pytest.raises(TableNotFoundError, match="Table vectors is not initialized"):
        await store.delete_nodes_not_in_set({"doc_001"})
|
|
1813
|
+
|
|
1814
|
+
|
|
1815
|
+
@pytest.mark.asyncio
async def test_delete_nodes_not_in_set_empty_table(
    fts_vector_store_config,
    mock_chunked_documents,
    embedding_config,
    create_rag_config_factory,
):
    """delete_nodes_not_in_set on an existing but empty table completes quietly."""
    store = LanceDBAdapter(
        create_rag_config_factory(fts_vector_store_config, embedding_config),
        fts_vector_store_config,
    )

    # Adding data creates the table; an empty keep-set then removes every row,
    # leaving a table that exists but holds nothing.
    await store.add_chunks_with_embeddings(mock_chunked_documents)
    await store.delete_nodes_not_in_set(set())  # Empty set deletes everything
    assert await store.count_records() == 0

    # A second delete against the now-empty table should not error.
    await store.delete_nodes_not_in_set({"doc_001"})

    assert await store.count_records() == 0
|