kiln-ai 0.22.0__py3-none-any.whl → 0.22.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/model_adapters/litellm_adapter.py +6 -2
- kiln_ai/adapters/vector_store/lancedb_adapter.py +24 -70
- kiln_ai/adapters/vector_store/lancedb_helpers.py +101 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +9 -16
- kiln_ai/adapters/vector_store/test_lancedb_helpers.py +142 -0
- kiln_ai/adapters/vector_store_loaders/__init__.py +0 -0
- kiln_ai/adapters/vector_store_loaders/test_lancedb_loader.py +282 -0
- kiln_ai/adapters/vector_store_loaders/test_vector_store_loader.py +544 -0
- kiln_ai/adapters/vector_store_loaders/vector_store_loader.py +91 -0
- kiln_ai/datamodel/tool_id.py +13 -0
- kiln_ai/tools/base_tool.py +18 -3
- kiln_ai/tools/kiln_task_tool.py +6 -2
- kiln_ai/tools/mcp_server_tool.py +6 -4
- kiln_ai/tools/rag_tools.py +7 -3
- {kiln_ai-0.22.0.dist-info → kiln_ai-0.22.1.dist-info}/METADATA +77 -1
- {kiln_ai-0.22.0.dist-info → kiln_ai-0.22.1.dist-info}/RECORD +18 -12
- {kiln_ai-0.22.0.dist-info → kiln_ai-0.22.1.dist-info}/WHEEL +0 -0
- {kiln_ai-0.22.0.dist-info → kiln_ai-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -31,7 +31,11 @@ from kiln_ai.adapters.model_adapters.base_adapter import (
|
|
|
31
31
|
)
|
|
32
32
|
from kiln_ai.adapters.model_adapters.litellm_config import LiteLlmConfig
|
|
33
33
|
from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
|
|
34
|
-
from kiln_ai.tools.base_tool import
|
|
34
|
+
from kiln_ai.tools.base_tool import (
|
|
35
|
+
KilnToolInterface,
|
|
36
|
+
ToolCallContext,
|
|
37
|
+
ToolCallDefinition,
|
|
38
|
+
)
|
|
35
39
|
from kiln_ai.tools.kiln_task_tool import KilnTaskToolResult
|
|
36
40
|
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
|
|
37
41
|
from kiln_ai.utils.litellm import get_litellm_provider_info
|
|
@@ -560,7 +564,7 @@ class LiteLlmAdapter(BaseAdapter):
|
|
|
560
564
|
self._cached_available_tools = await self.available_tools()
|
|
561
565
|
return self._cached_available_tools
|
|
562
566
|
|
|
563
|
-
async def litellm_tools(self) -> list[
|
|
567
|
+
async def litellm_tools(self) -> list[ToolCallDefinition]:
|
|
564
568
|
available_tools = await self.cached_available_tools()
|
|
565
569
|
|
|
566
570
|
# LiteLLM takes the standard OpenAI-compatible tool call format
|
|
@@ -5,12 +5,7 @@ from pathlib import Path
|
|
|
5
5
|
from typing import Any, Dict, List, Literal, Optional, Set, TypedDict
|
|
6
6
|
|
|
7
7
|
from llama_index.core import StorageContext, VectorStoreIndex
|
|
8
|
-
from llama_index.core.schema import
|
|
9
|
-
BaseNode,
|
|
10
|
-
NodeRelationship,
|
|
11
|
-
RelatedNodeInfo,
|
|
12
|
-
TextNode,
|
|
13
|
-
)
|
|
8
|
+
from llama_index.core.schema import BaseNode, TextNode
|
|
14
9
|
from llama_index.core.vector_stores.types import (
|
|
15
10
|
VectorStoreQuery as LlamaIndexVectorStoreQuery,
|
|
16
11
|
)
|
|
@@ -24,15 +19,19 @@ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
|
|
|
24
19
|
SearchResult,
|
|
25
20
|
VectorStoreQuery,
|
|
26
21
|
)
|
|
22
|
+
from kiln_ai.adapters.vector_store.lancedb_helpers import (
|
|
23
|
+
convert_to_llama_index_node,
|
|
24
|
+
deterministic_chunk_id,
|
|
25
|
+
lancedb_construct_from_config,
|
|
26
|
+
store_type_to_lancedb_query_type,
|
|
27
|
+
)
|
|
27
28
|
from kiln_ai.datamodel.rag import RagConfig
|
|
28
29
|
from kiln_ai.datamodel.vector_store import (
|
|
29
30
|
VectorStoreConfig,
|
|
30
|
-
VectorStoreType,
|
|
31
31
|
raise_exhaustive_enum_error,
|
|
32
32
|
)
|
|
33
33
|
from kiln_ai.utils.config import Config
|
|
34
34
|
from kiln_ai.utils.env import temporary_env
|
|
35
|
-
from kiln_ai.utils.uuid import string_to_uuid
|
|
36
35
|
|
|
37
36
|
logger = logging.getLogger(__name__)
|
|
38
37
|
|
|
@@ -48,6 +47,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
|
48
47
|
self,
|
|
49
48
|
rag_config: RagConfig,
|
|
50
49
|
vector_store_config: VectorStoreConfig,
|
|
50
|
+
lancedb_vector_store: LanceDBVectorStore | None = None,
|
|
51
51
|
):
|
|
52
52
|
super().__init__(rag_config, vector_store_config)
|
|
53
53
|
self.config_properties = self.vector_store_config.lancedb_properties
|
|
@@ -56,17 +56,15 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
|
56
56
|
if vector_store_config.lancedb_properties.nprobes is not None:
|
|
57
57
|
kwargs["nprobes"] = vector_store_config.lancedb_properties.nprobes
|
|
58
58
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
**kwargs,
|
|
59
|
+
# allow overriding the vector store with a custom one, useful for user loading into an arbitrary
|
|
60
|
+
# deployment
|
|
61
|
+
self.lancedb_vector_store = (
|
|
62
|
+
lancedb_vector_store
|
|
63
|
+
or lancedb_construct_from_config(
|
|
64
|
+
vector_store_config,
|
|
65
|
+
uri=LanceDBAdapter.lancedb_path_for_config(rag_config),
|
|
66
|
+
)
|
|
68
67
|
)
|
|
69
|
-
|
|
70
68
|
self._index = None
|
|
71
69
|
|
|
72
70
|
@property
|
|
@@ -149,7 +147,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
|
149
147
|
|
|
150
148
|
chunk_count_for_document = len(chunks)
|
|
151
149
|
deterministic_chunk_ids = [
|
|
152
|
-
|
|
150
|
+
deterministic_chunk_id(document_id, chunk_idx)
|
|
153
151
|
for chunk_idx in range(chunk_count_for_document)
|
|
154
152
|
]
|
|
155
153
|
|
|
@@ -176,42 +174,12 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
|
176
174
|
zip(chunks_text, embeddings)
|
|
177
175
|
):
|
|
178
176
|
node_batch.append(
|
|
179
|
-
|
|
180
|
-
|
|
177
|
+
convert_to_llama_index_node(
|
|
178
|
+
document_id=document_id,
|
|
179
|
+
chunk_idx=chunk_idx,
|
|
180
|
+
node_id=deterministic_chunk_id(document_id, chunk_idx),
|
|
181
181
|
text=chunk_text,
|
|
182
|
-
|
|
183
|
-
metadata={
|
|
184
|
-
# metadata is populated by some internal llama_index logic
|
|
185
|
-
# that uses for example the source_node relationship
|
|
186
|
-
"kiln_doc_id": document_id,
|
|
187
|
-
"kiln_chunk_idx": chunk_idx,
|
|
188
|
-
#
|
|
189
|
-
# llama_index lancedb vector store automatically sets these metadata:
|
|
190
|
-
# "doc_id": "UUID node_id of the Source Node relationship",
|
|
191
|
-
# "document_id": "UUID node_id of the Source Node relationship",
|
|
192
|
-
# "ref_doc_id": "UUID node_id of the Source Node relationship"
|
|
193
|
-
#
|
|
194
|
-
# llama_index file loaders set these metadata, which would be useful to also support:
|
|
195
|
-
# "creation_date": "2025-09-03",
|
|
196
|
-
# "file_name": "file.pdf",
|
|
197
|
-
# "file_path": "/absolute/path/to/the/file.pdf",
|
|
198
|
-
# "file_size": 395154,
|
|
199
|
-
# "file_type": "application\/pdf",
|
|
200
|
-
# "last_modified_date": "2025-09-03",
|
|
201
|
-
# "page_label": "1",
|
|
202
|
-
},
|
|
203
|
-
relationships={
|
|
204
|
-
# when using the llama_index loaders, llama_index groups Nodes under Documents
|
|
205
|
-
# and relationships point to the Document (which is also a Node), which confusingly
|
|
206
|
-
# enough does not map to an actual file (for a PDF, a Document is a page of the PDF)
|
|
207
|
-
# the Document structure is not something that is persisted, so it is fine here
|
|
208
|
-
# if we have a relationship to a node_id that does not exist in the db
|
|
209
|
-
NodeRelationship.SOURCE: RelatedNodeInfo(
|
|
210
|
-
node_id=document_id,
|
|
211
|
-
node_type="1",
|
|
212
|
-
metadata={},
|
|
213
|
-
),
|
|
214
|
-
},
|
|
182
|
+
vector=embedding.vector,
|
|
215
183
|
)
|
|
216
184
|
)
|
|
217
185
|
|
|
@@ -330,10 +298,6 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
|
330
298
|
return []
|
|
331
299
|
raise
|
|
332
300
|
|
|
333
|
-
def compute_deterministic_chunk_id(self, document_id: str, chunk_idx: int) -> str:
|
|
334
|
-
# the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
|
|
335
|
-
return str(string_to_uuid(f"{document_id}::{chunk_idx}"))
|
|
336
|
-
|
|
337
301
|
async def count_records(self) -> int:
|
|
338
302
|
try:
|
|
339
303
|
table = self.lancedb_vector_store.table
|
|
@@ -346,15 +310,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
|
346
310
|
|
|
347
311
|
@property
|
|
348
312
|
def query_type(self) -> Literal["fts", "hybrid", "vector"]:
|
|
349
|
-
|
|
350
|
-
case VectorStoreType.LANCE_DB_FTS:
|
|
351
|
-
return "fts"
|
|
352
|
-
case VectorStoreType.LANCE_DB_HYBRID:
|
|
353
|
-
return "hybrid"
|
|
354
|
-
case VectorStoreType.LANCE_DB_VECTOR:
|
|
355
|
-
return "vector"
|
|
356
|
-
case _:
|
|
357
|
-
raise_exhaustive_enum_error(self.vector_store_config.store_type)
|
|
313
|
+
return store_type_to_lancedb_query_type(self.vector_store_config.store_type)
|
|
358
314
|
|
|
359
315
|
@staticmethod
|
|
360
316
|
def lancedb_path_for_config(rag_config: RagConfig) -> str:
|
|
@@ -380,9 +336,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
|
380
336
|
kiln_doc_id = row["metadata"]["kiln_doc_id"]
|
|
381
337
|
if kiln_doc_id not in document_ids:
|
|
382
338
|
kiln_chunk_idx = row["metadata"]["kiln_chunk_idx"]
|
|
383
|
-
record_id =
|
|
384
|
-
kiln_doc_id, kiln_chunk_idx
|
|
385
|
-
)
|
|
339
|
+
record_id = deterministic_chunk_id(kiln_doc_id, kiln_chunk_idx)
|
|
386
340
|
rows_to_delete.append(record_id)
|
|
387
341
|
|
|
388
342
|
if rows_to_delete:
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Literal
|
|
2
|
+
|
|
3
|
+
from llama_index.core.schema import NodeRelationship, RelatedNodeInfo, TextNode
|
|
4
|
+
from llama_index.vector_stores.lancedb import LanceDBVectorStore
|
|
5
|
+
|
|
6
|
+
from kiln_ai.datamodel.vector_store import (
|
|
7
|
+
VectorStoreConfig,
|
|
8
|
+
VectorStoreType,
|
|
9
|
+
raise_exhaustive_enum_error,
|
|
10
|
+
)
|
|
11
|
+
from kiln_ai.utils.uuid import string_to_uuid
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def store_type_to_lancedb_query_type(
|
|
15
|
+
store_type: VectorStoreType,
|
|
16
|
+
) -> Literal["fts", "hybrid", "vector"]:
|
|
17
|
+
match store_type:
|
|
18
|
+
case VectorStoreType.LANCE_DB_FTS:
|
|
19
|
+
return "fts"
|
|
20
|
+
case VectorStoreType.LANCE_DB_HYBRID:
|
|
21
|
+
return "hybrid"
|
|
22
|
+
case VectorStoreType.LANCE_DB_VECTOR:
|
|
23
|
+
return "vector"
|
|
24
|
+
case _:
|
|
25
|
+
raise_exhaustive_enum_error(store_type)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def lancedb_construct_from_config(
|
|
29
|
+
vector_store_config: VectorStoreConfig,
|
|
30
|
+
uri: str,
|
|
31
|
+
**extra_params: Any,
|
|
32
|
+
) -> LanceDBVectorStore:
|
|
33
|
+
"""Construct a LanceDBVectorStore from a VectorStoreConfig."""
|
|
34
|
+
kwargs: Dict[str, Any] = {**extra_params}
|
|
35
|
+
if (
|
|
36
|
+
vector_store_config.lancedb_properties.nprobes is not None
|
|
37
|
+
and "nprobes" not in kwargs
|
|
38
|
+
):
|
|
39
|
+
kwargs["nprobes"] = vector_store_config.lancedb_properties.nprobes
|
|
40
|
+
|
|
41
|
+
return LanceDBVectorStore(
|
|
42
|
+
mode="create",
|
|
43
|
+
query_type=store_type_to_lancedb_query_type(vector_store_config.store_type),
|
|
44
|
+
overfetch_factor=vector_store_config.lancedb_properties.overfetch_factor,
|
|
45
|
+
vector_column_name=vector_store_config.lancedb_properties.vector_column_name,
|
|
46
|
+
text_key=vector_store_config.lancedb_properties.text_key,
|
|
47
|
+
doc_id_key=vector_store_config.lancedb_properties.doc_id_key,
|
|
48
|
+
uri=uri,
|
|
49
|
+
**kwargs,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def convert_to_llama_index_node(
|
|
54
|
+
document_id: str,
|
|
55
|
+
chunk_idx: int,
|
|
56
|
+
node_id: str,
|
|
57
|
+
text: str,
|
|
58
|
+
vector: List[float],
|
|
59
|
+
) -> TextNode:
|
|
60
|
+
return TextNode(
|
|
61
|
+
id_=node_id,
|
|
62
|
+
text=text,
|
|
63
|
+
embedding=vector,
|
|
64
|
+
metadata={
|
|
65
|
+
# metadata is populated by some internal llama_index logic
|
|
66
|
+
# that uses for example the source_node relationship
|
|
67
|
+
"kiln_doc_id": document_id,
|
|
68
|
+
"kiln_chunk_idx": chunk_idx,
|
|
69
|
+
#
|
|
70
|
+
# llama_index lancedb vector store automatically sets these metadata:
|
|
71
|
+
# "doc_id": "UUID node_id of the Source Node relationship",
|
|
72
|
+
# "document_id": "UUID node_id of the Source Node relationship",
|
|
73
|
+
# "ref_doc_id": "UUID node_id of the Source Node relationship"
|
|
74
|
+
#
|
|
75
|
+
# llama_index file loaders set these metadata, which would be useful to also support:
|
|
76
|
+
# "creation_date": "2025-09-03",
|
|
77
|
+
# "file_name": "file.pdf",
|
|
78
|
+
# "file_path": "/absolute/path/to/the/file.pdf",
|
|
79
|
+
# "file_size": 395154,
|
|
80
|
+
# "file_type": "application\/pdf",
|
|
81
|
+
# "last_modified_date": "2025-09-03",
|
|
82
|
+
# "page_label": "1",
|
|
83
|
+
},
|
|
84
|
+
relationships={
|
|
85
|
+
# when using the llama_index loaders, llama_index groups Nodes under Documents
|
|
86
|
+
# and relationships point to the Document (which is also a Node), which confusingly
|
|
87
|
+
# enough does not map to an actual file (for a PDF, a Document is a page of the PDF)
|
|
88
|
+
# the Document structure is not something that is persisted, so it is fine here
|
|
89
|
+
# if we have a relationship to a node_id that does not exist in the db
|
|
90
|
+
NodeRelationship.SOURCE: RelatedNodeInfo(
|
|
91
|
+
node_id=document_id,
|
|
92
|
+
node_type="1",
|
|
93
|
+
metadata={},
|
|
94
|
+
),
|
|
95
|
+
},
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def deterministic_chunk_id(document_id: str, chunk_idx: int) -> str:
|
|
100
|
+
# the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
|
|
101
|
+
return str(string_to_uuid(f"{document_id}::{chunk_idx}"))
|
|
@@ -17,6 +17,7 @@ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
|
|
|
17
17
|
VectorStoreQuery,
|
|
18
18
|
)
|
|
19
19
|
from kiln_ai.adapters.vector_store.lancedb_adapter import LanceDBAdapter
|
|
20
|
+
from kiln_ai.adapters.vector_store.lancedb_helpers import deterministic_chunk_id
|
|
20
21
|
from kiln_ai.adapters.vector_store.vector_store_registry import (
|
|
21
22
|
vector_store_adapter_for_config,
|
|
22
23
|
)
|
|
@@ -925,9 +926,7 @@ async def test_get_nodes_by_ids_functionality(
|
|
|
925
926
|
await adapter.add_chunks_with_embeddings([mock_chunked_documents[0]]) # doc_001
|
|
926
927
|
|
|
927
928
|
# Test getting nodes by IDs - compute expected IDs
|
|
928
|
-
expected_ids = [
|
|
929
|
-
adapter.compute_deterministic_chunk_id("doc_001", i) for i in range(4)
|
|
930
|
-
]
|
|
929
|
+
expected_ids = [deterministic_chunk_id("doc_001", i) for i in range(4)]
|
|
931
930
|
|
|
932
931
|
# Get nodes by IDs
|
|
933
932
|
retrieved_nodes = await adapter.get_nodes_by_ids(expected_ids)
|
|
@@ -943,7 +942,7 @@ async def test_get_nodes_by_ids_functionality(
|
|
|
943
942
|
assert len(node.get_content()) > 0
|
|
944
943
|
|
|
945
944
|
# Test with non-existent IDs
|
|
946
|
-
fake_ids = [
|
|
945
|
+
fake_ids = [deterministic_chunk_id("fake_doc", i) for i in range(2)]
|
|
947
946
|
retrieved_fake = await adapter.get_nodes_by_ids(fake_ids)
|
|
948
947
|
assert len(retrieved_fake) == 0
|
|
949
948
|
|
|
@@ -1019,7 +1018,7 @@ async def test_uuid_scheme_retrieval_and_node_properties(
|
|
|
1019
1018
|
# Test the UUID scheme: document_id::chunk_idx
|
|
1020
1019
|
for chunk_idx in range(4):
|
|
1021
1020
|
# Compute expected ID using the same scheme as the adapter
|
|
1022
|
-
expected_id =
|
|
1021
|
+
expected_id = deterministic_chunk_id("doc_001", chunk_idx)
|
|
1023
1022
|
|
|
1024
1023
|
# Retrieve the specific node by ID
|
|
1025
1024
|
retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
|
|
@@ -1053,7 +1052,7 @@ async def test_uuid_scheme_retrieval_and_node_properties(
|
|
|
1053
1052
|
|
|
1054
1053
|
# Test retrieval of doc_002 chunks
|
|
1055
1054
|
for chunk_idx in range(4):
|
|
1056
|
-
expected_id =
|
|
1055
|
+
expected_id = deterministic_chunk_id("doc_002", chunk_idx)
|
|
1057
1056
|
retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
|
|
1058
1057
|
assert len(retrieved_nodes) == 1
|
|
1059
1058
|
|
|
@@ -1080,25 +1079,19 @@ async def test_deterministic_chunk_id_consistency(
|
|
|
1080
1079
|
create_rag_config_factory,
|
|
1081
1080
|
):
|
|
1082
1081
|
"""Test that the deterministic chunk ID generation is consistent."""
|
|
1083
|
-
rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
|
|
1084
|
-
|
|
1085
|
-
adapter = LanceDBAdapter(
|
|
1086
|
-
rag_config,
|
|
1087
|
-
fts_vector_store_config,
|
|
1088
|
-
)
|
|
1089
1082
|
|
|
1090
1083
|
# Test that the same document_id and chunk_idx always produce the same UUID
|
|
1091
1084
|
doc_id = "test_doc_123"
|
|
1092
1085
|
chunk_idx = 5
|
|
1093
1086
|
|
|
1094
|
-
id1 =
|
|
1095
|
-
id2 =
|
|
1087
|
+
id1 = deterministic_chunk_id(doc_id, chunk_idx)
|
|
1088
|
+
id2 = deterministic_chunk_id(doc_id, chunk_idx)
|
|
1096
1089
|
|
|
1097
1090
|
assert id1 == id2
|
|
1098
1091
|
|
|
1099
1092
|
# Test that different inputs produce different UUIDs
|
|
1100
|
-
id3 =
|
|
1101
|
-
id4 =
|
|
1093
|
+
id3 = deterministic_chunk_id(doc_id, chunk_idx + 1)
|
|
1094
|
+
id4 = deterministic_chunk_id(doc_id + "_different", chunk_idx)
|
|
1102
1095
|
|
|
1103
1096
|
assert id1 != id3
|
|
1104
1097
|
assert id1 != id4
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from unittest.mock import patch
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from kiln_ai.adapters.vector_store.lancedb_helpers import (
|
|
6
|
+
convert_to_llama_index_node,
|
|
7
|
+
deterministic_chunk_id,
|
|
8
|
+
lancedb_construct_from_config,
|
|
9
|
+
store_type_to_lancedb_query_type,
|
|
10
|
+
)
|
|
11
|
+
from kiln_ai.datamodel.vector_store import VectorStoreConfig, VectorStoreType
|
|
12
|
+
from kiln_ai.utils.uuid import string_to_uuid
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class _FakeLanceDBVectorStore:
|
|
16
|
+
def __init__(self, **kwargs):
|
|
17
|
+
self.kwargs = kwargs
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _base_properties(nprobes: int | None = None) -> dict[str, str | int | float | None]:
|
|
21
|
+
props: dict[str, str | int | float | None] = {
|
|
22
|
+
"similarity_top_k": 5,
|
|
23
|
+
"overfetch_factor": 2,
|
|
24
|
+
"vector_column_name": "vec",
|
|
25
|
+
"text_key": "text",
|
|
26
|
+
"doc_id_key": "doc_id",
|
|
27
|
+
}
|
|
28
|
+
if nprobes is not None:
|
|
29
|
+
props["nprobes"] = nprobes
|
|
30
|
+
return props
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _make_config(
|
|
34
|
+
store_type: VectorStoreType, nprobes: int | None = None
|
|
35
|
+
) -> VectorStoreConfig:
|
|
36
|
+
return VectorStoreConfig(
|
|
37
|
+
name="test_store",
|
|
38
|
+
description=None,
|
|
39
|
+
store_type=store_type,
|
|
40
|
+
properties=_base_properties(nprobes),
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_store_type_to_lancedb_query_type_mapping():
|
|
45
|
+
assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_FTS) == "fts"
|
|
46
|
+
assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_HYBRID) == "hybrid"
|
|
47
|
+
assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_VECTOR) == "vector"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_store_type_to_lancedb_query_type_unsupported_raises():
|
|
51
|
+
with pytest.raises(Exception):
|
|
52
|
+
store_type_to_lancedb_query_type("unsupported") # type: ignore[arg-type]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_lancedb_construct_from_config_includes_nprobes():
|
|
56
|
+
with patch(
|
|
57
|
+
"kiln_ai.adapters.vector_store.lancedb_helpers.LanceDBVectorStore",
|
|
58
|
+
new=_FakeLanceDBVectorStore,
|
|
59
|
+
):
|
|
60
|
+
cfg = _make_config(VectorStoreType.LANCE_DB_VECTOR, nprobes=7)
|
|
61
|
+
|
|
62
|
+
result = lancedb_construct_from_config(
|
|
63
|
+
vector_store_config=cfg,
|
|
64
|
+
uri="memory://",
|
|
65
|
+
api_key="k",
|
|
66
|
+
region="r",
|
|
67
|
+
table_name="t",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
assert isinstance(result, _FakeLanceDBVectorStore)
|
|
71
|
+
kwargs = result.kwargs
|
|
72
|
+
|
|
73
|
+
assert kwargs["mode"] == "create"
|
|
74
|
+
assert kwargs["uri"] == "memory://"
|
|
75
|
+
assert kwargs["query_type"] == "vector"
|
|
76
|
+
assert kwargs["overfetch_factor"] == 2
|
|
77
|
+
assert kwargs["vector_column_name"] == "vec"
|
|
78
|
+
assert kwargs["text_key"] == "text"
|
|
79
|
+
assert kwargs["doc_id_key"] == "doc_id"
|
|
80
|
+
assert kwargs["api_key"] == "k"
|
|
81
|
+
assert kwargs["region"] == "r"
|
|
82
|
+
assert kwargs["table_name"] == "t"
|
|
83
|
+
# extra optional kwarg present when provided
|
|
84
|
+
assert kwargs["nprobes"] == 7
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_lancedb_construct_from_config_omits_nprobes_when_none():
|
|
88
|
+
with patch(
|
|
89
|
+
"kiln_ai.adapters.vector_store.lancedb_helpers.LanceDBVectorStore",
|
|
90
|
+
new=_FakeLanceDBVectorStore,
|
|
91
|
+
):
|
|
92
|
+
cfg = _make_config(VectorStoreType.LANCE_DB_FTS, nprobes=None)
|
|
93
|
+
|
|
94
|
+
result = lancedb_construct_from_config(
|
|
95
|
+
vector_store_config=cfg,
|
|
96
|
+
uri="memory://",
|
|
97
|
+
api_key=None,
|
|
98
|
+
region=None,
|
|
99
|
+
table_name=None,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
assert isinstance(result, _FakeLanceDBVectorStore)
|
|
103
|
+
kwargs = result.kwargs
|
|
104
|
+
|
|
105
|
+
assert kwargs["query_type"] == "fts"
|
|
106
|
+
assert "nprobes" not in kwargs
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_convert_to_llama_index_node_builds_expected_structure():
|
|
110
|
+
node = convert_to_llama_index_node(
|
|
111
|
+
document_id="doc-123",
|
|
112
|
+
chunk_idx=0,
|
|
113
|
+
node_id="11111111-1111-5111-8111-111111111111",
|
|
114
|
+
text="hello",
|
|
115
|
+
vector=[0.1, 0.2],
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
assert node.id_ == "11111111-1111-5111-8111-111111111111"
|
|
119
|
+
assert node.text == "hello"
|
|
120
|
+
assert node.embedding == [0.1, 0.2]
|
|
121
|
+
assert node.metadata["kiln_doc_id"] == "doc-123"
|
|
122
|
+
assert node.metadata["kiln_chunk_idx"] == 0
|
|
123
|
+
|
|
124
|
+
# relationship exists and points to the source document id
|
|
125
|
+
from llama_index.core.schema import NodeRelationship, RelatedNodeInfo
|
|
126
|
+
|
|
127
|
+
assert NodeRelationship.SOURCE in node.relationships
|
|
128
|
+
related = node.relationships[NodeRelationship.SOURCE]
|
|
129
|
+
assert isinstance(related, RelatedNodeInfo)
|
|
130
|
+
assert related.node_id == "doc-123"
|
|
131
|
+
assert related.node_type == "1"
|
|
132
|
+
assert isinstance(related.metadata, dict)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def test_deterministic_chunk_id_uses_uuid_v5_namespace():
|
|
136
|
+
doc_id = "doc-abc"
|
|
137
|
+
idx = 3
|
|
138
|
+
expected = str(string_to_uuid(f"{doc_id}::{idx}"))
|
|
139
|
+
assert deterministic_chunk_id(doc_id, idx) == expected
|
|
140
|
+
|
|
141
|
+
# call again to ensure the same value is returned
|
|
142
|
+
assert deterministic_chunk_id(doc_id, idx) == expected
|
|
File without changes
|