kiln-ai 0.22.0__py3-none-any.whl → 0.22.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic; see the package registry's advisory page for more details.

@@ -31,7 +31,11 @@ from kiln_ai.adapters.model_adapters.base_adapter import (
31
31
  )
32
32
  from kiln_ai.adapters.model_adapters.litellm_config import LiteLlmConfig
33
33
  from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
34
- from kiln_ai.tools.base_tool import KilnToolInterface, ToolCallContext
34
+ from kiln_ai.tools.base_tool import (
35
+ KilnToolInterface,
36
+ ToolCallContext,
37
+ ToolCallDefinition,
38
+ )
35
39
  from kiln_ai.tools.kiln_task_tool import KilnTaskToolResult
36
40
  from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
37
41
  from kiln_ai.utils.litellm import get_litellm_provider_info
@@ -560,7 +564,7 @@ class LiteLlmAdapter(BaseAdapter):
560
564
  self._cached_available_tools = await self.available_tools()
561
565
  return self._cached_available_tools
562
566
 
563
- async def litellm_tools(self) -> list[Dict]:
567
+ async def litellm_tools(self) -> list[ToolCallDefinition]:
564
568
  available_tools = await self.cached_available_tools()
565
569
 
566
570
  # LiteLLM takes the standard OpenAI-compatible tool call format
@@ -5,12 +5,7 @@ from pathlib import Path
5
5
  from typing import Any, Dict, List, Literal, Optional, Set, TypedDict
6
6
 
7
7
  from llama_index.core import StorageContext, VectorStoreIndex
8
- from llama_index.core.schema import (
9
- BaseNode,
10
- NodeRelationship,
11
- RelatedNodeInfo,
12
- TextNode,
13
- )
8
+ from llama_index.core.schema import BaseNode, TextNode
14
9
  from llama_index.core.vector_stores.types import (
15
10
  VectorStoreQuery as LlamaIndexVectorStoreQuery,
16
11
  )
@@ -24,15 +19,19 @@ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
24
19
  SearchResult,
25
20
  VectorStoreQuery,
26
21
  )
22
+ from kiln_ai.adapters.vector_store.lancedb_helpers import (
23
+ convert_to_llama_index_node,
24
+ deterministic_chunk_id,
25
+ lancedb_construct_from_config,
26
+ store_type_to_lancedb_query_type,
27
+ )
27
28
  from kiln_ai.datamodel.rag import RagConfig
28
29
  from kiln_ai.datamodel.vector_store import (
29
30
  VectorStoreConfig,
30
- VectorStoreType,
31
31
  raise_exhaustive_enum_error,
32
32
  )
33
33
  from kiln_ai.utils.config import Config
34
34
  from kiln_ai.utils.env import temporary_env
35
- from kiln_ai.utils.uuid import string_to_uuid
36
35
 
37
36
  logger = logging.getLogger(__name__)
38
37
 
@@ -48,6 +47,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
48
47
  self,
49
48
  rag_config: RagConfig,
50
49
  vector_store_config: VectorStoreConfig,
50
+ lancedb_vector_store: LanceDBVectorStore | None = None,
51
51
  ):
52
52
  super().__init__(rag_config, vector_store_config)
53
53
  self.config_properties = self.vector_store_config.lancedb_properties
@@ -56,17 +56,15 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
56
56
  if vector_store_config.lancedb_properties.nprobes is not None:
57
57
  kwargs["nprobes"] = vector_store_config.lancedb_properties.nprobes
58
58
 
59
- self.lancedb_vector_store = LanceDBVectorStore(
60
- mode="create",
61
- uri=LanceDBAdapter.lancedb_path_for_config(rag_config),
62
- query_type=self.query_type,
63
- overfetch_factor=vector_store_config.lancedb_properties.overfetch_factor,
64
- vector_column_name=vector_store_config.lancedb_properties.vector_column_name,
65
- text_key=vector_store_config.lancedb_properties.text_key,
66
- doc_id_key=vector_store_config.lancedb_properties.doc_id_key,
67
- **kwargs,
59
+ # allow overriding the vector store with a custom one, useful for user loading into an arbitrary
60
+ # deployment
61
+ self.lancedb_vector_store = (
62
+ lancedb_vector_store
63
+ or lancedb_construct_from_config(
64
+ vector_store_config,
65
+ uri=LanceDBAdapter.lancedb_path_for_config(rag_config),
66
+ )
68
67
  )
69
-
70
68
  self._index = None
71
69
 
72
70
  @property
@@ -149,7 +147,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
149
147
 
150
148
  chunk_count_for_document = len(chunks)
151
149
  deterministic_chunk_ids = [
152
- self.compute_deterministic_chunk_id(document_id, chunk_idx)
150
+ deterministic_chunk_id(document_id, chunk_idx)
153
151
  for chunk_idx in range(chunk_count_for_document)
154
152
  ]
155
153
 
@@ -176,42 +174,12 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
176
174
  zip(chunks_text, embeddings)
177
175
  ):
178
176
  node_batch.append(
179
- TextNode(
180
- id_=deterministic_chunk_ids[chunk_idx],
177
+ convert_to_llama_index_node(
178
+ document_id=document_id,
179
+ chunk_idx=chunk_idx,
180
+ node_id=deterministic_chunk_id(document_id, chunk_idx),
181
181
  text=chunk_text,
182
- embedding=embedding.vector,
183
- metadata={
184
- # metadata is populated by some internal llama_index logic
185
- # that uses for example the source_node relationship
186
- "kiln_doc_id": document_id,
187
- "kiln_chunk_idx": chunk_idx,
188
- #
189
- # llama_index lancedb vector store automatically sets these metadata:
190
- # "doc_id": "UUID node_id of the Source Node relationship",
191
- # "document_id": "UUID node_id of the Source Node relationship",
192
- # "ref_doc_id": "UUID node_id of the Source Node relationship"
193
- #
194
- # llama_index file loaders set these metadata, which would be useful to also support:
195
- # "creation_date": "2025-09-03",
196
- # "file_name": "file.pdf",
197
- # "file_path": "/absolute/path/to/the/file.pdf",
198
- # "file_size": 395154,
199
- # "file_type": "application\/pdf",
200
- # "last_modified_date": "2025-09-03",
201
- # "page_label": "1",
202
- },
203
- relationships={
204
- # when using the llama_index loaders, llama_index groups Nodes under Documents
205
- # and relationships point to the Document (which is also a Node), which confusingly
206
- # enough does not map to an actual file (for a PDF, a Document is a page of the PDF)
207
- # the Document structure is not something that is persisted, so it is fine here
208
- # if we have a relationship to a node_id that does not exist in the db
209
- NodeRelationship.SOURCE: RelatedNodeInfo(
210
- node_id=document_id,
211
- node_type="1",
212
- metadata={},
213
- ),
214
- },
182
+ vector=embedding.vector,
215
183
  )
216
184
  )
217
185
 
@@ -330,10 +298,6 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
330
298
  return []
331
299
  raise
332
300
 
333
- def compute_deterministic_chunk_id(self, document_id: str, chunk_idx: int) -> str:
334
- # the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
335
- return str(string_to_uuid(f"{document_id}::{chunk_idx}"))
336
-
337
301
  async def count_records(self) -> int:
338
302
  try:
339
303
  table = self.lancedb_vector_store.table
@@ -346,15 +310,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
346
310
 
347
311
  @property
348
312
  def query_type(self) -> Literal["fts", "hybrid", "vector"]:
349
- match self.vector_store_config.store_type:
350
- case VectorStoreType.LANCE_DB_FTS:
351
- return "fts"
352
- case VectorStoreType.LANCE_DB_HYBRID:
353
- return "hybrid"
354
- case VectorStoreType.LANCE_DB_VECTOR:
355
- return "vector"
356
- case _:
357
- raise_exhaustive_enum_error(self.vector_store_config.store_type)
313
+ return store_type_to_lancedb_query_type(self.vector_store_config.store_type)
358
314
 
359
315
  @staticmethod
360
316
  def lancedb_path_for_config(rag_config: RagConfig) -> str:
@@ -380,9 +336,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
380
336
  kiln_doc_id = row["metadata"]["kiln_doc_id"]
381
337
  if kiln_doc_id not in document_ids:
382
338
  kiln_chunk_idx = row["metadata"]["kiln_chunk_idx"]
383
- record_id = self.compute_deterministic_chunk_id(
384
- kiln_doc_id, kiln_chunk_idx
385
- )
339
+ record_id = deterministic_chunk_id(kiln_doc_id, kiln_chunk_idx)
386
340
  rows_to_delete.append(record_id)
387
341
 
388
342
  if rows_to_delete:
@@ -0,0 +1,101 @@
1
+ from typing import Any, Dict, List, Literal
2
+
3
+ from llama_index.core.schema import NodeRelationship, RelatedNodeInfo, TextNode
4
+ from llama_index.vector_stores.lancedb import LanceDBVectorStore
5
+
6
+ from kiln_ai.datamodel.vector_store import (
7
+ VectorStoreConfig,
8
+ VectorStoreType,
9
+ raise_exhaustive_enum_error,
10
+ )
11
+ from kiln_ai.utils.uuid import string_to_uuid
12
+
13
+
14
+ def store_type_to_lancedb_query_type(
15
+ store_type: VectorStoreType,
16
+ ) -> Literal["fts", "hybrid", "vector"]:
17
+ match store_type:
18
+ case VectorStoreType.LANCE_DB_FTS:
19
+ return "fts"
20
+ case VectorStoreType.LANCE_DB_HYBRID:
21
+ return "hybrid"
22
+ case VectorStoreType.LANCE_DB_VECTOR:
23
+ return "vector"
24
+ case _:
25
+ raise_exhaustive_enum_error(store_type)
26
+
27
+
28
+ def lancedb_construct_from_config(
29
+ vector_store_config: VectorStoreConfig,
30
+ uri: str,
31
+ **extra_params: Any,
32
+ ) -> LanceDBVectorStore:
33
+ """Construct a LanceDBVectorStore from a VectorStoreConfig."""
34
+ kwargs: Dict[str, Any] = {**extra_params}
35
+ if (
36
+ vector_store_config.lancedb_properties.nprobes is not None
37
+ and "nprobes" not in kwargs
38
+ ):
39
+ kwargs["nprobes"] = vector_store_config.lancedb_properties.nprobes
40
+
41
+ return LanceDBVectorStore(
42
+ mode="create",
43
+ query_type=store_type_to_lancedb_query_type(vector_store_config.store_type),
44
+ overfetch_factor=vector_store_config.lancedb_properties.overfetch_factor,
45
+ vector_column_name=vector_store_config.lancedb_properties.vector_column_name,
46
+ text_key=vector_store_config.lancedb_properties.text_key,
47
+ doc_id_key=vector_store_config.lancedb_properties.doc_id_key,
48
+ uri=uri,
49
+ **kwargs,
50
+ )
51
+
52
+
53
+ def convert_to_llama_index_node(
54
+ document_id: str,
55
+ chunk_idx: int,
56
+ node_id: str,
57
+ text: str,
58
+ vector: List[float],
59
+ ) -> TextNode:
60
+ return TextNode(
61
+ id_=node_id,
62
+ text=text,
63
+ embedding=vector,
64
+ metadata={
65
+ # metadata is populated by some internal llama_index logic
66
+ # that uses for example the source_node relationship
67
+ "kiln_doc_id": document_id,
68
+ "kiln_chunk_idx": chunk_idx,
69
+ #
70
+ # llama_index lancedb vector store automatically sets these metadata:
71
+ # "doc_id": "UUID node_id of the Source Node relationship",
72
+ # "document_id": "UUID node_id of the Source Node relationship",
73
+ # "ref_doc_id": "UUID node_id of the Source Node relationship"
74
+ #
75
+ # llama_index file loaders set these metadata, which would be useful to also support:
76
+ # "creation_date": "2025-09-03",
77
+ # "file_name": "file.pdf",
78
+ # "file_path": "/absolute/path/to/the/file.pdf",
79
+ # "file_size": 395154,
80
+ # "file_type": "application\/pdf",
81
+ # "last_modified_date": "2025-09-03",
82
+ # "page_label": "1",
83
+ },
84
+ relationships={
85
+ # when using the llama_index loaders, llama_index groups Nodes under Documents
86
+ # and relationships point to the Document (which is also a Node), which confusingly
87
+ # enough does not map to an actual file (for a PDF, a Document is a page of the PDF)
88
+ # the Document structure is not something that is persisted, so it is fine here
89
+ # if we have a relationship to a node_id that does not exist in the db
90
+ NodeRelationship.SOURCE: RelatedNodeInfo(
91
+ node_id=document_id,
92
+ node_type="1",
93
+ metadata={},
94
+ ),
95
+ },
96
+ )
97
+
98
+
99
+ def deterministic_chunk_id(document_id: str, chunk_idx: int) -> str:
100
+ # the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
101
+ return str(string_to_uuid(f"{document_id}::{chunk_idx}"))
@@ -17,6 +17,7 @@ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
17
17
  VectorStoreQuery,
18
18
  )
19
19
  from kiln_ai.adapters.vector_store.lancedb_adapter import LanceDBAdapter
20
+ from kiln_ai.adapters.vector_store.lancedb_helpers import deterministic_chunk_id
20
21
  from kiln_ai.adapters.vector_store.vector_store_registry import (
21
22
  vector_store_adapter_for_config,
22
23
  )
@@ -925,9 +926,7 @@ async def test_get_nodes_by_ids_functionality(
925
926
  await adapter.add_chunks_with_embeddings([mock_chunked_documents[0]]) # doc_001
926
927
 
927
928
  # Test getting nodes by IDs - compute expected IDs
928
- expected_ids = [
929
- adapter.compute_deterministic_chunk_id("doc_001", i) for i in range(4)
930
- ]
929
+ expected_ids = [deterministic_chunk_id("doc_001", i) for i in range(4)]
931
930
 
932
931
  # Get nodes by IDs
933
932
  retrieved_nodes = await adapter.get_nodes_by_ids(expected_ids)
@@ -943,7 +942,7 @@ async def test_get_nodes_by_ids_functionality(
943
942
  assert len(node.get_content()) > 0
944
943
 
945
944
  # Test with non-existent IDs
946
- fake_ids = [adapter.compute_deterministic_chunk_id("fake_doc", i) for i in range(2)]
945
+ fake_ids = [deterministic_chunk_id("fake_doc", i) for i in range(2)]
947
946
  retrieved_fake = await adapter.get_nodes_by_ids(fake_ids)
948
947
  assert len(retrieved_fake) == 0
949
948
 
@@ -1019,7 +1018,7 @@ async def test_uuid_scheme_retrieval_and_node_properties(
1019
1018
  # Test the UUID scheme: document_id::chunk_idx
1020
1019
  for chunk_idx in range(4):
1021
1020
  # Compute expected ID using the same scheme as the adapter
1022
- expected_id = adapter.compute_deterministic_chunk_id("doc_001", chunk_idx)
1021
+ expected_id = deterministic_chunk_id("doc_001", chunk_idx)
1023
1022
 
1024
1023
  # Retrieve the specific node by ID
1025
1024
  retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
@@ -1053,7 +1052,7 @@ async def test_uuid_scheme_retrieval_and_node_properties(
1053
1052
 
1054
1053
  # Test retrieval of doc_002 chunks
1055
1054
  for chunk_idx in range(4):
1056
- expected_id = adapter.compute_deterministic_chunk_id("doc_002", chunk_idx)
1055
+ expected_id = deterministic_chunk_id("doc_002", chunk_idx)
1057
1056
  retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
1058
1057
  assert len(retrieved_nodes) == 1
1059
1058
 
@@ -1080,25 +1079,19 @@ async def test_deterministic_chunk_id_consistency(
1080
1079
  create_rag_config_factory,
1081
1080
  ):
1082
1081
  """Test that the deterministic chunk ID generation is consistent."""
1083
- rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
1084
-
1085
- adapter = LanceDBAdapter(
1086
- rag_config,
1087
- fts_vector_store_config,
1088
- )
1089
1082
 
1090
1083
  # Test that the same document_id and chunk_idx always produce the same UUID
1091
1084
  doc_id = "test_doc_123"
1092
1085
  chunk_idx = 5
1093
1086
 
1094
- id1 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
1095
- id2 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
1087
+ id1 = deterministic_chunk_id(doc_id, chunk_idx)
1088
+ id2 = deterministic_chunk_id(doc_id, chunk_idx)
1096
1089
 
1097
1090
  assert id1 == id2
1098
1091
 
1099
1092
  # Test that different inputs produce different UUIDs
1100
- id3 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx + 1)
1101
- id4 = adapter.compute_deterministic_chunk_id(doc_id + "_different", chunk_idx)
1093
+ id3 = deterministic_chunk_id(doc_id, chunk_idx + 1)
1094
+ id4 = deterministic_chunk_id(doc_id + "_different", chunk_idx)
1102
1095
 
1103
1096
  assert id1 != id3
1104
1097
  assert id1 != id4
@@ -0,0 +1,142 @@
1
+ from unittest.mock import patch
2
+
3
+ import pytest
4
+
5
+ from kiln_ai.adapters.vector_store.lancedb_helpers import (
6
+ convert_to_llama_index_node,
7
+ deterministic_chunk_id,
8
+ lancedb_construct_from_config,
9
+ store_type_to_lancedb_query_type,
10
+ )
11
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig, VectorStoreType
12
+ from kiln_ai.utils.uuid import string_to_uuid
13
+
14
+
15
+ class _FakeLanceDBVectorStore:
16
+ def __init__(self, **kwargs):
17
+ self.kwargs = kwargs
18
+
19
+
20
+ def _base_properties(nprobes: int | None = None) -> dict[str, str | int | float | None]:
21
+ props: dict[str, str | int | float | None] = {
22
+ "similarity_top_k": 5,
23
+ "overfetch_factor": 2,
24
+ "vector_column_name": "vec",
25
+ "text_key": "text",
26
+ "doc_id_key": "doc_id",
27
+ }
28
+ if nprobes is not None:
29
+ props["nprobes"] = nprobes
30
+ return props
31
+
32
+
33
+ def _make_config(
34
+ store_type: VectorStoreType, nprobes: int | None = None
35
+ ) -> VectorStoreConfig:
36
+ return VectorStoreConfig(
37
+ name="test_store",
38
+ description=None,
39
+ store_type=store_type,
40
+ properties=_base_properties(nprobes),
41
+ )
42
+
43
+
44
+ def test_store_type_to_lancedb_query_type_mapping():
45
+ assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_FTS) == "fts"
46
+ assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_HYBRID) == "hybrid"
47
+ assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_VECTOR) == "vector"
48
+
49
+
50
+ def test_store_type_to_lancedb_query_type_unsupported_raises():
51
+ with pytest.raises(Exception):
52
+ store_type_to_lancedb_query_type("unsupported") # type: ignore[arg-type]
53
+
54
+
55
+ def test_lancedb_construct_from_config_includes_nprobes():
56
+ with patch(
57
+ "kiln_ai.adapters.vector_store.lancedb_helpers.LanceDBVectorStore",
58
+ new=_FakeLanceDBVectorStore,
59
+ ):
60
+ cfg = _make_config(VectorStoreType.LANCE_DB_VECTOR, nprobes=7)
61
+
62
+ result = lancedb_construct_from_config(
63
+ vector_store_config=cfg,
64
+ uri="memory://",
65
+ api_key="k",
66
+ region="r",
67
+ table_name="t",
68
+ )
69
+
70
+ assert isinstance(result, _FakeLanceDBVectorStore)
71
+ kwargs = result.kwargs
72
+
73
+ assert kwargs["mode"] == "create"
74
+ assert kwargs["uri"] == "memory://"
75
+ assert kwargs["query_type"] == "vector"
76
+ assert kwargs["overfetch_factor"] == 2
77
+ assert kwargs["vector_column_name"] == "vec"
78
+ assert kwargs["text_key"] == "text"
79
+ assert kwargs["doc_id_key"] == "doc_id"
80
+ assert kwargs["api_key"] == "k"
81
+ assert kwargs["region"] == "r"
82
+ assert kwargs["table_name"] == "t"
83
+ # extra optional kwarg present when provided
84
+ assert kwargs["nprobes"] == 7
85
+
86
+
87
+ def test_lancedb_construct_from_config_omits_nprobes_when_none():
88
+ with patch(
89
+ "kiln_ai.adapters.vector_store.lancedb_helpers.LanceDBVectorStore",
90
+ new=_FakeLanceDBVectorStore,
91
+ ):
92
+ cfg = _make_config(VectorStoreType.LANCE_DB_FTS, nprobes=None)
93
+
94
+ result = lancedb_construct_from_config(
95
+ vector_store_config=cfg,
96
+ uri="memory://",
97
+ api_key=None,
98
+ region=None,
99
+ table_name=None,
100
+ )
101
+
102
+ assert isinstance(result, _FakeLanceDBVectorStore)
103
+ kwargs = result.kwargs
104
+
105
+ assert kwargs["query_type"] == "fts"
106
+ assert "nprobes" not in kwargs
107
+
108
+
109
+ def test_convert_to_llama_index_node_builds_expected_structure():
110
+ node = convert_to_llama_index_node(
111
+ document_id="doc-123",
112
+ chunk_idx=0,
113
+ node_id="11111111-1111-5111-8111-111111111111",
114
+ text="hello",
115
+ vector=[0.1, 0.2],
116
+ )
117
+
118
+ assert node.id_ == "11111111-1111-5111-8111-111111111111"
119
+ assert node.text == "hello"
120
+ assert node.embedding == [0.1, 0.2]
121
+ assert node.metadata["kiln_doc_id"] == "doc-123"
122
+ assert node.metadata["kiln_chunk_idx"] == 0
123
+
124
+ # relationship exists and points to the source document id
125
+ from llama_index.core.schema import NodeRelationship, RelatedNodeInfo
126
+
127
+ assert NodeRelationship.SOURCE in node.relationships
128
+ related = node.relationships[NodeRelationship.SOURCE]
129
+ assert isinstance(related, RelatedNodeInfo)
130
+ assert related.node_id == "doc-123"
131
+ assert related.node_type == "1"
132
+ assert isinstance(related.metadata, dict)
133
+
134
+
135
+ def test_deterministic_chunk_id_uses_uuid_v5_namespace():
136
+ doc_id = "doc-abc"
137
+ idx = 3
138
+ expected = str(string_to_uuid(f"{doc_id}::{idx}"))
139
+ assert deterministic_chunk_id(doc_id, idx) == expected
140
+
141
+ # call again to ensure the same value is returned
142
+ assert deterministic_chunk_id(doc_id, idx) == expected
File without changes