kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158)
  1. kiln_ai/adapters/__init__.py +8 -2
  2. kiln_ai/adapters/adapter_registry.py +43 -208
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/chunkers/__init__.py +13 -0
  6. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  7. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  8. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  9. kiln_ai/adapters/chunkers/helpers.py +23 -0
  10. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  11. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  12. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  13. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  14. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  15. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  16. kiln_ai/adapters/embedding/__init__.py +0 -0
  17. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  18. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  19. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  20. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  21. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  22. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  23. kiln_ai/adapters/eval/base_eval.py +2 -2
  24. kiln_ai/adapters/eval/eval_runner.py +9 -3
  25. kiln_ai/adapters/eval/g_eval.py +2 -2
  26. kiln_ai/adapters/eval/test_base_eval.py +2 -4
  27. kiln_ai/adapters/eval/test_g_eval.py +4 -5
  28. kiln_ai/adapters/extractors/__init__.py +18 -0
  29. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  30. kiln_ai/adapters/extractors/encoding.py +20 -0
  31. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  32. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  33. kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
  34. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  35. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  36. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  37. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  38. kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
  39. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  40. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  41. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  42. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +2 -6
  43. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  44. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  45. kiln_ai/adapters/ml_embedding_model_list.py +192 -0
  46. kiln_ai/adapters/ml_model_list.py +761 -37
  47. kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
  48. kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
  49. kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
  50. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
  51. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  52. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  53. kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
  54. kiln_ai/adapters/ollama_tools.py +69 -12
  55. kiln_ai/adapters/parsers/__init__.py +1 -1
  56. kiln_ai/adapters/provider_tools.py +205 -47
  57. kiln_ai/adapters/rag/deduplication.py +49 -0
  58. kiln_ai/adapters/rag/progress.py +252 -0
  59. kiln_ai/adapters/rag/rag_runners.py +844 -0
  60. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  61. kiln_ai/adapters/rag/test_progress.py +785 -0
  62. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  63. kiln_ai/adapters/remote_config.py +80 -8
  64. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  65. kiln_ai/adapters/run_output.py +3 -0
  66. kiln_ai/adapters/test_adapter_registry.py +657 -85
  67. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  68. kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
  69. kiln_ai/adapters/test_ml_model_list.py +251 -1
  70. kiln_ai/adapters/test_ollama_tools.py +340 -1
  71. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  72. kiln_ai/adapters/test_prompt_builders.py +1 -1
  73. kiln_ai/adapters/test_provider_tools.py +254 -8
  74. kiln_ai/adapters/test_remote_config.py +651 -58
  75. kiln_ai/adapters/vector_store/__init__.py +1 -0
  76. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  77. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  78. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  79. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  80. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  81. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  82. kiln_ai/datamodel/__init__.py +39 -34
  83. kiln_ai/datamodel/basemodel.py +170 -1
  84. kiln_ai/datamodel/chunk.py +158 -0
  85. kiln_ai/datamodel/datamodel_enums.py +28 -0
  86. kiln_ai/datamodel/embedding.py +64 -0
  87. kiln_ai/datamodel/eval.py +1 -1
  88. kiln_ai/datamodel/external_tool_server.py +298 -0
  89. kiln_ai/datamodel/extraction.py +303 -0
  90. kiln_ai/datamodel/json_schema.py +25 -10
  91. kiln_ai/datamodel/project.py +40 -1
  92. kiln_ai/datamodel/rag.py +79 -0
  93. kiln_ai/datamodel/registry.py +0 -15
  94. kiln_ai/datamodel/run_config.py +62 -0
  95. kiln_ai/datamodel/task.py +2 -77
  96. kiln_ai/datamodel/task_output.py +6 -1
  97. kiln_ai/datamodel/task_run.py +41 -0
  98. kiln_ai/datamodel/test_attachment.py +649 -0
  99. kiln_ai/datamodel/test_basemodel.py +4 -4
  100. kiln_ai/datamodel/test_chunk_models.py +317 -0
  101. kiln_ai/datamodel/test_dataset_split.py +1 -1
  102. kiln_ai/datamodel/test_embedding_models.py +448 -0
  103. kiln_ai/datamodel/test_eval_model.py +6 -6
  104. kiln_ai/datamodel/test_example_models.py +175 -0
  105. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  106. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  107. kiln_ai/datamodel/test_extraction_model.py +470 -0
  108. kiln_ai/datamodel/test_rag.py +641 -0
  109. kiln_ai/datamodel/test_registry.py +8 -3
  110. kiln_ai/datamodel/test_task.py +15 -47
  111. kiln_ai/datamodel/test_tool_id.py +320 -0
  112. kiln_ai/datamodel/test_vector_store.py +320 -0
  113. kiln_ai/datamodel/tool_id.py +105 -0
  114. kiln_ai/datamodel/vector_store.py +141 -0
  115. kiln_ai/tools/__init__.py +8 -0
  116. kiln_ai/tools/base_tool.py +82 -0
  117. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  118. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  119. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  120. kiln_ai/tools/mcp_server_tool.py +95 -0
  121. kiln_ai/tools/mcp_session_manager.py +246 -0
  122. kiln_ai/tools/rag_tools.py +157 -0
  123. kiln_ai/tools/test_base_tools.py +199 -0
  124. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  125. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  126. kiln_ai/tools/test_rag_tools.py +848 -0
  127. kiln_ai/tools/test_tool_registry.py +562 -0
  128. kiln_ai/tools/tool_registry.py +85 -0
  129. kiln_ai/utils/__init__.py +3 -0
  130. kiln_ai/utils/async_job_runner.py +62 -17
  131. kiln_ai/utils/config.py +24 -2
  132. kiln_ai/utils/env.py +15 -0
  133. kiln_ai/utils/filesystem.py +14 -0
  134. kiln_ai/utils/filesystem_cache.py +60 -0
  135. kiln_ai/utils/litellm.py +94 -0
  136. kiln_ai/utils/lock.py +100 -0
  137. kiln_ai/utils/mime_type.py +38 -0
  138. kiln_ai/utils/open_ai_types.py +94 -0
  139. kiln_ai/utils/pdf_utils.py +38 -0
  140. kiln_ai/utils/project_utils.py +17 -0
  141. kiln_ai/utils/test_async_job_runner.py +151 -35
  142. kiln_ai/utils/test_config.py +138 -1
  143. kiln_ai/utils/test_env.py +142 -0
  144. kiln_ai/utils/test_filesystem_cache.py +316 -0
  145. kiln_ai/utils/test_litellm.py +206 -0
  146. kiln_ai/utils/test_lock.py +185 -0
  147. kiln_ai/utils/test_mime_type.py +66 -0
  148. kiln_ai/utils/test_open_ai_types.py +131 -0
  149. kiln_ai/utils/test_pdf_utils.py +73 -0
  150. kiln_ai/utils/test_uuid.py +111 -0
  151. kiln_ai/utils/test_validation.py +524 -0
  152. kiln_ai/utils/uuid.py +9 -0
  153. kiln_ai/utils/validation.py +90 -0
  154. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
  155. kiln_ai-0.21.0.dist-info/RECORD +211 -0
  156. kiln_ai-0.19.0.dist-info/RECORD +0 -115
  157. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
  158. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0

kiln_ai/adapters/vector_store/__init__.py
@@ -0,0 +1 @@
+
kiln_ai/adapters/vector_store/base_vector_store_adapter.py
@@ -0,0 +1,83 @@
+ import logging
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import List, Optional, Set
+
+ from pydantic import BaseModel, Field
+
+ from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
+ from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding
+ from kiln_ai.datamodel.rag import RagConfig
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class DocumentWithChunksAndEmbeddings:
+     document_id: str
+     chunked_document: ChunkedDocument
+     chunk_embeddings: ChunkEmbeddings
+
+     @property
+     def chunks(self) -> list[Chunk]:
+         return self.chunked_document.chunks
+
+     @property
+     def embeddings(self) -> list[Embedding]:
+         return self.chunk_embeddings.embeddings
+
+
+ class SearchResult(BaseModel):
+     document_id: str = Field(description="The id of the Kiln document.")
+     chunk_idx: int = Field(description="The index of the chunk.")
+     chunk_text: str = Field(description="The text of the chunk.")
+     similarity: float | None = Field(
+         description="The score of the chunk, which depends on the similarity metric used."
+     )
+
+
+ class VectorStoreQuery(BaseModel):
+     query_string: Optional[str] = Field(
+         description="The query string to search for.",
+         default=None,
+     )
+     query_embedding: Optional[List[float]] = Field(
+         description="The embedding of the query.",
+         default=None,
+     )
+
+
+ class BaseVectorStoreAdapter(ABC):
+     def __init__(self, rag_config: RagConfig, vector_store_config: VectorStoreConfig):
+         self.vector_store_config = vector_store_config
+         self.rag_config = rag_config
+
+     @abstractmethod
+     async def add_chunks_with_embeddings(
+         self,
+         doc_batch: list[DocumentWithChunksAndEmbeddings],
+     ) -> None:
+         pass
+
+     @abstractmethod
+     async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
+         pass
+
+     @abstractmethod
+     async def count_records(self) -> int:
+         pass
+
+     @abstractmethod
+     async def destroy(self) -> None:
+         pass
+
+     @abstractmethod
+     async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
+         """
+         Delete nodes that are not in the given set of document IDs. Can be used to
+         reconcile filesystem state with the vector store after non-idempotent
+         operations - for example, if the user deletes a document, or untags a
+         document that was targeted for indexing.
+         """
+         pass
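
BaseVectorStoreAdapter above is the full contract a store must satisfy. As a rough illustration of that contract (a hedged sketch, not code from this release: the in-memory dictionary layout, the naive dot-product scoring, and the top-10 cutoff are assumptions made for illustration only):

from typing import Dict, List, Set, Tuple

from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
    BaseVectorStoreAdapter,
    DocumentWithChunksAndEmbeddings,
    SearchResult,
    VectorStoreQuery,
)
from kiln_ai.datamodel.rag import RagConfig
from kiln_ai.datamodel.vector_store import VectorStoreConfig


class InMemoryVectorStoreAdapter(BaseVectorStoreAdapter):
    """Toy adapter: keeps every chunk in a dict, no persistence or indexing."""

    def __init__(self, rag_config: RagConfig, vector_store_config: VectorStoreConfig):
        super().__init__(rag_config, vector_store_config)
        # (document_id, chunk_idx) -> (chunk_text, embedding vector)
        self._rows: Dict[Tuple[str, int], Tuple[str, List[float]]] = {}

    async def add_chunks_with_embeddings(
        self, doc_batch: list[DocumentWithChunksAndEmbeddings]
    ) -> None:
        for doc in doc_batch:
            texts = await doc.chunked_document.load_chunks_text()
            for idx, (text, emb) in enumerate(zip(texts, doc.embeddings)):
                self._rows[(doc.document_id, idx)] = (text, emb.vector)

    async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
        if query.query_embedding is None:
            raise ValueError("this toy adapter only supports vector queries")
        # Naive exhaustive dot-product scoring; a real store uses an ANN index.
        scored = [
            (sum(a * b for a, b in zip(vec, query.query_embedding)), doc_id, idx, text)
            for (doc_id, idx), (text, vec) in self._rows.items()
        ]
        scored.sort(reverse=True)
        return [
            SearchResult(document_id=d, chunk_idx=i, chunk_text=t, similarity=s)
            for s, d, i, t in scored[:10]
        ]

    async def count_records(self) -> int:
        return len(self._rows)

    async def destroy(self) -> None:
        self._rows.clear()

    async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
        # Reconciliation: drop every chunk whose document is no longer targeted.
        self._rows = {k: v for k, v in self._rows.items() if k[0] in document_ids}

The LanceDBAdapter below implements the same interface, adding persistence, batching, and idempotent re-indexing.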
kiln_ai/adapters/vector_store/lancedb_adapter.py
@@ -0,0 +1,389 @@
+ import asyncio
+ import logging
+ import shutil
+ from pathlib import Path
+ from typing import Any, Dict, List, Literal, Optional, Set, TypedDict
+
+ from llama_index.core import StorageContext, VectorStoreIndex
+ from llama_index.core.schema import (
+     BaseNode,
+     NodeRelationship,
+     RelatedNodeInfo,
+     TextNode,
+ )
+ from llama_index.core.vector_stores.types import (
+     VectorStoreQuery as LlamaIndexVectorStoreQuery,
+ )
+ from llama_index.core.vector_stores.types import VectorStoreQueryResult
+ from llama_index.vector_stores.lancedb import LanceDBVectorStore
+ from llama_index.vector_stores.lancedb.base import TableNotFoundError
+
+ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
+     BaseVectorStoreAdapter,
+     DocumentWithChunksAndEmbeddings,
+     SearchResult,
+     VectorStoreQuery,
+ )
+ from kiln_ai.datamodel.rag import RagConfig
+ from kiln_ai.datamodel.vector_store import (
+     VectorStoreConfig,
+     VectorStoreType,
+     raise_exhaustive_enum_error,
+ )
+ from kiln_ai.utils.config import Config
+ from kiln_ai.utils.env import temporary_env
+ from kiln_ai.utils.uuid import string_to_uuid
+
+ logger = logging.getLogger(__name__)
+
+
+ class LanceDBAdapterQueryKwargs(TypedDict):
+     similarity_top_k: int
+     query_str: Optional[str]
+     query_embedding: Optional[List[float]]
+
+
+ class LanceDBAdapter(BaseVectorStoreAdapter):
+     def __init__(
+         self,
+         rag_config: RagConfig,
+         vector_store_config: VectorStoreConfig,
+     ):
+         super().__init__(rag_config, vector_store_config)
+         self.config_properties = self.vector_store_config.lancedb_properties
+
+         kwargs: Dict[str, Any] = {}
+         if vector_store_config.lancedb_properties.nprobes is not None:
+             kwargs["nprobes"] = vector_store_config.lancedb_properties.nprobes
+
+         self.lancedb_vector_store = LanceDBVectorStore(
+             mode="create",
+             uri=LanceDBAdapter.lancedb_path_for_config(rag_config),
+             query_type=self.query_type,
+             overfetch_factor=vector_store_config.lancedb_properties.overfetch_factor,
+             vector_column_name=vector_store_config.lancedb_properties.vector_column_name,
+             text_key=vector_store_config.lancedb_properties.text_key,
+             doc_id_key=vector_store_config.lancedb_properties.doc_id_key,
+             **kwargs,
+         )
+
+         self._index = None
+
+     @property
+     def index(self) -> VectorStoreIndex:
+         """
+         - VectorStoreIndex is a wrapper around the underlying LanceDBVectorStore.
+           It exposes higher-level operations, and our implementation must mirror
+           the upstream llama_index logic it expects to be available (e.g. ref_doc_id).
+         - VectorStoreIndex throws on initialization if the underlying vector store
+           is empty, due to a schema mismatch; make sure there is data in the
+           underlying vector store before calling this.
+         """
+         if self._index is not None:
+             return self._index
+
+         storage_context = StorageContext.from_defaults(
+             vector_store=self.lancedb_vector_store
+         )
+
+         # embed_model=None in the constructor should initialize the embed model to a mock,
+         # like it does elsewhere in llama_index. However, that does not happen for
+         # VectorStoreIndex, because the constructor overrides None with "default", tries
+         # to load OpenAI, and expects OPENAI_API_KEY to be set.
+         #
+         # Since our own implementation does not actually use OpenAI, we set a fake API key
+         # just to avoid the error.
+         with temporary_env("OPENAI_API_KEY", "fake-api-key"):
+             self._index = VectorStoreIndex(
+                 [],
+                 storage_context=storage_context,
+                 embed_model=None,
+             )
+
+         return self._index
+
+     async def delete_nodes_by_document_id(self, document_id: str) -> None:
+         # A higher-level operation that requires ref_doc_id to be set on the nodes,
+         # which is set through the source node relationship.
+         try:
+             self.index.delete_ref_doc(document_id)
+         except TableNotFoundError:
+             # Table doesn't exist yet, so there's nothing to delete
+             logger.debug(
+                 f"Table not found while deleting nodes for document {document_id}, which is expected if the table does not exist yet"
+             )
+
+     async def get_nodes_by_ids(self, node_ids: List[str]) -> List[BaseNode]:
+         try:
+             chunk_ids_in_database = await self.lancedb_vector_store.aget_nodes(
+                 node_ids=node_ids
+             )
+             return chunk_ids_in_database
+         except TableNotFoundError:
+             logger.warning(
+                 "Table not found while getting nodes by ids, which may be expected if the table does not exist yet",
+             )
+             return []
+
+     async def add_chunks_with_embeddings(
+         self,
+         doc_batch: list[DocumentWithChunksAndEmbeddings],
+         nodes_batch_size: int = 100,
+     ) -> None:
+         if len(doc_batch) == 0:
+             return
+
+         node_batch: List[TextNode] = []
+         for doc in doc_batch:
+             document_id = doc.document_id
+             chunks = doc.chunks
+             embeddings = doc.embeddings
+
+             # The lancedb vector store implementation is sync (even though it has an async
+             # API), so we sleep to avoid blocking the event loop - that allows other async ops to run.
+             await asyncio.sleep(0)
+
+             if len(embeddings) != len(chunks):
+                 raise RuntimeError(
+                     f"Number of embeddings ({len(embeddings)}) does not match number of chunks ({len(chunks)}) for document {document_id}"
+                 )
+
+             chunk_count_for_document = len(chunks)
+             deterministic_chunk_ids = [
+                 self.compute_deterministic_chunk_id(document_id, chunk_idx)
+                 for chunk_idx in range(chunk_count_for_document)
+             ]
+
+             # check if the chunk ids are already in the database
+             chunk_ids_in_database = await self.get_nodes_by_ids(deterministic_chunk_ids)
+
+             # we already have all the chunks for this document in the database
+             if len(chunk_ids_in_database) == chunk_count_for_document:
+                 # Free up the event loop to avoid the risk of looping for a long time
+                 # without any real async ops releasing the event loop at all
+                 # (the get_nodes_by_ids implementation in llama_index is actually sync,
+                 # and it is slow).
+                 continue
+             else:
+                 # The chunks are different, for one of two reasons:
+                 # - an upstream sync conflict caused multiple chunked documents to be created and the incoming one
+                 #   is different; we need to delete all the chunks for this document, otherwise there can be lingering stale chunks
+                 #   that are not in the incoming batch if the current set is longer than the incoming one
+                 # - an incomplete indexing of this same chunked doc; upserting is enough to fully overwrite the current chunked doc
+                 await self.delete_nodes_by_document_id(document_id)
+
+             chunks_text = await doc.chunked_document.load_chunks_text()
+             for chunk_idx, (chunk_text, embedding) in enumerate(
+                 zip(chunks_text, embeddings)
+             ):
+                 node_batch.append(
+                     TextNode(
+                         id_=deterministic_chunk_ids[chunk_idx],
+                         text=chunk_text,
+                         embedding=embedding.vector,
+                         metadata={
+                             # metadata is populated by some internal llama_index logic
+                             # that uses, for example, the source_node relationship
+                             "kiln_doc_id": document_id,
+                             "kiln_chunk_idx": chunk_idx,
+                             #
+                             # the llama_index lancedb vector store automatically sets these metadata:
+                             # "doc_id": "UUID node_id of the Source Node relationship",
+                             # "document_id": "UUID node_id of the Source Node relationship",
+                             # "ref_doc_id": "UUID node_id of the Source Node relationship"
+                             #
+                             # llama_index file loaders set these metadata, which would be useful to also support:
+                             # "creation_date": "2025-09-03",
+                             # "file_name": "file.pdf",
+                             # "file_path": "/absolute/path/to/the/file.pdf",
+                             # "file_size": 395154,
+                             # "file_type": "application\/pdf",
+                             # "last_modified_date": "2025-09-03",
+                             # "page_label": "1",
+                         },
+                         relationships={
+                             # When using the llama_index loaders, llama_index groups Nodes under Documents,
+                             # and relationships point to the Document (which is also a Node), which, confusingly
+                             # enough, does not map to an actual file (for a PDF, a Document is a page of the PDF).
+                             # The Document structure is not something that is persisted, so it is fine here
+                             # if we have a relationship to a node_id that does not exist in the db.
+                             NodeRelationship.SOURCE: RelatedNodeInfo(
+                                 node_id=document_id,
+                                 node_type="1",
+                                 metadata={},
+                             ),
+                         },
+                     )
+                 )
+
+             if len(node_batch) >= nodes_batch_size:
+                 # async_add is currently not async: LanceDB has an async API, but
+                 # llama_index does not use it, so this is synchronous and blocking;
+                 # avoid calling it with too many nodes at once
+                 await self.lancedb_vector_store.async_add(node_batch)
+                 node_batch.clear()
+
+         if node_batch:
+             await self.lancedb_vector_store.async_add(node_batch)
+             node_batch.clear()
+
+     def format_query_result(
+         self, query_result: VectorStoreQueryResult
+     ) -> List[SearchResult]:
+         # Handle the case where no results are found - return an empty list
+         if (
+             query_result.ids is None
+             or query_result.nodes is None
+             or query_result.similarities is None
+         ):
+             # If any of the fields are None (which shouldn't happen normally),
+             # return empty results instead of raising an error
+             return []
+
+         # If all fields exist but are empty lists, that's a valid empty result
+         if (
+             len(query_result.ids) == 0
+             and len(query_result.nodes) == 0
+             and len(query_result.similarities) == 0
+         ):
+             return []
+
+         if not (
+             len(query_result.ids)
+             == len(query_result.nodes)
+             == len(query_result.similarities)
+         ):
+             raise ValueError("ids, nodes, and similarities must have the same length")
+
+         results = []
+         for _, node, similarity in zip(
+             query_result.ids or [],
+             query_result.nodes or [],
+             query_result.similarities or [],
+         ):
+             if node.metadata is None:
+                 raise ValueError("node.metadata must not be None")
+             document_id = node.metadata.get("kiln_doc_id")
+             if document_id is None:
+                 raise ValueError("node.metadata.kiln_doc_id must not be None")
+             chunk_idx = node.metadata.get("kiln_chunk_idx")
+             if chunk_idx is None:
+                 raise ValueError("node.metadata.kiln_chunk_idx must not be None")
+             results.append(
+                 SearchResult(
+                     document_id=document_id,
+                     chunk_idx=chunk_idx,
+                     chunk_text=node.get_content(),
+                     similarity=similarity,
+                 )
+             )
+         return results
+
+     def build_kwargs_for_query(
+         self, query: VectorStoreQuery
+     ) -> LanceDBAdapterQueryKwargs:
+         kwargs: LanceDBAdapterQueryKwargs = {
+             "similarity_top_k": self.config_properties.similarity_top_k,
+             "query_str": None,
+             "query_embedding": None,
+         }
+
+         match self.query_type:
+             case "fts":
+                 if query.query_string is None:
+                     raise ValueError("query_string must be provided for fts search")
+                 kwargs["query_str"] = query.query_string
+             case "hybrid":
+                 if query.query_embedding is None or query.query_string is None:
+                     raise ValueError(
+                         "query_string and query_embedding must be provided for hybrid search"
+                     )
+                 kwargs["query_embedding"] = query.query_embedding
+                 kwargs["query_str"] = query.query_string
+             case "vector":
+                 if not query.query_embedding:
+                     raise ValueError(
+                         "query_embedding must be provided for vector search"
+                     )
+                 kwargs["query_embedding"] = query.query_embedding
+             case _:
+                 raise_exhaustive_enum_error(self.query_type)
+         return kwargs
+
+     async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
+         try:
+             query_result = await self.lancedb_vector_store.aquery(
+                 LlamaIndexVectorStoreQuery(
+                     **self.build_kwargs_for_query(query),
+                 ),
+                 query_type=self.query_type,
+             )
+             return self.format_query_result(query_result)
+         except TableNotFoundError as e:
+             logger.info("Vector store search returned no results: %s", e)
+             return []
+         except Warning as e:
+             msg = str(e).lower()
+             if ("query results are empty" in msg) or (
+                 "empty" in msg and "result" in msg
+             ):
+                 logger.warning("Vector store search returned no results: %s", e)
+                 return []
+             raise
+
+     def compute_deterministic_chunk_id(self, document_id: str, chunk_idx: int) -> str:
+         # the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
+         return str(string_to_uuid(f"{document_id}::{chunk_idx}"))
+
+     async def count_records(self) -> int:
+         try:
+             table = self.lancedb_vector_store.table
+             if table is None:
+                 raise ValueError("Table is not initialized")
+             count = table.count_rows()
+             return count
+         except TableNotFoundError:
+             return 0
+
+     @property
+     def query_type(self) -> Literal["fts", "hybrid", "vector"]:
+         match self.vector_store_config.store_type:
+             case VectorStoreType.LANCE_DB_FTS:
+                 return "fts"
+             case VectorStoreType.LANCE_DB_HYBRID:
+                 return "hybrid"
+             case VectorStoreType.LANCE_DB_VECTOR:
+                 return "vector"
+             case _:
+                 raise_exhaustive_enum_error(self.vector_store_config.store_type)
+
+     @staticmethod
+     def lancedb_path_for_config(rag_config: RagConfig) -> str:
+         data_dir = Path(Config.settings_dir())
+         if rag_config.id is None:
+             raise ValueError("Rag config ID is required")
+         return str(data_dir / "rag_indexes" / "lancedb" / rag_config.id)
+
+     async def destroy(self) -> None:
+         lancedb_path = LanceDBAdapter.lancedb_path_for_config(self.rag_config)
+         shutil.rmtree(lancedb_path)
+
+     async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
+         tbl = self.lancedb_vector_store.table
+         if tbl is None:
+             raise ValueError("Table is not initialized")
+
+         for batch in tbl.search().to_batches(100):
+             batch = batch.to_pandas()
+
+             rows_to_delete = []
+             for _, row in batch.iterrows():
+                 kiln_doc_id = row["metadata"]["kiln_doc_id"]
+                 if kiln_doc_id not in document_ids:
+                     kiln_chunk_idx = row["metadata"]["kiln_chunk_idx"]
+                     record_id = self.compute_deterministic_chunk_id(
+                         kiln_doc_id, kiln_chunk_idx
+                     )
+                     rows_to_delete.append(record_id)
+
+             if rows_to_delete:
+                 self.lancedb_vector_store.delete_nodes(rows_to_delete)
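
compute_deterministic_chunk_id above leans on kiln_ai.utils.uuid.string_to_uuid, a nine-line utility added in this release whose body is not shown in this diff. Assuming it derives a stable, name-based UUID from its input string, a uuid5 sketch captures the idea (string_to_uuid_sketch and the namespace choice below are hypothetical):

import uuid


def string_to_uuid_sketch(value: str) -> uuid.UUID:
    # Name-based UUID: deterministic for a given namespace + input string.
    # The real string_to_uuid may differ in namespace or hashing details.
    return uuid.uuid5(uuid.NAMESPACE_URL, value)


# The same (document_id, chunk_idx) pair always yields the same node id...
assert string_to_uuid_sketch("doc-123::0") == string_to_uuid_sketch("doc-123::0")
# ...and distinct chunks get distinct ids.
assert string_to_uuid_sketch("doc-123::0") != string_to_uuid_sketch("doc-123::1")

Deterministic IDs are what make add_chunks_with_embeddings idempotent: re-indexing the same chunk overwrites the same node instead of creating a duplicate.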
kiln_ai/adapters/vector_store/test_base_vector_store.py
@@ -0,0 +1,160 @@
+ from typing import List, Set, Tuple
+ from unittest.mock import MagicMock
+
+ import pytest
+
+ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
+     BaseVectorStoreAdapter,
+     DocumentWithChunksAndEmbeddings,
+     SearchResult,
+     VectorStoreQuery,
+ )
+ from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
+ from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding
+ from kiln_ai.datamodel.rag import RagConfig
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig
+
+
+ class TestBaseVectorStoreAdapter:
+     """Test the base vector store adapter abstract class."""
+
+     def test_init_stores_config(self):
+         """Test that the adapter stores the vector store config."""
+
+         # Create a concrete implementation for testing
+         class ConcreteAdapter(BaseVectorStoreAdapter):
+             async def add_chunks_with_embeddings(
+                 self,
+                 records: List[Tuple[str, ChunkedDocument, ChunkEmbeddings]],
+             ) -> None:
+                 pass
+
+             async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
+                 return []
+
+             async def count_records(self) -> int:
+                 return 0
+
+             async def destroy(self) -> None:
+                 pass
+
+             async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
+                 pass
+
+         config = MagicMock(spec=VectorStoreConfig)
+         adapter = ConcreteAdapter(MagicMock(spec=RagConfig), config)
+         assert adapter.vector_store_config is config
+
+
+ class TestVectorStoreQuery:
+     """Test the VectorStoreQuery model."""
+
+     def test_default_values(self):
+         """Test that the query model has correct default values."""
+         query = VectorStoreQuery()
+         assert query.query_string is None
+         assert query.query_embedding is None
+
+     def test_with_query_string(self):
+         """Test creating a query with a query string."""
+         query = VectorStoreQuery(query_string="test query")
+         assert query.query_string == "test query"
+         assert query.query_embedding is None
+
+     def test_with_query_embedding(self):
+         """Test creating a query with an embedding."""
+         embedding = [0.1, 0.2, 0.3]
+         query = VectorStoreQuery(query_embedding=embedding)
+         assert query.query_string is None
+         assert query.query_embedding == embedding
+
+     def test_with_both_values(self):
+         """Test creating a query with both string and embedding."""
+         embedding = [0.1, 0.2, 0.3]
+         query = VectorStoreQuery(query_string="test query", query_embedding=embedding)
+         assert query.query_string == "test query"
+         assert query.query_embedding == embedding
+
+
+ class TestSearchResult:
+     """Test the SearchResult model."""
+
+     def test_required_fields(self):
+         """Test creating a search result with required fields."""
+         result = SearchResult(
+             document_id="doc123",
+             chunk_text="This is a test chunk",
+             similarity=0.95,
+             chunk_idx=0,
+         )
+         assert result.document_id == "doc123"
+         assert result.chunk_text == "This is a test chunk"
+         assert result.similarity == 0.95
+
+     def test_optional_similarity(self):
+         """Test that similarity can be None."""
+         result = SearchResult(
+             document_id="doc123",
+             chunk_text="This is a test chunk",
+             similarity=None,
+             chunk_idx=0,
+         )
+         assert result.document_id == "doc123"
+         assert result.chunk_text == "This is a test chunk"
+         assert result.similarity is None
+
+
+ def test_document_with_chunks_and_embeddings_properties():
+     """Test that DocumentWithChunksAndEmbeddings virtual properties work correctly."""
+     # Create mock chunked document with chunks
+     mock_chunk1 = MagicMock(spec=Chunk)
+     mock_chunk2 = MagicMock(spec=Chunk)
+     mock_chunked_document = MagicMock(spec=ChunkedDocument)
+     mock_chunked_document.chunks = [mock_chunk1, mock_chunk2]
+
+     # Create mock chunk embeddings with embeddings
+     mock_embedding1 = MagicMock(spec=Embedding)
+     mock_embedding2 = MagicMock(spec=Embedding)
+     mock_chunk_embeddings = MagicMock(spec=ChunkEmbeddings)
+     mock_chunk_embeddings.embeddings = [mock_embedding1, mock_embedding2]
+
+     # Create DocumentWithChunksAndEmbeddings instance
+     doc_with_chunks = DocumentWithChunksAndEmbeddings(
+         document_id="test-doc-123",
+         chunked_document=mock_chunked_document,
+         chunk_embeddings=mock_chunk_embeddings,
+     )
+
+     # Test that properties return the correct values
+     assert doc_with_chunks.document_id == "test-doc-123"
+     assert doc_with_chunks.chunks == [mock_chunk1, mock_chunk2]
+     assert doc_with_chunks.embeddings == [mock_embedding1, mock_embedding2]
+
+     # Test that properties are read-only (no setters)
+     with pytest.raises(AttributeError):
+         doc_with_chunks.chunks = []
+
+     with pytest.raises(AttributeError):
+         doc_with_chunks.embeddings = []
+
+
+ def test_document_with_chunks_and_embeddings_empty_lists():
+     """Test DocumentWithChunksAndEmbeddings with empty chunks and embeddings."""
+     # Create mock objects with empty lists
+     mock_chunked_document = MagicMock(spec=ChunkedDocument)
+     mock_chunked_document.chunks = []
+
+     mock_chunk_embeddings = MagicMock(spec=ChunkEmbeddings)
+     mock_chunk_embeddings.embeddings = []
+
+     # Create DocumentWithChunksAndEmbeddings instance
+     doc_with_chunks = DocumentWithChunksAndEmbeddings(
+         document_id="empty-doc",
+         chunked_document=mock_chunked_document,
+         chunk_embeddings=mock_chunk_embeddings,
+     )
+
+     # Test that properties return empty lists
+     assert doc_with_chunks.document_id == "empty-doc"
+     assert doc_with_chunks.chunks == []
+     assert doc_with_chunks.embeddings == []
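
The tests above only exercise the synchronous surface: construction and the Pydantic models. As a hedged sketch of how the async half of the contract could be exercised, assuming pytest-asyncio is available (this test is hypothetical, not part of the diff):

from typing import List, Set
from unittest.mock import MagicMock

import pytest

from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
    BaseVectorStoreAdapter,
    DocumentWithChunksAndEmbeddings,
    SearchResult,
    VectorStoreQuery,
)
from kiln_ai.datamodel.rag import RagConfig
from kiln_ai.datamodel.vector_store import VectorStoreConfig


class EmptyAdapter(BaseVectorStoreAdapter):
    """Stub that satisfies the contract with empty results."""

    async def add_chunks_with_embeddings(
        self, doc_batch: list[DocumentWithChunksAndEmbeddings]
    ) -> None:
        pass

    async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
        return []

    async def count_records(self) -> int:
        return 0

    async def destroy(self) -> None:
        pass

    async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
        pass


@pytest.mark.asyncio
async def test_empty_adapter_contract():
    adapter = EmptyAdapter(
        MagicMock(spec=RagConfig), MagicMock(spec=VectorStoreConfig)
    )
    assert await adapter.count_records() == 0
    assert await adapter.search(VectorStoreQuery(query_string="anything")) == []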