kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +6 -0
- kiln_ai/adapters/adapter_registry.py +43 -226
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/eval_runner.py +6 -2
- kiln_ai/adapters/eval/test_base_eval.py +1 -3
- kiln_ai/adapters/eval/test_g_eval.py +1 -1
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +494 -0
- kiln_ai/adapters/ml_model_list.py +876 -18
- kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
- kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/provider_tools.py +190 -46
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/test_adapter_registry.py +579 -86
- kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
- kiln_ai/adapters/test_ml_model_list.py +202 -0
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +199 -8
- kiln_ai/adapters/test_remote_config.py +551 -56
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +16 -13
- kiln_ai/datamodel/basemodel.py +201 -4
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +27 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/external_tool_server.py +206 -54
- kiln_ai/datamodel/extraction.py +317 -0
- kiln_ai/datamodel/project.py +33 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/task.py +5 -0
- kiln_ai/datamodel/task_output.py +41 -11
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +270 -14
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_datasource.py +50 -0
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_external_tool_server.py +534 -152
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +501 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_task.py +35 -1
- kiln_ai/datamodel/test_tool_id.py +187 -1
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +58 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/base_tool.py +12 -3
- kiln_ai/tools/built_in_tools/math_tools.py +12 -4
- kiln_ai/tools/kiln_task_tool.py +158 -0
- kiln_ai/tools/mcp_server_tool.py +2 -2
- kiln_ai/tools/mcp_session_manager.py +51 -22
- kiln_ai/tools/rag_tools.py +164 -0
- kiln_ai/tools/test_kiln_task_tool.py +527 -0
- kiln_ai/tools/test_mcp_server_tool.py +4 -15
- kiln_ai/tools/test_mcp_session_manager.py +187 -227
- kiln_ai/tools/test_rag_tools.py +929 -0
- kiln_ai/tools/test_tool_registry.py +290 -7
- kiln_ai/tools/tool_registry.py +69 -16
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +2 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +19 -2
- kiln_ai/utils/pdf_utils.py +59 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +88 -12
- kiln_ai/utils/test_pdf_utils.py +86 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
- kiln_ai-0.22.0.dist-info/RECORD +213 -0
- kiln_ai-0.20.1.dist-info/RECORD +0 -138
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import List, Optional, Set
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
|
|
9
|
+
from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding
|
|
10
|
+
from kiln_ai.datamodel.rag import RagConfig
|
|
11
|
+
from kiln_ai.datamodel.vector_store import VectorStoreConfig
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class DocumentWithChunksAndEmbeddings:
    """Group a document id with its chunked document and its chunk embeddings.

    ``chunks`` and ``embeddings`` are read-only shortcuts into the two wrapped
    objects; they have no setters.
    """

    # id of the document the chunks and embeddings belong to
    document_id: str
    # wrapped object whose ``chunks`` attribute is exposed by ``chunks``
    chunked_document: ChunkedDocument
    # wrapped object whose ``embeddings`` attribute is exposed by ``embeddings``
    chunk_embeddings: ChunkEmbeddings

    @property
    def embeddings(self) -> list[Embedding]:
        """Return the embeddings held by ``chunk_embeddings``."""
        return self.chunk_embeddings.embeddings

    @property
    def chunks(self) -> list[Chunk]:
        """Return the chunks held by ``chunked_document``."""
        return self.chunked_document.chunks
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SearchResult(BaseModel):
    # One hit returned by a vector store search.
    # NOTE: documented with comments instead of a class docstring on purpose,
    # so the schema generated from this model is left exactly as it was.

    document_id: str = Field(
        description="The id of the Kiln document.",
    )
    chunk_idx: int = Field(
        description="The index of the chunk.",
    )
    chunk_text: str = Field(
        description="The text of the chunk.",
    )
    # no default is given, so callers pass the score (or None) explicitly
    similarity: float | None = Field(
        description="The score of the chunk, which depends on the similarity metric used.",
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class VectorStoreQuery(BaseModel):
    # Input to a vector store search. Both fields default to None; which of
    # them is required is decided by the adapter that executes the query.
    # NOTE: documented with comments instead of a class docstring on purpose,
    # so the schema generated from this model is left exactly as it was.

    query_string: Optional[str] = Field(
        default=None,
        description="The query string to search for.",
    )
    query_embedding: Optional[List[float]] = Field(
        default=None,
        description="The embedding of the query.",
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class BaseVectorStoreAdapter(ABC):
    """Abstract interface that every vector store adapter implements."""

    def __init__(self, rag_config: RagConfig, vector_store_config: VectorStoreConfig):
        """Keep both configs on the instance for use by subclasses."""
        self.rag_config = rag_config
        self.vector_store_config = vector_store_config

    @abstractmethod
    async def add_chunks_with_embeddings(
        self,
        doc_batch: list[DocumentWithChunksAndEmbeddings],
    ) -> None:
        """Add the chunks and embeddings of each document in ``doc_batch`` to the store."""
        ...

    @abstractmethod
    async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
        """Run ``query`` against the store and return the matching results."""
        ...

    @abstractmethod
    async def count_records(self) -> int:
        """Return the number of records in the store."""
        ...

    @abstractmethod
    async def destroy(self) -> None:
        """Destroy the store."""
        ...

    @abstractmethod
    async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
        """
        Delete nodes whose document is not in the given set of document IDs.

        Can be used to reconcile the vector store with the filesystem state
        after non-idempotent operations - for example when the user deletes a
        document, or untags a document that was targeted for indexing.
        """
        ...
|
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Set, TypedDict
|
|
6
|
+
|
|
7
|
+
from llama_index.core import StorageContext, VectorStoreIndex
|
|
8
|
+
from llama_index.core.schema import (
|
|
9
|
+
BaseNode,
|
|
10
|
+
NodeRelationship,
|
|
11
|
+
RelatedNodeInfo,
|
|
12
|
+
TextNode,
|
|
13
|
+
)
|
|
14
|
+
from llama_index.core.vector_stores.types import (
|
|
15
|
+
VectorStoreQuery as LlamaIndexVectorStoreQuery,
|
|
16
|
+
)
|
|
17
|
+
from llama_index.core.vector_stores.types import VectorStoreQueryResult
|
|
18
|
+
from llama_index.vector_stores.lancedb import LanceDBVectorStore
|
|
19
|
+
from llama_index.vector_stores.lancedb.base import TableNotFoundError
|
|
20
|
+
|
|
21
|
+
from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
|
|
22
|
+
BaseVectorStoreAdapter,
|
|
23
|
+
DocumentWithChunksAndEmbeddings,
|
|
24
|
+
SearchResult,
|
|
25
|
+
VectorStoreQuery,
|
|
26
|
+
)
|
|
27
|
+
from kiln_ai.datamodel.rag import RagConfig
|
|
28
|
+
from kiln_ai.datamodel.vector_store import (
|
|
29
|
+
VectorStoreConfig,
|
|
30
|
+
VectorStoreType,
|
|
31
|
+
raise_exhaustive_enum_error,
|
|
32
|
+
)
|
|
33
|
+
from kiln_ai.utils.config import Config
|
|
34
|
+
from kiln_ai.utils.env import temporary_env
|
|
35
|
+
from kiln_ai.utils.uuid import string_to_uuid
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LanceDBAdapterQueryKwargs(TypedDict):
    """Keyword arguments assembled by the adapter for a llama_index vector store query."""

    # number of results to ask for
    similarity_top_k: int
    # query text; None when the query carries no text
    query_str: Optional[str]
    # query vector; None when the query carries no embedding
    query_embedding: Optional[List[float]]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class LanceDBAdapter(BaseVectorStoreAdapter):
|
|
47
|
+
def __init__(
    self,
    rag_config: RagConfig,
    vector_store_config: VectorStoreConfig,
):
    """Create the adapter and the LanceDB vector store behind it.

    Args:
        rag_config: RAG config; used to derive the on-disk path of the store.
        vector_store_config: Supplies the LanceDB properties and the store type.
    """
    super().__init__(rag_config, vector_store_config)
    self.config_properties = self.vector_store_config.lancedb_properties
    props = vector_store_config.lancedb_properties

    # nprobes is only forwarded to the store when it has been configured
    optional_store_kwargs: Dict[str, Any] = {}
    if props.nprobes is not None:
        optional_store_kwargs["nprobes"] = props.nprobes

    self.lancedb_vector_store = LanceDBVectorStore(
        mode="create",
        uri=LanceDBAdapter.lancedb_path_for_config(rag_config),
        query_type=self.query_type,
        overfetch_factor=props.overfetch_factor,
        vector_column_name=props.vector_column_name,
        text_key=props.text_key,
        doc_id_key=props.doc_id_key,
        **optional_store_kwargs,
    )

    # built lazily by the ``index`` property
    self._index = None
|
|
71
|
+
|
|
72
|
+
@property
def index(self) -> VectorStoreIndex:
    """
    Return the VectorStoreIndex for this store, creating it on first access.

    - VectorStoreIndex wraps the underlying LanceDBVectorStore and exposes
      higher level operations; our implementation has to mirror the upstream
      llama_index logic it expects to have available (e.g. ref_doc_id)
    - VectorStoreIndex throws on initialization if the underlying vector store
      is empty due to schema mismatch; make sure there is data in the
      underlying vector store before calling this
    """
    if self._index is None:
        storage_context = StorageContext.from_defaults(
            vector_store=self.lancedb_vector_store
        )

        # Passing embed_model=None should give us a mock embed model, as it does
        # elsewhere in llama_index. VectorStoreIndex does not behave that way:
        # its constructor replaces None with "default", tries to load OpenAI and
        # expects OPENAI_API_KEY to be set.
        #
        # We never actually use OpenAI here, so a fake API key is set for the
        # duration of the constructor call just to avoid the error.
        with temporary_env("OPENAI_API_KEY", "fake-api-key"):
            self._index = VectorStoreIndex(
                [],
                storage_context=storage_context,
                embed_model=None,
            )

    return self._index
|
|
103
|
+
|
|
104
|
+
async def delete_nodes_by_document_id(self, document_id: str) -> None:
    """Delete every node that references ``document_id``.

    This goes through the higher level index API, which relies on ref_doc_id
    being set on the nodes (it is set through the source node relationship).
    A missing table is not an error: there is simply nothing to delete.
    """
    try:
        self.index.delete_ref_doc(document_id)
    except TableNotFoundError:
        # the table has not been created yet, so no nodes can exist
        logger.debug(
            f"Table not found while deleting nodes for document {document_id}, which is expected if the table does not exist yet"
        )
|
|
114
|
+
|
|
115
|
+
async def get_nodes_by_ids(self, node_ids: List[str]) -> List[BaseNode]:
    """Fetch the nodes with the given ids from the vector store.

    Returns:
        The nodes returned by the store, or an empty list when the table does
        not exist.
    """
    try:
        found_nodes = await self.lancedb_vector_store.aget_nodes(node_ids=node_ids)
    except TableNotFoundError:
        logger.warning(
            "Table not found while getting nodes by ids, which may be expected if the table does not exist yet",
        )
        return []
    return found_nodes
|
|
126
|
+
|
|
127
|
+
async def add_chunks_with_embeddings(
    self,
    doc_batch: list[DocumentWithChunksAndEmbeddings],
    nodes_batch_size: int = 100,
) -> None:
    """Index the chunks of each document in ``doc_batch`` into LanceDB.

    Documents whose deterministic chunk ids are all already present are
    skipped. For any other document the existing nodes are deleted first and
    every chunk is written again.

    Args:
        doc_batch: Documents with their chunks and embeddings.
        nodes_batch_size: Pending node count at which a write to the store is
            triggered.

    Raises:
        RuntimeError: If a document has a different number of embeddings and
            chunks.
    """
    # NOTE(review): the indentation of this method was reconstructed; the
    # batch flush is assumed to sit inside the per-chunk loop - confirm.
    if len(doc_batch) == 0:
        return

    # nodes waiting to be written; shared across documents
    node_batch: List[TextNode] = []
    for doc in doc_batch:
        document_id = doc.document_id
        chunks = doc.chunks
        embeddings = doc.embeddings

        # the lancedb vector store implementation is sync (even though it has an async API)
        # so we sleep to avoid blocking the event loop - that allows other async ops to run
        await asyncio.sleep(0)

        if len(embeddings) != len(chunks):
            raise RuntimeError(
                f"Number of embeddings ({len(embeddings)}) does not match number of chunks ({len(chunks)}) for document {document_id}"
            )

        chunk_count_for_document = len(chunks)
        # ids depend only on (document_id, chunk_idx)
        deterministic_chunk_ids = [
            self.compute_deterministic_chunk_id(document_id, chunk_idx)
            for chunk_idx in range(chunk_count_for_document)
        ]

        # check if the chunk ids are already in the database
        chunk_ids_in_database = await self.get_nodes_by_ids(deterministic_chunk_ids)

        # we already have all the chunks for this document in the database
        if len(chunk_ids_in_database) == chunk_count_for_document:
            # nothing to write for this document; the event loop is released by
            # the asyncio.sleep(0) at the top of every iteration, which matters
            # because get_nodes_by_ids in llama_index is actually sync and slow
            continue
        else:
            # the chunks are different, which is because either:
            # - an upstream sync conflict caused multiple chunked documents to be created and the incoming one
            #   is different; we need to delete all the chunks for this document otherwise there can be lingering stale chunks
            #   that are not in the incoming batch if current is longer than incoming
            # - an incomplete indexing of this same chunked doc, upserting is enough to overwrite the current chunked doc fully
            await self.delete_nodes_by_document_id(document_id)

        chunks_text = await doc.chunked_document.load_chunks_text()
        for chunk_idx, (chunk_text, embedding) in enumerate(
            zip(chunks_text, embeddings)
        ):
            node_batch.append(
                TextNode(
                    id_=deterministic_chunk_ids[chunk_idx],
                    text=chunk_text,
                    embedding=embedding.vector,
                    metadata={
                        # metadata is populated by some internal llama_index logic
                        # that uses for example the source_node relationship
                        "kiln_doc_id": document_id,
                        "kiln_chunk_idx": chunk_idx,
                        #
                        # llama_index lancedb vector store automatically sets these metadata:
                        #     "doc_id": "UUID node_id of the Source Node relationship",
                        #     "document_id": "UUID node_id of the Source Node relationship",
                        #     "ref_doc_id": "UUID node_id of the Source Node relationship"
                        #
                        # llama_index file loaders set these metadata, which would be useful to also support:
                        #     "creation_date": "2025-09-03",
                        #     "file_name": "file.pdf",
                        #     "file_path": "/absolute/path/to/the/file.pdf",
                        #     "file_size": 395154,
                        #     "file_type": "application\/pdf",
                        #     "last_modified_date": "2025-09-03",
                        #     "page_label": "1",
                    },
                    relationships={
                        # when using the llama_index loaders, llama_index groups Nodes under Documents
                        # and relationships point to the Document (which is also a Node), which confusingly
                        # enough does not map to an actual file (for a PDF, a Document is a page of the PDF)
                        # the Document structure is not something that is persisted, so it is fine here
                        # if we have a relationship to a node_id that does not exist in the db
                        NodeRelationship.SOURCE: RelatedNodeInfo(
                            node_id=document_id,
                            # NOTE(review): "1" is presumably llama_index's object type for text nodes - confirm
                            node_type="1",
                            metadata={},
                        ),
                    },
                )
            )

            if len(node_batch) >= nodes_batch_size:
                # async_add is currently not async, LanceDB has an async API but
                # llama_index does not use it, so it is synchronous and blocking
                # avoid calling with too many nodes at once
                await self.lancedb_vector_store.async_add(node_batch)
                node_batch.clear()

    # write whatever is left over after the last document
    if node_batch:
        await self.lancedb_vector_store.async_add(node_batch)
        node_batch.clear()
|
|
228
|
+
|
|
229
|
+
def format_query_result(
    self, query_result: VectorStoreQueryResult
) -> List[SearchResult]:
    """Convert a llama_index query result into a list of ``SearchResult``.

    Returns:
        One ``SearchResult`` per returned node, in result order. An empty list
        is returned when any of ids / nodes / similarities is None or when the
        result is empty.

    Raises:
        ValueError: If ids, nodes and similarities differ in length, or if a
            node lacks metadata, ``kiln_doc_id`` or ``kiln_chunk_idx``.
    """
    ids = query_result.ids
    nodes = query_result.nodes
    similarities = query_result.similarities

    # A None field shouldn't happen normally; treat it as "no results"
    # instead of raising an error.
    if ids is None or nodes is None or similarities is None:
        return []

    # Three empty lists pass this check and fall through to an empty loop.
    if not (len(ids) == len(nodes) == len(similarities)):
        raise ValueError("ids, nodes, and similarities must have the same length")

    formatted: List[SearchResult] = []
    # ids are not needed: equal lengths were verified just above
    for node, score in zip(nodes, similarities):
        metadata = node.metadata
        if metadata is None:
            raise ValueError("node.metadata must not be None")

        kiln_doc_id = metadata.get("kiln_doc_id")
        if kiln_doc_id is None:
            raise ValueError("node.metadata.kiln_doc_id must not be None")

        kiln_chunk_idx = metadata.get("kiln_chunk_idx")
        if kiln_chunk_idx is None:
            raise ValueError("node.metadata.kiln_chunk_idx must not be None")

        formatted.append(
            SearchResult(
                document_id=kiln_doc_id,
                chunk_idx=kiln_chunk_idx,
                chunk_text=node.get_content(),
                similarity=score,
            )
        )
    return formatted
|
|
280
|
+
|
|
281
|
+
def build_kwargs_for_query(
|
|
282
|
+
self, query: VectorStoreQuery
|
|
283
|
+
) -> LanceDBAdapterQueryKwargs:
|
|
284
|
+
kwargs: LanceDBAdapterQueryKwargs = {
|
|
285
|
+
"similarity_top_k": self.config_properties.similarity_top_k,
|
|
286
|
+
"query_str": None,
|
|
287
|
+
"query_embedding": None,
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
match self.query_type:
|
|
291
|
+
case "fts":
|
|
292
|
+
if query.query_string is None:
|
|
293
|
+
raise ValueError("query_string must be provided for fts search")
|
|
294
|
+
kwargs["query_str"] = query.query_string
|
|
295
|
+
case "hybrid":
|
|
296
|
+
if query.query_embedding is None or query.query_string is None:
|
|
297
|
+
raise ValueError(
|
|
298
|
+
"query_string and query_embedding must be provided for hybrid search"
|
|
299
|
+
)
|
|
300
|
+
kwargs["query_embedding"] = query.query_embedding
|
|
301
|
+
kwargs["query_str"] = query.query_string
|
|
302
|
+
case "vector":
|
|
303
|
+
if not query.query_embedding:
|
|
304
|
+
raise ValueError(
|
|
305
|
+
"query_embedding must be provided for vector search"
|
|
306
|
+
)
|
|
307
|
+
kwargs["query_embedding"] = query.query_embedding
|
|
308
|
+
case _:
|
|
309
|
+
raise_exhaustive_enum_error(self.query_type)
|
|
310
|
+
return kwargs
|
|
311
|
+
|
|
312
|
+
async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
    """Query the LanceDB store and return the formatted results.

    A missing table, or a ``Warning`` whose message says the result is empty,
    yields an empty list. Any other ``Warning`` is re-raised.
    """
    try:
        llama_query = LlamaIndexVectorStoreQuery(
            **self.build_kwargs_for_query(query),
        )
        raw_result = await self.lancedb_vector_store.aquery(
            llama_query,
            query_type=self.query_type,
        )
        return self.format_query_result(raw_result)
    except TableNotFoundError as e:
        logger.info("Vector store search returned no results: %s", e)
        return []
    except Warning as e:
        message = str(e).lower()
        exact_phrase = "query results are empty" in message
        loose_match = "empty" in message and "result" in message
        if exact_phrase or loose_match:
            logger.warning("Vector store search returned no results: %s", e)
            return []
        raise
|
|
332
|
+
|
|
333
|
+
def compute_deterministic_chunk_id(self, document_id: str, chunk_idx: int) -> str:
    """Return the node id for chunk ``chunk_idx`` of ``document_id``.

    The id is computed from the string "<document_id>::<chunk_idx>".
    """
    # the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
    id_source = f"{document_id}::{chunk_idx}"
    return str(string_to_uuid(id_source))
|
|
336
|
+
|
|
337
|
+
async def count_records(self) -> int:
    """Return the number of rows in the LanceDB table.

    Returns 0 when the table does not exist.

    Raises:
        ValueError: If the store reports its table as None.
    """
    try:
        store_table = self.lancedb_vector_store.table
        if store_table is None:
            raise ValueError("Table is not initialized")
        return store_table.count_rows()
    except TableNotFoundError:
        return 0
|
|
346
|
+
|
|
347
|
+
@property
def query_type(self) -> Literal["fts", "hybrid", "vector"]:
    """Map the configured store type to the matching LanceDB query type."""
    store_type = self.vector_store_config.store_type
    if store_type == VectorStoreType.LANCE_DB_FTS:
        return "fts"
    if store_type == VectorStoreType.LANCE_DB_HYBRID:
        return "hybrid"
    if store_type == VectorStoreType.LANCE_DB_VECTOR:
        return "vector"
    # no known store type matched
    raise_exhaustive_enum_error(store_type)
|
|
358
|
+
|
|
359
|
+
@staticmethod
def lancedb_path_for_config(rag_config: RagConfig) -> str:
    """Return the LanceDB directory for ``rag_config``.

    The path is <settings dir>/rag_indexes/lancedb/<rag config id>.

    Raises:
        ValueError: If ``rag_config.id`` is None.
    """
    settings_root = Path(Config.settings_dir())
    rag_config_id = rag_config.id
    if rag_config_id is None:
        raise ValueError("Vector store config ID is required")
    return str(settings_root.joinpath("rag_indexes", "lancedb", rag_config_id))
|
|
365
|
+
|
|
366
|
+
async def destroy(self) -> None:
    """Delete the LanceDB directory of this adapter's RAG config from disk."""
    index_dir = LanceDBAdapter.lancedb_path_for_config(self.rag_config)
    shutil.rmtree(index_dir)
|
|
369
|
+
|
|
370
|
+
async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
    """Delete every node whose ``kiln_doc_id`` is not in ``document_ids``.

    The table is read in batches of 100 rows, and the stale nodes found in a
    batch are deleted before the next batch is processed.

    Raises:
        ValueError: If the store reports its table as None.
    """
    table = self.lancedb_vector_store.table
    if table is None:
        raise ValueError("Table is not initialized")

    for record_batch in table.search().to_batches(100):
        frame = record_batch.to_pandas()

        # node ids are recomputed from the metadata stored with each row
        stale_node_ids = [
            self.compute_deterministic_chunk_id(
                metadata["kiln_doc_id"], metadata["kiln_chunk_idx"]
            )
            for metadata in (row["metadata"] for _, row in frame.iterrows())
            if metadata["kiln_doc_id"] not in document_ids
        ]

        if stale_node_ids:
            self.lancedb_vector_store.delete_nodes(stale_node_ids)
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
from typing import List, Set, Tuple
|
|
2
|
+
from unittest.mock import MagicMock
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
|
|
7
|
+
BaseVectorStoreAdapter,
|
|
8
|
+
DocumentWithChunksAndEmbeddings,
|
|
9
|
+
SearchResult,
|
|
10
|
+
VectorStoreQuery,
|
|
11
|
+
)
|
|
12
|
+
from kiln_ai.datamodel.chunk import Chunk, ChunkedDocument
|
|
13
|
+
from kiln_ai.datamodel.embedding import ChunkEmbeddings, Embedding
|
|
14
|
+
from kiln_ai.datamodel.rag import RagConfig
|
|
15
|
+
from kiln_ai.datamodel.vector_store import VectorStoreConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TestBaseVectorStoreAdapter:
    """Test the base vector store adapter abstract class."""

    def test_init_stores_config(self):
        """Test that the adapter stores both configs passed to the constructor."""

        # The base class is abstract, so a minimal concrete subclass with
        # no-op implementations is needed to instantiate it.
        class ConcreteAdapter(BaseVectorStoreAdapter):
            async def add_chunks_with_embeddings(
                self,
                records: List[Tuple[str, ChunkedDocument, ChunkEmbeddings]],
            ) -> None:
                pass

            async def search(self, query: VectorStoreQuery) -> List[SearchResult]:
                return []

            async def count_records(self) -> int:
                return 0

            async def destroy(self) -> None:
                pass

            async def delete_nodes_not_in_set(self, document_ids: Set[str]) -> None:
                pass

        rag_config = MagicMock(spec=RagConfig)
        config = MagicMock(spec=VectorStoreConfig)
        adapter = ConcreteAdapter(rag_config, config)

        # the constructor stores both arguments, so check both
        assert adapter.vector_store_config is config
        assert adapter.rag_config is rag_config
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class TestVectorStoreQuery:
    """Test the VectorStoreQuery model."""

    def test_default_values(self):
        """A query built without arguments has both fields set to None."""
        empty_query = VectorStoreQuery()
        assert empty_query.query_string is None
        assert empty_query.query_embedding is None

    def test_with_query_string(self):
        """A query built from text only keeps the text and no embedding."""
        text_query = VectorStoreQuery(query_string="test query")
        assert text_query.query_string == "test query"
        assert text_query.query_embedding is None

    def test_with_query_embedding(self):
        """A query built from an embedding only keeps the embedding and no text."""
        vector = [0.1, 0.2, 0.3]
        vector_query = VectorStoreQuery(query_embedding=vector)
        assert vector_query.query_string is None
        assert vector_query.query_embedding == vector

    def test_with_both_values(self):
        """A query built from text and an embedding keeps both."""
        vector = [0.1, 0.2, 0.3]
        full_query = VectorStoreQuery(
            query_string="test query",
            query_embedding=vector,
        )
        assert full_query.query_string == "test query"
        assert full_query.query_embedding == vector
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class TestSearchResult:
    """Test the SearchResult model."""

    def test_required_fields(self):
        """Test creating a search result with required fields."""
        result = SearchResult(
            document_id="doc123",
            chunk_text="This is a test chunk",
            similarity=0.95,
            chunk_idx=0,
        )
        assert result.document_id == "doc123"
        assert result.chunk_text == "This is a test chunk"
        assert result.similarity == 0.95
        # chunk_idx is passed in above, so verify it round-trips as well
        assert result.chunk_idx == 0

    def test_optional_similarity(self):
        """Test that similarity can be None."""
        result = SearchResult(
            document_id="doc123",
            chunk_text="This is a test chunk",
            similarity=None,
            chunk_idx=0,
        )
        assert result.document_id == "doc123"
        assert result.chunk_text == "This is a test chunk"
        assert result.similarity is None
        # chunk_idx is passed in above, so verify it round-trips as well
        assert result.chunk_idx == 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_document_with_chunks_and_embeddings_properties():
    """Check the chunks / embeddings shortcuts and that they cannot be assigned."""
    # chunked document mock exposing two chunks
    expected_chunks = [MagicMock(spec=Chunk), MagicMock(spec=Chunk)]
    chunked_document = MagicMock(spec=ChunkedDocument)
    chunked_document.chunks = expected_chunks

    # chunk embeddings mock exposing two embeddings
    expected_embeddings = [MagicMock(spec=Embedding), MagicMock(spec=Embedding)]
    chunk_embeddings = MagicMock(spec=ChunkEmbeddings)
    chunk_embeddings.embeddings = expected_embeddings

    bundle = DocumentWithChunksAndEmbeddings(
        document_id="test-doc-123",
        chunked_document=chunked_document,
        chunk_embeddings=chunk_embeddings,
    )

    # the shortcuts expose what the wrapped objects hold
    assert bundle.document_id == "test-doc-123"
    assert bundle.chunks == expected_chunks
    assert bundle.embeddings == expected_embeddings

    # both shortcuts are properties without setters
    for read_only_name in ("chunks", "embeddings"):
        with pytest.raises(AttributeError):
            setattr(bundle, read_only_name, [])
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_document_with_chunks_and_embeddings_empty_lists():
    """Check that empty chunk and embedding lists are passed through unchanged."""
    # wrapped objects that hold nothing
    chunked_document = MagicMock(spec=ChunkedDocument)
    chunked_document.chunks = []
    chunk_embeddings = MagicMock(spec=ChunkEmbeddings)
    chunk_embeddings.embeddings = []

    bundle = DocumentWithChunksAndEmbeddings(
        document_id="empty-doc",
        chunked_document=chunked_document,
        chunk_embeddings=chunk_embeddings,
    )

    # the shortcuts report empty lists
    assert bundle.document_id == "empty-doc"
    assert bundle.chunks == []
    assert bundle.embeddings == []
|