kiln-ai 0.22.0__py3-none-any.whl → 0.22.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic; see the package registry's advisory page for more details.

@@ -31,7 +31,11 @@ from kiln_ai.adapters.model_adapters.base_adapter import (
31
31
  )
32
32
  from kiln_ai.adapters.model_adapters.litellm_config import LiteLlmConfig
33
33
  from kiln_ai.datamodel.json_schema import validate_schema_with_value_error
34
- from kiln_ai.tools.base_tool import KilnToolInterface, ToolCallContext
34
+ from kiln_ai.tools.base_tool import (
35
+ KilnToolInterface,
36
+ ToolCallContext,
37
+ ToolCallDefinition,
38
+ )
35
39
  from kiln_ai.tools.kiln_task_tool import KilnTaskToolResult
36
40
  from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
37
41
  from kiln_ai.utils.litellm import get_litellm_provider_info
@@ -560,7 +564,7 @@ class LiteLlmAdapter(BaseAdapter):
560
564
  self._cached_available_tools = await self.available_tools()
561
565
  return self._cached_available_tools
562
566
 
563
- async def litellm_tools(self) -> list[Dict]:
567
+ async def litellm_tools(self) -> list[ToolCallDefinition]:
564
568
  available_tools = await self.cached_available_tools()
565
569
 
566
570
  # LiteLLM takes the standard OpenAI-compatible tool call format
@@ -5,12 +5,7 @@ from pathlib import Path
5
5
  from typing import Any, Dict, List, Literal, Optional, Set, TypedDict
6
6
 
7
7
  from llama_index.core import StorageContext, VectorStoreIndex
8
- from llama_index.core.schema import (
9
- BaseNode,
10
- NodeRelationship,
11
- RelatedNodeInfo,
12
- TextNode,
13
- )
8
+ from llama_index.core.schema import BaseNode, TextNode
14
9
  from llama_index.core.vector_stores.types import (
15
10
  VectorStoreQuery as LlamaIndexVectorStoreQuery,
16
11
  )
@@ -24,15 +19,19 @@ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
24
19
  SearchResult,
25
20
  VectorStoreQuery,
26
21
  )
22
+ from kiln_ai.adapters.vector_store.lancedb_helpers import (
23
+ convert_to_llama_index_node,
24
+ deterministic_chunk_id,
25
+ lancedb_construct_from_config,
26
+ store_type_to_lancedb_query_type,
27
+ )
27
28
  from kiln_ai.datamodel.rag import RagConfig
28
29
  from kiln_ai.datamodel.vector_store import (
29
30
  VectorStoreConfig,
30
- VectorStoreType,
31
31
  raise_exhaustive_enum_error,
32
32
  )
33
33
  from kiln_ai.utils.config import Config
34
34
  from kiln_ai.utils.env import temporary_env
35
- from kiln_ai.utils.uuid import string_to_uuid
36
35
 
37
36
  logger = logging.getLogger(__name__)
38
37
 
@@ -48,6 +47,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
48
47
  self,
49
48
  rag_config: RagConfig,
50
49
  vector_store_config: VectorStoreConfig,
50
+ lancedb_vector_store: LanceDBVectorStore | None = None,
51
51
  ):
52
52
  super().__init__(rag_config, vector_store_config)
53
53
  self.config_properties = self.vector_store_config.lancedb_properties
@@ -56,17 +56,15 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
56
56
  if vector_store_config.lancedb_properties.nprobes is not None:
57
57
  kwargs["nprobes"] = vector_store_config.lancedb_properties.nprobes
58
58
 
59
- self.lancedb_vector_store = LanceDBVectorStore(
60
- mode="create",
61
- uri=LanceDBAdapter.lancedb_path_for_config(rag_config),
62
- query_type=self.query_type,
63
- overfetch_factor=vector_store_config.lancedb_properties.overfetch_factor,
64
- vector_column_name=vector_store_config.lancedb_properties.vector_column_name,
65
- text_key=vector_store_config.lancedb_properties.text_key,
66
- doc_id_key=vector_store_config.lancedb_properties.doc_id_key,
67
- **kwargs,
59
+ # allow overriding the vector store with a custom one, useful for user loading into an arbitrary
60
+ # deployment
61
+ self.lancedb_vector_store = (
62
+ lancedb_vector_store
63
+ or lancedb_construct_from_config(
64
+ vector_store_config,
65
+ uri=LanceDBAdapter.lancedb_path_for_config(rag_config),
66
+ )
68
67
  )
69
-
70
68
  self._index = None
71
69
 
72
70
  @property
@@ -149,7 +147,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
149
147
 
150
148
  chunk_count_for_document = len(chunks)
151
149
  deterministic_chunk_ids = [
152
- self.compute_deterministic_chunk_id(document_id, chunk_idx)
150
+ deterministic_chunk_id(document_id, chunk_idx)
153
151
  for chunk_idx in range(chunk_count_for_document)
154
152
  ]
155
153
 
@@ -176,42 +174,12 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
176
174
  zip(chunks_text, embeddings)
177
175
  ):
178
176
  node_batch.append(
179
- TextNode(
180
- id_=deterministic_chunk_ids[chunk_idx],
177
+ convert_to_llama_index_node(
178
+ document_id=document_id,
179
+ chunk_idx=chunk_idx,
180
+ node_id=deterministic_chunk_id(document_id, chunk_idx),
181
181
  text=chunk_text,
182
- embedding=embedding.vector,
183
- metadata={
184
- # metadata is populated by some internal llama_index logic
185
- # that uses for example the source_node relationship
186
- "kiln_doc_id": document_id,
187
- "kiln_chunk_idx": chunk_idx,
188
- #
189
- # llama_index lancedb vector store automatically sets these metadata:
190
- # "doc_id": "UUID node_id of the Source Node relationship",
191
- # "document_id": "UUID node_id of the Source Node relationship",
192
- # "ref_doc_id": "UUID node_id of the Source Node relationship"
193
- #
194
- # llama_index file loaders set these metadata, which would be useful to also support:
195
- # "creation_date": "2025-09-03",
196
- # "file_name": "file.pdf",
197
- # "file_path": "/absolute/path/to/the/file.pdf",
198
- # "file_size": 395154,
199
- # "file_type": "application\/pdf",
200
- # "last_modified_date": "2025-09-03",
201
- # "page_label": "1",
202
- },
203
- relationships={
204
- # when using the llama_index loaders, llama_index groups Nodes under Documents
205
- # and relationships point to the Document (which is also a Node), which confusingly
206
- # enough does not map to an actual file (for a PDF, a Document is a page of the PDF)
207
- # the Document structure is not something that is persisted, so it is fine here
208
- # if we have a relationship to a node_id that does not exist in the db
209
- NodeRelationship.SOURCE: RelatedNodeInfo(
210
- node_id=document_id,
211
- node_type="1",
212
- metadata={},
213
- ),
214
- },
182
+ vector=embedding.vector,
215
183
  )
216
184
  )
217
185
 
@@ -330,10 +298,6 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
330
298
  return []
331
299
  raise
332
300
 
333
- def compute_deterministic_chunk_id(self, document_id: str, chunk_idx: int) -> str:
334
- # the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
335
- return str(string_to_uuid(f"{document_id}::{chunk_idx}"))
336
-
337
301
  async def count_records(self) -> int:
338
302
  try:
339
303
  table = self.lancedb_vector_store.table
@@ -346,15 +310,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
346
310
 
347
311
  @property
348
312
  def query_type(self) -> Literal["fts", "hybrid", "vector"]:
349
- match self.vector_store_config.store_type:
350
- case VectorStoreType.LANCE_DB_FTS:
351
- return "fts"
352
- case VectorStoreType.LANCE_DB_HYBRID:
353
- return "hybrid"
354
- case VectorStoreType.LANCE_DB_VECTOR:
355
- return "vector"
356
- case _:
357
- raise_exhaustive_enum_error(self.vector_store_config.store_type)
313
+ return store_type_to_lancedb_query_type(self.vector_store_config.store_type)
358
314
 
359
315
  @staticmethod
360
316
  def lancedb_path_for_config(rag_config: RagConfig) -> str:
@@ -380,9 +336,7 @@ class LanceDBAdapter(BaseVectorStoreAdapter):
380
336
  kiln_doc_id = row["metadata"]["kiln_doc_id"]
381
337
  if kiln_doc_id not in document_ids:
382
338
  kiln_chunk_idx = row["metadata"]["kiln_chunk_idx"]
383
- record_id = self.compute_deterministic_chunk_id(
384
- kiln_doc_id, kiln_chunk_idx
385
- )
339
+ record_id = deterministic_chunk_id(kiln_doc_id, kiln_chunk_idx)
386
340
  rows_to_delete.append(record_id)
387
341
 
388
342
  if rows_to_delete:
@@ -0,0 +1,101 @@
1
+ from typing import Any, Dict, List, Literal
2
+
3
+ from llama_index.core.schema import NodeRelationship, RelatedNodeInfo, TextNode
4
+ from llama_index.vector_stores.lancedb import LanceDBVectorStore
5
+
6
+ from kiln_ai.datamodel.vector_store import (
7
+ VectorStoreConfig,
8
+ VectorStoreType,
9
+ raise_exhaustive_enum_error,
10
+ )
11
+ from kiln_ai.utils.uuid import string_to_uuid
12
+
13
+
14
+ def store_type_to_lancedb_query_type(
15
+ store_type: VectorStoreType,
16
+ ) -> Literal["fts", "hybrid", "vector"]:
17
+ match store_type:
18
+ case VectorStoreType.LANCE_DB_FTS:
19
+ return "fts"
20
+ case VectorStoreType.LANCE_DB_HYBRID:
21
+ return "hybrid"
22
+ case VectorStoreType.LANCE_DB_VECTOR:
23
+ return "vector"
24
+ case _:
25
+ raise_exhaustive_enum_error(store_type)
26
+
27
+
28
+ def lancedb_construct_from_config(
29
+ vector_store_config: VectorStoreConfig,
30
+ uri: str,
31
+ **extra_params: Any,
32
+ ) -> LanceDBVectorStore:
33
+ """Construct a LanceDBVectorStore from a VectorStoreConfig."""
34
+ kwargs: Dict[str, Any] = {**extra_params}
35
+ if (
36
+ vector_store_config.lancedb_properties.nprobes is not None
37
+ and "nprobes" not in kwargs
38
+ ):
39
+ kwargs["nprobes"] = vector_store_config.lancedb_properties.nprobes
40
+
41
+ return LanceDBVectorStore(
42
+ mode="create",
43
+ query_type=store_type_to_lancedb_query_type(vector_store_config.store_type),
44
+ overfetch_factor=vector_store_config.lancedb_properties.overfetch_factor,
45
+ vector_column_name=vector_store_config.lancedb_properties.vector_column_name,
46
+ text_key=vector_store_config.lancedb_properties.text_key,
47
+ doc_id_key=vector_store_config.lancedb_properties.doc_id_key,
48
+ uri=uri,
49
+ **kwargs,
50
+ )
51
+
52
+
53
+ def convert_to_llama_index_node(
54
+ document_id: str,
55
+ chunk_idx: int,
56
+ node_id: str,
57
+ text: str,
58
+ vector: List[float],
59
+ ) -> TextNode:
60
+ return TextNode(
61
+ id_=node_id,
62
+ text=text,
63
+ embedding=vector,
64
+ metadata={
65
+ # metadata is populated by some internal llama_index logic
66
+ # that uses for example the source_node relationship
67
+ "kiln_doc_id": document_id,
68
+ "kiln_chunk_idx": chunk_idx,
69
+ #
70
+ # llama_index lancedb vector store automatically sets these metadata:
71
+ # "doc_id": "UUID node_id of the Source Node relationship",
72
+ # "document_id": "UUID node_id of the Source Node relationship",
73
+ # "ref_doc_id": "UUID node_id of the Source Node relationship"
74
+ #
75
+ # llama_index file loaders set these metadata, which would be useful to also support:
76
+ # "creation_date": "2025-09-03",
77
+ # "file_name": "file.pdf",
78
+ # "file_path": "/absolute/path/to/the/file.pdf",
79
+ # "file_size": 395154,
80
+ # "file_type": "application\/pdf",
81
+ # "last_modified_date": "2025-09-03",
82
+ # "page_label": "1",
83
+ },
84
+ relationships={
85
+ # when using the llama_index loaders, llama_index groups Nodes under Documents
86
+ # and relationships point to the Document (which is also a Node), which confusingly
87
+ # enough does not map to an actual file (for a PDF, a Document is a page of the PDF)
88
+ # the Document structure is not something that is persisted, so it is fine here
89
+ # if we have a relationship to a node_id that does not exist in the db
90
+ NodeRelationship.SOURCE: RelatedNodeInfo(
91
+ node_id=document_id,
92
+ node_type="1",
93
+ metadata={},
94
+ ),
95
+ },
96
+ )
97
+
98
+
99
+ def deterministic_chunk_id(document_id: str, chunk_idx: int) -> str:
100
+ # the id_ of the Node must be a UUID string, otherwise llama_index / LanceDB fails downstream
101
+ return str(string_to_uuid(f"{document_id}::{chunk_idx}"))
@@ -17,6 +17,7 @@ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
17
17
  VectorStoreQuery,
18
18
  )
19
19
  from kiln_ai.adapters.vector_store.lancedb_adapter import LanceDBAdapter
20
+ from kiln_ai.adapters.vector_store.lancedb_helpers import deterministic_chunk_id
20
21
  from kiln_ai.adapters.vector_store.vector_store_registry import (
21
22
  vector_store_adapter_for_config,
22
23
  )
@@ -925,9 +926,7 @@ async def test_get_nodes_by_ids_functionality(
925
926
  await adapter.add_chunks_with_embeddings([mock_chunked_documents[0]]) # doc_001
926
927
 
927
928
  # Test getting nodes by IDs - compute expected IDs
928
- expected_ids = [
929
- adapter.compute_deterministic_chunk_id("doc_001", i) for i in range(4)
930
- ]
929
+ expected_ids = [deterministic_chunk_id("doc_001", i) for i in range(4)]
931
930
 
932
931
  # Get nodes by IDs
933
932
  retrieved_nodes = await adapter.get_nodes_by_ids(expected_ids)
@@ -943,7 +942,7 @@ async def test_get_nodes_by_ids_functionality(
943
942
  assert len(node.get_content()) > 0
944
943
 
945
944
  # Test with non-existent IDs
946
- fake_ids = [adapter.compute_deterministic_chunk_id("fake_doc", i) for i in range(2)]
945
+ fake_ids = [deterministic_chunk_id("fake_doc", i) for i in range(2)]
947
946
  retrieved_fake = await adapter.get_nodes_by_ids(fake_ids)
948
947
  assert len(retrieved_fake) == 0
949
948
 
@@ -1019,7 +1018,7 @@ async def test_uuid_scheme_retrieval_and_node_properties(
1019
1018
  # Test the UUID scheme: document_id::chunk_idx
1020
1019
  for chunk_idx in range(4):
1021
1020
  # Compute expected ID using the same scheme as the adapter
1022
- expected_id = adapter.compute_deterministic_chunk_id("doc_001", chunk_idx)
1021
+ expected_id = deterministic_chunk_id("doc_001", chunk_idx)
1023
1022
 
1024
1023
  # Retrieve the specific node by ID
1025
1024
  retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
@@ -1053,7 +1052,7 @@ async def test_uuid_scheme_retrieval_and_node_properties(
1053
1052
 
1054
1053
  # Test retrieval of doc_002 chunks
1055
1054
  for chunk_idx in range(4):
1056
- expected_id = adapter.compute_deterministic_chunk_id("doc_002", chunk_idx)
1055
+ expected_id = deterministic_chunk_id("doc_002", chunk_idx)
1057
1056
  retrieved_nodes = await adapter.get_nodes_by_ids([expected_id])
1058
1057
  assert len(retrieved_nodes) == 1
1059
1058
 
@@ -1080,25 +1079,19 @@ async def test_deterministic_chunk_id_consistency(
1080
1079
  create_rag_config_factory,
1081
1080
  ):
1082
1081
  """Test that the deterministic chunk ID generation is consistent."""
1083
- rag_config = create_rag_config_factory(fts_vector_store_config, embedding_config)
1084
-
1085
- adapter = LanceDBAdapter(
1086
- rag_config,
1087
- fts_vector_store_config,
1088
- )
1089
1082
 
1090
1083
  # Test that the same document_id and chunk_idx always produce the same UUID
1091
1084
  doc_id = "test_doc_123"
1092
1085
  chunk_idx = 5
1093
1086
 
1094
- id1 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
1095
- id2 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx)
1087
+ id1 = deterministic_chunk_id(doc_id, chunk_idx)
1088
+ id2 = deterministic_chunk_id(doc_id, chunk_idx)
1096
1089
 
1097
1090
  assert id1 == id2
1098
1091
 
1099
1092
  # Test that different inputs produce different UUIDs
1100
- id3 = adapter.compute_deterministic_chunk_id(doc_id, chunk_idx + 1)
1101
- id4 = adapter.compute_deterministic_chunk_id(doc_id + "_different", chunk_idx)
1093
+ id3 = deterministic_chunk_id(doc_id, chunk_idx + 1)
1094
+ id4 = deterministic_chunk_id(doc_id + "_different", chunk_idx)
1102
1095
 
1103
1096
  assert id1 != id3
1104
1097
  assert id1 != id4
@@ -0,0 +1,142 @@
1
+ from unittest.mock import patch
2
+
3
+ import pytest
4
+
5
+ from kiln_ai.adapters.vector_store.lancedb_helpers import (
6
+ convert_to_llama_index_node,
7
+ deterministic_chunk_id,
8
+ lancedb_construct_from_config,
9
+ store_type_to_lancedb_query_type,
10
+ )
11
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig, VectorStoreType
12
+ from kiln_ai.utils.uuid import string_to_uuid
13
+
14
+
15
+ class _FakeLanceDBVectorStore:
16
+ def __init__(self, **kwargs):
17
+ self.kwargs = kwargs
18
+
19
+
20
+ def _base_properties(nprobes: int | None = None) -> dict[str, str | int | float | None]:
21
+ props: dict[str, str | int | float | None] = {
22
+ "similarity_top_k": 5,
23
+ "overfetch_factor": 2,
24
+ "vector_column_name": "vec",
25
+ "text_key": "text",
26
+ "doc_id_key": "doc_id",
27
+ }
28
+ if nprobes is not None:
29
+ props["nprobes"] = nprobes
30
+ return props
31
+
32
+
33
+ def _make_config(
34
+ store_type: VectorStoreType, nprobes: int | None = None
35
+ ) -> VectorStoreConfig:
36
+ return VectorStoreConfig(
37
+ name="test_store",
38
+ description=None,
39
+ store_type=store_type,
40
+ properties=_base_properties(nprobes),
41
+ )
42
+
43
+
44
+ def test_store_type_to_lancedb_query_type_mapping():
45
+ assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_FTS) == "fts"
46
+ assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_HYBRID) == "hybrid"
47
+ assert store_type_to_lancedb_query_type(VectorStoreType.LANCE_DB_VECTOR) == "vector"
48
+
49
+
50
+ def test_store_type_to_lancedb_query_type_unsupported_raises():
51
+ with pytest.raises(Exception):
52
+ store_type_to_lancedb_query_type("unsupported") # type: ignore[arg-type]
53
+
54
+
55
+ def test_lancedb_construct_from_config_includes_nprobes():
56
+ with patch(
57
+ "kiln_ai.adapters.vector_store.lancedb_helpers.LanceDBVectorStore",
58
+ new=_FakeLanceDBVectorStore,
59
+ ):
60
+ cfg = _make_config(VectorStoreType.LANCE_DB_VECTOR, nprobes=7)
61
+
62
+ result = lancedb_construct_from_config(
63
+ vector_store_config=cfg,
64
+ uri="memory://",
65
+ api_key="k",
66
+ region="r",
67
+ table_name="t",
68
+ )
69
+
70
+ assert isinstance(result, _FakeLanceDBVectorStore)
71
+ kwargs = result.kwargs
72
+
73
+ assert kwargs["mode"] == "create"
74
+ assert kwargs["uri"] == "memory://"
75
+ assert kwargs["query_type"] == "vector"
76
+ assert kwargs["overfetch_factor"] == 2
77
+ assert kwargs["vector_column_name"] == "vec"
78
+ assert kwargs["text_key"] == "text"
79
+ assert kwargs["doc_id_key"] == "doc_id"
80
+ assert kwargs["api_key"] == "k"
81
+ assert kwargs["region"] == "r"
82
+ assert kwargs["table_name"] == "t"
83
+ # extra optional kwarg present when provided
84
+ assert kwargs["nprobes"] == 7
85
+
86
+
87
+ def test_lancedb_construct_from_config_omits_nprobes_when_none():
88
+ with patch(
89
+ "kiln_ai.adapters.vector_store.lancedb_helpers.LanceDBVectorStore",
90
+ new=_FakeLanceDBVectorStore,
91
+ ):
92
+ cfg = _make_config(VectorStoreType.LANCE_DB_FTS, nprobes=None)
93
+
94
+ result = lancedb_construct_from_config(
95
+ vector_store_config=cfg,
96
+ uri="memory://",
97
+ api_key=None,
98
+ region=None,
99
+ table_name=None,
100
+ )
101
+
102
+ assert isinstance(result, _FakeLanceDBVectorStore)
103
+ kwargs = result.kwargs
104
+
105
+ assert kwargs["query_type"] == "fts"
106
+ assert "nprobes" not in kwargs
107
+
108
+
109
+ def test_convert_to_llama_index_node_builds_expected_structure():
110
+ node = convert_to_llama_index_node(
111
+ document_id="doc-123",
112
+ chunk_idx=0,
113
+ node_id="11111111-1111-5111-8111-111111111111",
114
+ text="hello",
115
+ vector=[0.1, 0.2],
116
+ )
117
+
118
+ assert node.id_ == "11111111-1111-5111-8111-111111111111"
119
+ assert node.text == "hello"
120
+ assert node.embedding == [0.1, 0.2]
121
+ assert node.metadata["kiln_doc_id"] == "doc-123"
122
+ assert node.metadata["kiln_chunk_idx"] == 0
123
+
124
+ # relationship exists and points to the source document id
125
+ from llama_index.core.schema import NodeRelationship, RelatedNodeInfo
126
+
127
+ assert NodeRelationship.SOURCE in node.relationships
128
+ related = node.relationships[NodeRelationship.SOURCE]
129
+ assert isinstance(related, RelatedNodeInfo)
130
+ assert related.node_id == "doc-123"
131
+ assert related.node_type == "1"
132
+ assert isinstance(related.metadata, dict)
133
+
134
+
135
+ def test_deterministic_chunk_id_uses_uuid_v5_namespace():
136
+ doc_id = "doc-abc"
137
+ idx = 3
138
+ expected = str(string_to_uuid(f"{doc_id}::{idx}"))
139
+ assert deterministic_chunk_id(doc_id, idx) == expected
140
+
141
+ # call again to ensure the same value is returned
142
+ assert deterministic_chunk_id(doc_id, idx) == expected
File without changes