kiln-ai 0.20.1__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +6 -0
- kiln_ai/adapters/adapter_registry.py +43 -226
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/eval_runner.py +6 -2
- kiln_ai/adapters/eval/test_base_eval.py +1 -3
- kiln_ai/adapters/eval/test_g_eval.py +1 -1
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +192 -0
- kiln_ai/adapters/ml_model_list.py +382 -4
- kiln_ai/adapters/model_adapters/litellm_adapter.py +7 -69
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +1 -1
- kiln_ai/adapters/model_adapters/test_structured_output.py +3 -1
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/provider_tools.py +190 -46
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/test_adapter_registry.py +579 -86
- kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
- kiln_ai/adapters/test_ml_model_list.py +212 -0
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +199 -8
- kiln_ai/adapters/test_remote_config.py +551 -56
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +16 -13
- kiln_ai/datamodel/basemodel.py +170 -1
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +27 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/extraction.py +303 -0
- kiln_ai/datamodel/project.py +33 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +1 -1
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +470 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_tool_id.py +81 -0
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +22 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/mcp_session_manager.py +4 -1
- kiln_ai/tools/rag_tools.py +157 -0
- kiln_ai/tools/test_mcp_session_manager.py +1 -1
- kiln_ai/tools/test_rag_tools.py +848 -0
- kiln_ai/tools/test_tool_registry.py +91 -2
- kiln_ai/tools/tool_registry.py +21 -0
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +2 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/pdf_utils.py +38 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_pdf_utils.py +73 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +7 -1
- kiln_ai-0.21.0.dist-info/RECORD +211 -0
- kiln_ai-0.20.1.dist-info/RECORD +0 -138
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from functools import cached_property
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from kiln_ai.adapters.embedding.base_embedding_adapter import BaseEmbeddingAdapter
|
|
7
|
+
from kiln_ai.adapters.embedding.embedding_registry import embedding_adapter_from_type
|
|
8
|
+
from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
|
|
9
|
+
BaseVectorStoreAdapter,
|
|
10
|
+
SearchResult,
|
|
11
|
+
VectorStoreQuery,
|
|
12
|
+
)
|
|
13
|
+
from kiln_ai.adapters.vector_store.vector_store_registry import (
|
|
14
|
+
vector_store_adapter_for_config,
|
|
15
|
+
)
|
|
16
|
+
from kiln_ai.datamodel.embedding import EmbeddingConfig
|
|
17
|
+
from kiln_ai.datamodel.project import Project
|
|
18
|
+
from kiln_ai.datamodel.rag import RagConfig
|
|
19
|
+
from kiln_ai.datamodel.tool_id import ToolId
|
|
20
|
+
from kiln_ai.datamodel.vector_store import VectorStoreConfig, VectorStoreType
|
|
21
|
+
from kiln_ai.tools.base_tool import KilnToolInterface
|
|
22
|
+
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ChunkContext(BaseModel):
    """A piece of chunk text paired with the metadata that describes it."""

    # Key/value pairs rendered inside the bracketed header by serialize().
    metadata: dict
    # Chunk body, emitted on the line after the header.
    text: str

    def serialize(self) -> str:
        """Render the chunk as a bracketed metadata header followed by its text.

        Returns:
            ``[k1: v1, k2: v2]`` on the first line, then the text, then two
            trailing newlines.
        """
        header_parts = []
        for key, value in self.metadata.items():
            header_parts.append(f"{key}: {value}")
        header = ", ".join(header_parts)
        return f"[{header}]\n{self.text}\n\n"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def format_search_results(search_results: List[SearchResult]) -> str:
    """Turn vector-store search results into one context string.

    Every result is wrapped in a ChunkContext whose metadata holds the
    result's ``document_id`` and ``chunk_idx``; the serialized chunks are then
    joined with a ``=========`` separator line.

    Args:
        search_results: Results to format, in the order they should appear.

    Returns:
        The joined text, or an empty string when there are no results.
    """
    serialized_chunks = [
        ChunkContext(
            metadata={
                "document_id": hit.document_id,
                "chunk_idx": hit.chunk_idx,
            },
            text=hit.chunk_text,
        ).serialize()
        for hit in search_results
    ]
    return "\n=========\n".join(serialized_chunks)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class RagTool(KilnToolInterface):
|
|
50
|
+
"""
|
|
51
|
+
A tool that searches the vector store and returns the most relevant chunks.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, tool_id: str, rag_config: RagConfig):
|
|
55
|
+
self._id = tool_id
|
|
56
|
+
self._name = rag_config.tool_name
|
|
57
|
+
self._description = rag_config.tool_description
|
|
58
|
+
self._parameters_schema = {
|
|
59
|
+
"type": "object",
|
|
60
|
+
"properties": {
|
|
61
|
+
"query": {
|
|
62
|
+
"type": "string",
|
|
63
|
+
"description": "The search query",
|
|
64
|
+
},
|
|
65
|
+
},
|
|
66
|
+
"required": ["query"],
|
|
67
|
+
}
|
|
68
|
+
self._rag_config = rag_config
|
|
69
|
+
vector_store_config = VectorStoreConfig.from_id_and_parent_path(
|
|
70
|
+
str(self._rag_config.vector_store_config_id), self.project.path
|
|
71
|
+
)
|
|
72
|
+
if vector_store_config is None:
|
|
73
|
+
raise ValueError(
|
|
74
|
+
f"Vector store config not found: {self._rag_config.vector_store_config_id}"
|
|
75
|
+
)
|
|
76
|
+
self._vector_store_config = vector_store_config
|
|
77
|
+
self._vector_store_adapter: BaseVectorStoreAdapter | None = None
|
|
78
|
+
|
|
79
|
+
@cached_property
|
|
80
|
+
def project(self) -> Project:
|
|
81
|
+
project = self._rag_config.parent_project()
|
|
82
|
+
if project is None:
|
|
83
|
+
raise ValueError(f"RAG config {self._rag_config.id} has no project")
|
|
84
|
+
return project
|
|
85
|
+
|
|
86
|
+
@cached_property
|
|
87
|
+
def embedding(
|
|
88
|
+
self,
|
|
89
|
+
) -> tuple[EmbeddingConfig, BaseEmbeddingAdapter]:
|
|
90
|
+
embedding_config = EmbeddingConfig.from_id_and_parent_path(
|
|
91
|
+
str(self._rag_config.embedding_config_id), self.project.path
|
|
92
|
+
)
|
|
93
|
+
if embedding_config is None:
|
|
94
|
+
raise ValueError(
|
|
95
|
+
f"Embedding config not found: {self._rag_config.embedding_config_id}"
|
|
96
|
+
)
|
|
97
|
+
return embedding_config, embedding_adapter_from_type(embedding_config)
|
|
98
|
+
|
|
99
|
+
async def vector_store(
|
|
100
|
+
self,
|
|
101
|
+
) -> BaseVectorStoreAdapter:
|
|
102
|
+
if self._vector_store_adapter is None:
|
|
103
|
+
self._vector_store_adapter = await vector_store_adapter_for_config(
|
|
104
|
+
vector_store_config=self._vector_store_config,
|
|
105
|
+
rag_config=self._rag_config,
|
|
106
|
+
)
|
|
107
|
+
return self._vector_store_adapter
|
|
108
|
+
|
|
109
|
+
async def id(self) -> ToolId:
|
|
110
|
+
return self._id
|
|
111
|
+
|
|
112
|
+
async def name(self) -> str:
|
|
113
|
+
return self._name
|
|
114
|
+
|
|
115
|
+
async def description(self) -> str:
|
|
116
|
+
return self._description
|
|
117
|
+
|
|
118
|
+
async def toolcall_definition(self) -> Dict[str, Any]:
|
|
119
|
+
"""Return the OpenAI-compatible tool definition for this tool."""
|
|
120
|
+
return {
|
|
121
|
+
"type": "function",
|
|
122
|
+
"function": {
|
|
123
|
+
"name": await self.name(),
|
|
124
|
+
"description": await self.description(),
|
|
125
|
+
"parameters": self._parameters_schema,
|
|
126
|
+
},
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
async def run(self, query: str) -> str:
|
|
130
|
+
_, embedding_adapter = self.embedding
|
|
131
|
+
|
|
132
|
+
vector_store_adapter = await self.vector_store()
|
|
133
|
+
store_query = VectorStoreQuery(
|
|
134
|
+
query_embedding=None,
|
|
135
|
+
query_string=query,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
match self._vector_store_config.store_type:
|
|
139
|
+
case VectorStoreType.LANCE_DB_HYBRID | VectorStoreType.LANCE_DB_VECTOR:
|
|
140
|
+
is_vector_query = True
|
|
141
|
+
case VectorStoreType.LANCE_DB_FTS:
|
|
142
|
+
is_vector_query = False
|
|
143
|
+
case _:
|
|
144
|
+
raise_exhaustive_enum_error(self._vector_store_config.store_type)
|
|
145
|
+
|
|
146
|
+
if is_vector_query:
|
|
147
|
+
query_embedding_result = await embedding_adapter.generate_embeddings(
|
|
148
|
+
[query]
|
|
149
|
+
)
|
|
150
|
+
if len(query_embedding_result.embeddings) == 0:
|
|
151
|
+
raise ValueError("No embeddings generated")
|
|
152
|
+
store_query.query_embedding = query_embedding_result.embeddings[0].vector
|
|
153
|
+
|
|
154
|
+
search_results = await vector_store_adapter.search(store_query)
|
|
155
|
+
context = format_search_results(search_results)
|
|
156
|
+
|
|
157
|
+
return context
|
|
@@ -364,7 +364,7 @@ class TestMCPSessionManager:
|
|
|
364
364
|
|
|
365
365
|
# Should extract the HTTP error from the nested structure
|
|
366
366
|
with pytest.raises(
|
|
367
|
-
ValueError, match="The MCP server rejected the request. Status 401"
|
|
367
|
+
ValueError, match=r"The MCP server rejected the request. Status 401"
|
|
368
368
|
):
|
|
369
369
|
async with manager.mcp_client(tool_server):
|
|
370
370
|
pass
|