kiln-ai 0.20.1__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (117)
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +192 -0
  38. kiln_ai/adapters/ml_model_list.py +382 -4
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +7 -69
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +1 -1
  41. kiln_ai/adapters/model_adapters/test_structured_output.py +3 -1
  42. kiln_ai/adapters/ollama_tools.py +69 -12
  43. kiln_ai/adapters/provider_tools.py +190 -46
  44. kiln_ai/adapters/rag/deduplication.py +49 -0
  45. kiln_ai/adapters/rag/progress.py +252 -0
  46. kiln_ai/adapters/rag/rag_runners.py +844 -0
  47. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  48. kiln_ai/adapters/rag/test_progress.py +785 -0
  49. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  50. kiln_ai/adapters/remote_config.py +80 -8
  51. kiln_ai/adapters/test_adapter_registry.py +579 -86
  52. kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
  53. kiln_ai/adapters/test_ml_model_list.py +212 -0
  54. kiln_ai/adapters/test_ollama_tools.py +340 -1
  55. kiln_ai/adapters/test_prompt_builders.py +1 -1
  56. kiln_ai/adapters/test_provider_tools.py +199 -8
  57. kiln_ai/adapters/test_remote_config.py +551 -56
  58. kiln_ai/adapters/vector_store/__init__.py +1 -0
  59. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  60. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  61. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  62. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  63. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  64. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  65. kiln_ai/datamodel/__init__.py +16 -13
  66. kiln_ai/datamodel/basemodel.py +170 -1
  67. kiln_ai/datamodel/chunk.py +158 -0
  68. kiln_ai/datamodel/datamodel_enums.py +27 -0
  69. kiln_ai/datamodel/embedding.py +64 -0
  70. kiln_ai/datamodel/extraction.py +303 -0
  71. kiln_ai/datamodel/project.py +33 -1
  72. kiln_ai/datamodel/rag.py +79 -0
  73. kiln_ai/datamodel/test_attachment.py +649 -0
  74. kiln_ai/datamodel/test_basemodel.py +1 -1
  75. kiln_ai/datamodel/test_chunk_models.py +317 -0
  76. kiln_ai/datamodel/test_dataset_split.py +1 -1
  77. kiln_ai/datamodel/test_embedding_models.py +448 -0
  78. kiln_ai/datamodel/test_eval_model.py +6 -6
  79. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  80. kiln_ai/datamodel/test_extraction_model.py +470 -0
  81. kiln_ai/datamodel/test_rag.py +641 -0
  82. kiln_ai/datamodel/test_tool_id.py +81 -0
  83. kiln_ai/datamodel/test_vector_store.py +320 -0
  84. kiln_ai/datamodel/tool_id.py +22 -0
  85. kiln_ai/datamodel/vector_store.py +141 -0
  86. kiln_ai/tools/mcp_session_manager.py +4 -1
  87. kiln_ai/tools/rag_tools.py +157 -0
  88. kiln_ai/tools/test_mcp_session_manager.py +1 -1
  89. kiln_ai/tools/test_rag_tools.py +848 -0
  90. kiln_ai/tools/test_tool_registry.py +91 -2
  91. kiln_ai/tools/tool_registry.py +21 -0
  92. kiln_ai/utils/__init__.py +3 -0
  93. kiln_ai/utils/async_job_runner.py +62 -17
  94. kiln_ai/utils/config.py +2 -2
  95. kiln_ai/utils/env.py +15 -0
  96. kiln_ai/utils/filesystem.py +14 -0
  97. kiln_ai/utils/filesystem_cache.py +60 -0
  98. kiln_ai/utils/litellm.py +94 -0
  99. kiln_ai/utils/lock.py +100 -0
  100. kiln_ai/utils/mime_type.py +38 -0
  101. kiln_ai/utils/pdf_utils.py +38 -0
  102. kiln_ai/utils/test_async_job_runner.py +151 -35
  103. kiln_ai/utils/test_env.py +142 -0
  104. kiln_ai/utils/test_filesystem_cache.py +316 -0
  105. kiln_ai/utils/test_litellm.py +206 -0
  106. kiln_ai/utils/test_lock.py +185 -0
  107. kiln_ai/utils/test_mime_type.py +66 -0
  108. kiln_ai/utils/test_pdf_utils.py +73 -0
  109. kiln_ai/utils/test_uuid.py +111 -0
  110. kiln_ai/utils/test_validation.py +524 -0
  111. kiln_ai/utils/uuid.py +9 -0
  112. kiln_ai/utils/validation.py +90 -0
  113. {kiln_ai-0.20.1.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +7 -1
  114. kiln_ai-0.21.0.dist-info/RECORD +211 -0
  115. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  116. {kiln_ai-0.20.1.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
  117. {kiln_ai-0.20.1.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,157 @@
1
+ from functools import cached_property
2
+ from typing import Any, Dict, List
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from kiln_ai.adapters.embedding.base_embedding_adapter import BaseEmbeddingAdapter
7
+ from kiln_ai.adapters.embedding.embedding_registry import embedding_adapter_from_type
8
+ from kiln_ai.adapters.vector_store.base_vector_store_adapter import (
9
+ BaseVectorStoreAdapter,
10
+ SearchResult,
11
+ VectorStoreQuery,
12
+ )
13
+ from kiln_ai.adapters.vector_store.vector_store_registry import (
14
+ vector_store_adapter_for_config,
15
+ )
16
+ from kiln_ai.datamodel.embedding import EmbeddingConfig
17
+ from kiln_ai.datamodel.project import Project
18
+ from kiln_ai.datamodel.rag import RagConfig
19
+ from kiln_ai.datamodel.tool_id import ToolId
20
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig, VectorStoreType
21
+ from kiln_ai.tools.base_tool import KilnToolInterface
22
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
23
+
24
+
25
class ChunkContext(BaseModel):
    """A piece of retrieved text together with the metadata describing it."""

    # Arbitrary key/value pairs rendered in the bracketed header.
    metadata: dict
    # The chunk body rendered below the header.
    text: str

    def serialize(self) -> str:
        """Render the chunk as a bracketed metadata header followed by its text.

        Metadata entries are written as ``key: value`` pairs separated by
        commas, in the dict's iteration order. The text starts on the next
        line and the result always ends with two newlines.
        """
        header = ", ".join(f"{key}: {value}" for key, value in self.metadata.items())
        return f"[{header}]\n{self.text}\n\n"
32
+
33
+
34
def format_search_results(search_results: List[SearchResult]) -> str:
    """Combine search results into a single context string.

    Every result is wrapped in a ``ChunkContext`` whose metadata carries the
    result's ``document_id`` and ``chunk_idx``. The serialized chunks are then
    joined with a ``=========`` separator line. An empty input produces an
    empty string.
    """
    # Build every chunk before serializing any of them, so model validation
    # of all results happens up front.
    chunks = [
        ChunkContext(
            metadata={
                "document_id": hit.document_id,
                "chunk_idx": hit.chunk_idx,
            },
            text=hit.chunk_text,
        )
        for hit in search_results
    ]
    return "\n=========\n".join(chunk.serialize() for chunk in chunks)
47
+
48
+
49
+ class RagTool(KilnToolInterface):
50
+ """
51
+ A tool that searches the vector store and returns the most relevant chunks.
52
+ """
53
+
54
+ def __init__(self, tool_id: str, rag_config: RagConfig):
55
+ self._id = tool_id
56
+ self._name = rag_config.tool_name
57
+ self._description = rag_config.tool_description
58
+ self._parameters_schema = {
59
+ "type": "object",
60
+ "properties": {
61
+ "query": {
62
+ "type": "string",
63
+ "description": "The search query",
64
+ },
65
+ },
66
+ "required": ["query"],
67
+ }
68
+ self._rag_config = rag_config
69
+ vector_store_config = VectorStoreConfig.from_id_and_parent_path(
70
+ str(self._rag_config.vector_store_config_id), self.project.path
71
+ )
72
+ if vector_store_config is None:
73
+ raise ValueError(
74
+ f"Vector store config not found: {self._rag_config.vector_store_config_id}"
75
+ )
76
+ self._vector_store_config = vector_store_config
77
+ self._vector_store_adapter: BaseVectorStoreAdapter | None = None
78
+
79
+ @cached_property
80
+ def project(self) -> Project:
81
+ project = self._rag_config.parent_project()
82
+ if project is None:
83
+ raise ValueError(f"RAG config {self._rag_config.id} has no project")
84
+ return project
85
+
86
+ @cached_property
87
+ def embedding(
88
+ self,
89
+ ) -> tuple[EmbeddingConfig, BaseEmbeddingAdapter]:
90
+ embedding_config = EmbeddingConfig.from_id_and_parent_path(
91
+ str(self._rag_config.embedding_config_id), self.project.path
92
+ )
93
+ if embedding_config is None:
94
+ raise ValueError(
95
+ f"Embedding config not found: {self._rag_config.embedding_config_id}"
96
+ )
97
+ return embedding_config, embedding_adapter_from_type(embedding_config)
98
+
99
+ async def vector_store(
100
+ self,
101
+ ) -> BaseVectorStoreAdapter:
102
+ if self._vector_store_adapter is None:
103
+ self._vector_store_adapter = await vector_store_adapter_for_config(
104
+ vector_store_config=self._vector_store_config,
105
+ rag_config=self._rag_config,
106
+ )
107
+ return self._vector_store_adapter
108
+
109
+ async def id(self) -> ToolId:
110
+ return self._id
111
+
112
+ async def name(self) -> str:
113
+ return self._name
114
+
115
+ async def description(self) -> str:
116
+ return self._description
117
+
118
+ async def toolcall_definition(self) -> Dict[str, Any]:
119
+ """Return the OpenAI-compatible tool definition for this tool."""
120
+ return {
121
+ "type": "function",
122
+ "function": {
123
+ "name": await self.name(),
124
+ "description": await self.description(),
125
+ "parameters": self._parameters_schema,
126
+ },
127
+ }
128
+
129
+ async def run(self, query: str) -> str:
130
+ _, embedding_adapter = self.embedding
131
+
132
+ vector_store_adapter = await self.vector_store()
133
+ store_query = VectorStoreQuery(
134
+ query_embedding=None,
135
+ query_string=query,
136
+ )
137
+
138
+ match self._vector_store_config.store_type:
139
+ case VectorStoreType.LANCE_DB_HYBRID | VectorStoreType.LANCE_DB_VECTOR:
140
+ is_vector_query = True
141
+ case VectorStoreType.LANCE_DB_FTS:
142
+ is_vector_query = False
143
+ case _:
144
+ raise_exhaustive_enum_error(self._vector_store_config.store_type)
145
+
146
+ if is_vector_query:
147
+ query_embedding_result = await embedding_adapter.generate_embeddings(
148
+ [query]
149
+ )
150
+ if len(query_embedding_result.embeddings) == 0:
151
+ raise ValueError("No embeddings generated")
152
+ store_query.query_embedding = query_embedding_result.embeddings[0].vector
153
+
154
+ search_results = await vector_store_adapter.search(store_query)
155
+ context = format_search_results(search_results)
156
+
157
+ return context
@@ -364,7 +364,7 @@ class TestMCPSessionManager:
364
364
 
365
365
  # Should extract the HTTP error from the nested structure
366
366
  with pytest.raises(
367
- ValueError, match="The MCP server rejected the request. Status 401"
367
+ ValueError, match=r"The MCP server rejected the request. Status 401"
368
368
  ):
369
369
  async with manager.mcp_client(tool_server):
370
370
  pass