kiln-ai 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (158) hide show
  1. kiln_ai/adapters/__init__.py +8 -2
  2. kiln_ai/adapters/adapter_registry.py +43 -208
  3. kiln_ai/adapters/chat/chat_formatter.py +8 -12
  4. kiln_ai/adapters/chat/test_chat_formatter.py +6 -2
  5. kiln_ai/adapters/chunkers/__init__.py +13 -0
  6. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  7. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  8. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  9. kiln_ai/adapters/chunkers/helpers.py +23 -0
  10. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  11. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  12. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  13. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  14. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  15. kiln_ai/adapters/docker_model_runner_tools.py +119 -0
  16. kiln_ai/adapters/embedding/__init__.py +0 -0
  17. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  18. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  19. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  20. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  21. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  22. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  23. kiln_ai/adapters/eval/base_eval.py +2 -2
  24. kiln_ai/adapters/eval/eval_runner.py +9 -3
  25. kiln_ai/adapters/eval/g_eval.py +2 -2
  26. kiln_ai/adapters/eval/test_base_eval.py +2 -4
  27. kiln_ai/adapters/eval/test_g_eval.py +4 -5
  28. kiln_ai/adapters/extractors/__init__.py +18 -0
  29. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  30. kiln_ai/adapters/extractors/encoding.py +20 -0
  31. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  32. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  33. kiln_ai/adapters/extractors/litellm_extractor.py +386 -0
  34. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  35. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  36. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  37. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  38. kiln_ai/adapters/extractors/test_litellm_extractor.py +1192 -0
  39. kiln_ai/adapters/fine_tune/__init__.py +1 -1
  40. kiln_ai/adapters/fine_tune/openai_finetune.py +14 -4
  41. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  42. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  43. kiln_ai/adapters/fine_tune/test_openai_finetune.py +108 -111
  44. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  45. kiln_ai/adapters/ml_embedding_model_list.py +192 -0
  46. kiln_ai/adapters/ml_model_list.py +761 -37
  47. kiln_ai/adapters/model_adapters/base_adapter.py +51 -21
  48. kiln_ai/adapters/model_adapters/litellm_adapter.py +380 -138
  49. kiln_ai/adapters/model_adapters/test_base_adapter.py +193 -17
  50. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -2
  51. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +1103 -0
  52. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +5 -5
  53. kiln_ai/adapters/model_adapters/test_structured_output.py +113 -5
  54. kiln_ai/adapters/ollama_tools.py +69 -12
  55. kiln_ai/adapters/parsers/__init__.py +1 -1
  56. kiln_ai/adapters/provider_tools.py +205 -47
  57. kiln_ai/adapters/rag/deduplication.py +49 -0
  58. kiln_ai/adapters/rag/progress.py +252 -0
  59. kiln_ai/adapters/rag/rag_runners.py +844 -0
  60. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  61. kiln_ai/adapters/rag/test_progress.py +785 -0
  62. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  63. kiln_ai/adapters/remote_config.py +80 -8
  64. kiln_ai/adapters/repair/test_repair_task.py +12 -9
  65. kiln_ai/adapters/run_output.py +3 -0
  66. kiln_ai/adapters/test_adapter_registry.py +657 -85
  67. kiln_ai/adapters/test_docker_model_runner_tools.py +305 -0
  68. kiln_ai/adapters/test_ml_embedding_model_list.py +429 -0
  69. kiln_ai/adapters/test_ml_model_list.py +251 -1
  70. kiln_ai/adapters/test_ollama_tools.py +340 -1
  71. kiln_ai/adapters/test_prompt_adaptors.py +13 -6
  72. kiln_ai/adapters/test_prompt_builders.py +1 -1
  73. kiln_ai/adapters/test_provider_tools.py +254 -8
  74. kiln_ai/adapters/test_remote_config.py +651 -58
  75. kiln_ai/adapters/vector_store/__init__.py +1 -0
  76. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  77. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  78. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  79. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  80. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  81. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  82. kiln_ai/datamodel/__init__.py +39 -34
  83. kiln_ai/datamodel/basemodel.py +170 -1
  84. kiln_ai/datamodel/chunk.py +158 -0
  85. kiln_ai/datamodel/datamodel_enums.py +28 -0
  86. kiln_ai/datamodel/embedding.py +64 -0
  87. kiln_ai/datamodel/eval.py +1 -1
  88. kiln_ai/datamodel/external_tool_server.py +298 -0
  89. kiln_ai/datamodel/extraction.py +303 -0
  90. kiln_ai/datamodel/json_schema.py +25 -10
  91. kiln_ai/datamodel/project.py +40 -1
  92. kiln_ai/datamodel/rag.py +79 -0
  93. kiln_ai/datamodel/registry.py +0 -15
  94. kiln_ai/datamodel/run_config.py +62 -0
  95. kiln_ai/datamodel/task.py +2 -77
  96. kiln_ai/datamodel/task_output.py +6 -1
  97. kiln_ai/datamodel/task_run.py +41 -0
  98. kiln_ai/datamodel/test_attachment.py +649 -0
  99. kiln_ai/datamodel/test_basemodel.py +4 -4
  100. kiln_ai/datamodel/test_chunk_models.py +317 -0
  101. kiln_ai/datamodel/test_dataset_split.py +1 -1
  102. kiln_ai/datamodel/test_embedding_models.py +448 -0
  103. kiln_ai/datamodel/test_eval_model.py +6 -6
  104. kiln_ai/datamodel/test_example_models.py +175 -0
  105. kiln_ai/datamodel/test_external_tool_server.py +691 -0
  106. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  107. kiln_ai/datamodel/test_extraction_model.py +470 -0
  108. kiln_ai/datamodel/test_rag.py +641 -0
  109. kiln_ai/datamodel/test_registry.py +8 -3
  110. kiln_ai/datamodel/test_task.py +15 -47
  111. kiln_ai/datamodel/test_tool_id.py +320 -0
  112. kiln_ai/datamodel/test_vector_store.py +320 -0
  113. kiln_ai/datamodel/tool_id.py +105 -0
  114. kiln_ai/datamodel/vector_store.py +141 -0
  115. kiln_ai/tools/__init__.py +8 -0
  116. kiln_ai/tools/base_tool.py +82 -0
  117. kiln_ai/tools/built_in_tools/__init__.py +13 -0
  118. kiln_ai/tools/built_in_tools/math_tools.py +124 -0
  119. kiln_ai/tools/built_in_tools/test_math_tools.py +204 -0
  120. kiln_ai/tools/mcp_server_tool.py +95 -0
  121. kiln_ai/tools/mcp_session_manager.py +246 -0
  122. kiln_ai/tools/rag_tools.py +157 -0
  123. kiln_ai/tools/test_base_tools.py +199 -0
  124. kiln_ai/tools/test_mcp_server_tool.py +457 -0
  125. kiln_ai/tools/test_mcp_session_manager.py +1585 -0
  126. kiln_ai/tools/test_rag_tools.py +848 -0
  127. kiln_ai/tools/test_tool_registry.py +562 -0
  128. kiln_ai/tools/tool_registry.py +85 -0
  129. kiln_ai/utils/__init__.py +3 -0
  130. kiln_ai/utils/async_job_runner.py +62 -17
  131. kiln_ai/utils/config.py +24 -2
  132. kiln_ai/utils/env.py +15 -0
  133. kiln_ai/utils/filesystem.py +14 -0
  134. kiln_ai/utils/filesystem_cache.py +60 -0
  135. kiln_ai/utils/litellm.py +94 -0
  136. kiln_ai/utils/lock.py +100 -0
  137. kiln_ai/utils/mime_type.py +38 -0
  138. kiln_ai/utils/open_ai_types.py +94 -0
  139. kiln_ai/utils/pdf_utils.py +38 -0
  140. kiln_ai/utils/project_utils.py +17 -0
  141. kiln_ai/utils/test_async_job_runner.py +151 -35
  142. kiln_ai/utils/test_config.py +138 -1
  143. kiln_ai/utils/test_env.py +142 -0
  144. kiln_ai/utils/test_filesystem_cache.py +316 -0
  145. kiln_ai/utils/test_litellm.py +206 -0
  146. kiln_ai/utils/test_lock.py +185 -0
  147. kiln_ai/utils/test_mime_type.py +66 -0
  148. kiln_ai/utils/test_open_ai_types.py +131 -0
  149. kiln_ai/utils/test_pdf_utils.py +73 -0
  150. kiln_ai/utils/test_uuid.py +111 -0
  151. kiln_ai/utils/test_validation.py +524 -0
  152. kiln_ai/utils/uuid.py +9 -0
  153. kiln_ai/utils/validation.py +90 -0
  154. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/METADATA +12 -5
  155. kiln_ai-0.21.0.dist-info/RECORD +211 -0
  156. kiln_ai-0.19.0.dist-info/RECORD +0 -115
  157. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/WHEEL +0 -0
  158. {kiln_ai-0.19.0.dist-info → kiln_ai-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,252 @@
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import Dict, Literal
4
+
5
+ from kiln_ai.adapters.rag.deduplication import (
6
+ deduplicate_chunk_embeddings,
7
+ deduplicate_chunked_documents,
8
+ deduplicate_extractions,
9
+ filter_documents_by_tags,
10
+ )
11
+ from kiln_ai.adapters.vector_store.vector_store_registry import (
12
+ vector_store_adapter_for_config,
13
+ )
14
+ from kiln_ai.datamodel.project import Project
15
+ from kiln_ai.datamodel.rag import RagConfig
16
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig
17
+ from pydantic import BaseModel, Field
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class LogMessage(BaseModel):
    # A single user-facing log entry attached to a RagProgress update.
    # NOTE: documented with comments rather than a class docstring so the
    # generated model schema is left untouched.

    # Severity of the entry; restricted to the three literal values below.
    level: Literal["info", "error", "warning"] = Field(
        description="The level of the log message"
    )

    # Human-readable text of the entry.
    message: str = Field(description="The message to display to the user")
29
+
30
+
31
class RagProgress(BaseModel):
    # Counters describing how far a RAG config has progressed through its
    # pipeline steps (extraction, chunking, embedding, indexing), plus optional
    # log messages for the user. Every counter defaults to 0.
    # NOTE: documented with comments rather than a class docstring so the
    # generated model schema is left untouched.

    # --- overall document progress ---
    total_document_count: int = Field(
        default=0,
        description="The total number of items to process",
    )
    total_document_completed_count: int = Field(
        default=0,
        description="The number of items that have been processed",
    )

    # --- indexing progress ---
    # Progress for indexing is tracked in terms of chunks, not documents. After
    # the initial run the only info we have is how many chunks are in the vector
    # store, and so we need to know the total number of chunks that should be
    # indexed to know if it is completed or not. So we need to set and send that
    # through to the client once we know it (after completing chunking).
    total_chunk_count: int = Field(
        default=0,
        description="The number of chunks that should be indexed for the indexing to be completed.",
    )
    total_chunk_completed_count: int = Field(
        default=0,
        description="The number of chunks that have been indexed",
    )

    # --- extraction step ---
    total_document_extracted_count: int = Field(
        default=0,
        description="The number of items that have been extracted",
    )
    total_document_extracted_error_count: int = Field(
        default=0,
        description="The number of items that have errored during extraction",
    )

    # --- chunking step ---
    total_document_chunked_count: int = Field(
        default=0,
        description="The number of items that have been chunked",
    )
    total_document_chunked_error_count: int = Field(
        default=0,
        description="The number of items that have errored during chunking",
    )

    # --- embedding step ---
    total_document_embedded_count: int = Field(
        default=0,
        description="The number of items that have been embedded",
    )
    total_document_embedded_error_count: int = Field(
        default=0,
        description="The number of items that have errored during embedding",
    )

    # --- vector store indexing step ---
    total_chunks_indexed_count: int = Field(
        default=0,
        description="The number of chunks that have been indexed",
    )
    total_chunks_indexed_error_count: int = Field(
        default=0,
        description="The number of chunks that have errored during indexing",
    )

    # Optional messages to surface to the user; None when there are none.
    logs: list[LogMessage] | None = Field(
        default=None,
        description="A list of log messages to display to the user",
    )
100
+
101
+
102
async def count_records_in_vector_store(
    rag_config: RagConfig,
    vector_store_config: VectorStoreConfig,
) -> int:
    """Return the record count reported by the vector store for a RAG config.

    Args:
        rag_config: The RAG config the vector store adapter is built for.
        vector_store_config: The vector store configuration to build it from.

    Returns:
        Whatever ``count_records()`` of the resolved adapter reports.
    """
    adapter = await vector_store_adapter_for_config(rag_config, vector_store_config)
    return await adapter.count_records()
111
+
112
+
113
async def count_records_in_vector_store_for_rag_config(
    project: Project,
    rag_config: RagConfig,
) -> int:
    """Resolve a RAG config's vector store config and count its records.

    Args:
        project: Project whose ``path`` is used to look up the vector store config.
        rag_config: RAG config whose ``vector_store_config_id`` is resolved.

    Returns:
        The record count from ``count_records_in_vector_store``.

    Raises:
        ValueError: If the vector store config lookup returns ``None``.
    """
    config_id = str(rag_config.vector_store_config_id)
    resolved_config = VectorStoreConfig.from_id_and_parent_path(config_id, project.path)

    if resolved_config is not None:
        return await count_records_in_vector_store(rag_config, resolved_config)

    raise ValueError(f"Rag config {rag_config.id} has no vector store config")
124
+
125
+
126
def _document_in_rag_config_scope(document_tags, rag_config_tags) -> bool:
    """Return whether a document with ``document_tags`` is covered by a RAG config.

    A config without tags covers every document; otherwise the document must
    carry at least one of the config's tags.
    """
    if not rag_config_tags:
        return True
    return bool(document_tags) and any(
        tag in document_tags for tag in rag_config_tags
    )


async def compute_current_progress_for_rag_configs(
    project: Project,
    rag_configs: list[RagConfig],
) -> Dict[str, RagProgress]:
    """Compute the current pipeline progress for several RAG configs in one pass.

    The project's documents are loaded once and walked once
    (document -> extraction -> chunked document -> chunk embedding); every
    artifact found is credited to each RAG config whose
    extractor/chunker/embedding path it belongs to and whose tags include the
    document.

    Args:
        project: Project whose documents are inspected.
        rag_configs: RAG configs to compute progress for.

    Returns:
        A mapping of ``str(rag_config.id)`` to its ``RagProgress``.

    Raises:
        ValueError: Propagated from
            ``count_records_in_vector_store_for_rag_config`` when a RAG config
            has no vector store config.
    """
    # each RAG config represents a unique path: extractor::chunker::embedding
    # different configs can share common prefixes
    # (e.g., extractor-1::chunker-2 for both extractor-1::chunker-2::embedding-3
    # and extractor-1::chunker-2::embedding-4)
    # we store prefix -> {rag config ids} mappings so at every step
    # we know all the configs that share the same upstream steps
    path_prefixes: dict[str, set[str]] = defaultdict(set)
    for rag_config in rag_configs:
        rag_config_id = str(rag_config.id)
        prefix_parts: list[str] = []
        for step_config_id in (
            rag_config.extractor_config_id,
            rag_config.chunker_config_id,
            rag_config.embedding_config_id,
        ):
            prefix_parts.append(str(step_config_id))
            path_prefixes["::".join(prefix_parts)].add(rag_config_id)

    # Load the documents once: the list does not depend on the RAG config, so
    # re-reading it per config (and again for the walk below) is wasted work,
    # and a single snapshot keeps the totals and the walk consistent.
    all_documents = project.documents(readonly=True)

    rag_config_progress_map: dict[str, RagProgress] = {}
    # rag_config_id -> tags, for tag-scope checks during the walk
    rag_config_tags_map = {}
    for rag_config in rag_configs:
        rag_config_id = str(rag_config.id)
        rag_config_tags_map[rag_config_id] = rag_config.tags

        documents_in_scope = all_documents
        if rag_config.tags:
            documents_in_scope = filter_documents_by_tags(
                all_documents, rag_config.tags
            )

        # The zero counters are passed explicitly (not left to defaults) so the
        # set of explicitly-set fields on the model stays the same as before.
        rag_config_progress_map[rag_config_id] = RagProgress(
            total_document_count=len(documents_in_scope),
            total_document_completed_count=0,
            total_chunk_count=0,
            total_chunk_completed_count=0,
            total_document_extracted_count=0,
            total_document_chunked_count=0,
            total_document_embedded_count=0,
            total_chunks_indexed_count=await count_records_in_vector_store_for_rag_config(
                project, rag_config
            ),
        )

    # Looked up with .get() below so unknown prefixes do not grow the defaultdict.
    no_configs: frozenset[str] = frozenset()

    for document in all_documents:
        # Tag scope depends only on the document, so resolve it once here
        # instead of once per extraction / chunking / embedding.
        configs_in_scope = {
            rag_config_id
            for rag_config_id, rag_config_tags in rag_config_tags_map.items()
            if _document_in_rag_config_scope(document.tags, rag_config_tags)
        }
        if not configs_in_scope:
            # No config counts this document, so none of its artifacts can
            # change any counter; skip loading them.
            continue

        for extraction in deduplicate_extractions(document.extractions(readonly=True)):
            extraction_path_prefix = str(extraction.extractor_config_id)

            # credit the extraction to every in-scope config using this extractor
            for rag_config_id in (
                path_prefixes.get(extraction_path_prefix, no_configs)
                & configs_in_scope
            ):
                rag_config_progress_map[
                    rag_config_id
                ].total_document_extracted_count += 1

            for chunked_document in deduplicate_chunked_documents(
                extraction.chunked_documents(readonly=True)
            ):
                chunking_path_prefix = (
                    f"{extraction_path_prefix}::{chunked_document.chunker_config_id}"
                )

                # credit the chunking (and its chunks) to every in-scope config
                # using this extractor+chunker combo
                for rag_config_id in (
                    path_prefixes.get(chunking_path_prefix, no_configs)
                    & configs_in_scope
                ):
                    progress = rag_config_progress_map[rag_config_id]
                    progress.total_document_chunked_count += 1
                    progress.total_chunk_count += len(chunked_document.chunks)

                for embedding in deduplicate_chunk_embeddings(
                    chunked_document.chunk_embeddings(readonly=True)
                ):
                    embedding_path_prefix = (
                        f"{chunking_path_prefix}::{embedding.embedding_config_id}"
                    )

                    # credit the embedding to every in-scope config using this
                    # extractor+chunker+embedding combo
                    for rag_config_id in (
                        path_prefixes.get(embedding_path_prefix, no_configs)
                        & configs_in_scope
                    ):
                        rag_config_progress_map[
                            rag_config_id
                        ].total_document_embedded_count += 1

    # a document is completed only when all steps are completed, so overall
    # progress is the same as the least complete step
    for rag_config_progress in rag_config_progress_map.values():
        rag_config_progress.total_document_completed_count = min(
            rag_config_progress.total_document_extracted_count,
            rag_config_progress.total_document_chunked_count,
            rag_config_progress.total_document_embedded_count,
        )
        rag_config_progress.total_chunk_completed_count = (
            rag_config_progress.total_chunks_indexed_count
        )

    return rag_config_progress_map
241
+
242
+
243
async def compute_current_progress_for_rag_config(
    project: Project,
    rag_config: RagConfig,
) -> RagProgress:
    """Compute the current progress for a single RAG config.

    Delegates to ``compute_current_progress_for_rag_configs`` with a
    one-element list and picks this config's entry out of the result.

    Args:
        project: Project whose documents are inspected.
        rag_config: The RAG config to compute progress for.

    Returns:
        The ``RagProgress`` computed for ``rag_config``.

    Raises:
        ValueError: If the result has no entry for this RAG config's id.
    """
    progress_by_id = await compute_current_progress_for_rag_configs(
        project, [rag_config]
    )
    progress = progress_by_id.get(str(rag_config.id))
    if progress is None:
        raise ValueError(f"Failed to compute progress for rag config {rag_config.id}")
    return progress