kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (133)
  1. kiln_ai/adapters/__init__.py +6 -0
  2. kiln_ai/adapters/adapter_registry.py +43 -226
  3. kiln_ai/adapters/chunkers/__init__.py +13 -0
  4. kiln_ai/adapters/chunkers/base_chunker.py +42 -0
  5. kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
  6. kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
  7. kiln_ai/adapters/chunkers/helpers.py +23 -0
  8. kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
  9. kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
  10. kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
  11. kiln_ai/adapters/chunkers/test_helpers.py +75 -0
  12. kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
  13. kiln_ai/adapters/embedding/__init__.py +0 -0
  14. kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
  15. kiln_ai/adapters/embedding/embedding_registry.py +32 -0
  16. kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
  17. kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
  18. kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
  19. kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
  20. kiln_ai/adapters/eval/eval_runner.py +6 -2
  21. kiln_ai/adapters/eval/test_base_eval.py +1 -3
  22. kiln_ai/adapters/eval/test_g_eval.py +1 -1
  23. kiln_ai/adapters/extractors/__init__.py +18 -0
  24. kiln_ai/adapters/extractors/base_extractor.py +72 -0
  25. kiln_ai/adapters/extractors/encoding.py +20 -0
  26. kiln_ai/adapters/extractors/extractor_registry.py +44 -0
  27. kiln_ai/adapters/extractors/extractor_runner.py +112 -0
  28. kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
  29. kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
  30. kiln_ai/adapters/extractors/test_encoding.py +54 -0
  31. kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
  32. kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
  33. kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
  34. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
  35. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
  36. kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
  37. kiln_ai/adapters/ml_embedding_model_list.py +494 -0
  38. kiln_ai/adapters/ml_model_list.py +876 -18
  39. kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
  40. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
  41. kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
  42. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
  43. kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
  44. kiln_ai/adapters/ollama_tools.py +69 -12
  45. kiln_ai/adapters/provider_tools.py +190 -46
  46. kiln_ai/adapters/rag/deduplication.py +49 -0
  47. kiln_ai/adapters/rag/progress.py +252 -0
  48. kiln_ai/adapters/rag/rag_runners.py +844 -0
  49. kiln_ai/adapters/rag/test_deduplication.py +195 -0
  50. kiln_ai/adapters/rag/test_progress.py +785 -0
  51. kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
  52. kiln_ai/adapters/remote_config.py +80 -8
  53. kiln_ai/adapters/test_adapter_registry.py +579 -86
  54. kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
  55. kiln_ai/adapters/test_ml_model_list.py +202 -0
  56. kiln_ai/adapters/test_ollama_tools.py +340 -1
  57. kiln_ai/adapters/test_prompt_builders.py +1 -1
  58. kiln_ai/adapters/test_provider_tools.py +199 -8
  59. kiln_ai/adapters/test_remote_config.py +551 -56
  60. kiln_ai/adapters/vector_store/__init__.py +1 -0
  61. kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
  62. kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
  63. kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
  64. kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
  65. kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
  66. kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
  67. kiln_ai/datamodel/__init__.py +16 -13
  68. kiln_ai/datamodel/basemodel.py +201 -4
  69. kiln_ai/datamodel/chunk.py +158 -0
  70. kiln_ai/datamodel/datamodel_enums.py +27 -0
  71. kiln_ai/datamodel/embedding.py +64 -0
  72. kiln_ai/datamodel/external_tool_server.py +206 -54
  73. kiln_ai/datamodel/extraction.py +317 -0
  74. kiln_ai/datamodel/project.py +33 -1
  75. kiln_ai/datamodel/rag.py +79 -0
  76. kiln_ai/datamodel/task.py +5 -0
  77. kiln_ai/datamodel/task_output.py +41 -11
  78. kiln_ai/datamodel/test_attachment.py +649 -0
  79. kiln_ai/datamodel/test_basemodel.py +270 -14
  80. kiln_ai/datamodel/test_chunk_models.py +317 -0
  81. kiln_ai/datamodel/test_dataset_split.py +1 -1
  82. kiln_ai/datamodel/test_datasource.py +50 -0
  83. kiln_ai/datamodel/test_embedding_models.py +448 -0
  84. kiln_ai/datamodel/test_eval_model.py +6 -6
  85. kiln_ai/datamodel/test_external_tool_server.py +534 -152
  86. kiln_ai/datamodel/test_extraction_chunk.py +206 -0
  87. kiln_ai/datamodel/test_extraction_model.py +501 -0
  88. kiln_ai/datamodel/test_rag.py +641 -0
  89. kiln_ai/datamodel/test_task.py +35 -1
  90. kiln_ai/datamodel/test_tool_id.py +187 -1
  91. kiln_ai/datamodel/test_vector_store.py +320 -0
  92. kiln_ai/datamodel/tool_id.py +58 -0
  93. kiln_ai/datamodel/vector_store.py +141 -0
  94. kiln_ai/tools/base_tool.py +12 -3
  95. kiln_ai/tools/built_in_tools/math_tools.py +12 -4
  96. kiln_ai/tools/kiln_task_tool.py +158 -0
  97. kiln_ai/tools/mcp_server_tool.py +2 -2
  98. kiln_ai/tools/mcp_session_manager.py +51 -22
  99. kiln_ai/tools/rag_tools.py +164 -0
  100. kiln_ai/tools/test_kiln_task_tool.py +527 -0
  101. kiln_ai/tools/test_mcp_server_tool.py +4 -15
  102. kiln_ai/tools/test_mcp_session_manager.py +187 -227
  103. kiln_ai/tools/test_rag_tools.py +929 -0
  104. kiln_ai/tools/test_tool_registry.py +290 -7
  105. kiln_ai/tools/tool_registry.py +69 -16
  106. kiln_ai/utils/__init__.py +3 -0
  107. kiln_ai/utils/async_job_runner.py +62 -17
  108. kiln_ai/utils/config.py +2 -2
  109. kiln_ai/utils/env.py +15 -0
  110. kiln_ai/utils/filesystem.py +14 -0
  111. kiln_ai/utils/filesystem_cache.py +60 -0
  112. kiln_ai/utils/litellm.py +94 -0
  113. kiln_ai/utils/lock.py +100 -0
  114. kiln_ai/utils/mime_type.py +38 -0
  115. kiln_ai/utils/open_ai_types.py +19 -2
  116. kiln_ai/utils/pdf_utils.py +59 -0
  117. kiln_ai/utils/test_async_job_runner.py +151 -35
  118. kiln_ai/utils/test_env.py +142 -0
  119. kiln_ai/utils/test_filesystem_cache.py +316 -0
  120. kiln_ai/utils/test_litellm.py +206 -0
  121. kiln_ai/utils/test_lock.py +185 -0
  122. kiln_ai/utils/test_mime_type.py +66 -0
  123. kiln_ai/utils/test_open_ai_types.py +88 -12
  124. kiln_ai/utils/test_pdf_utils.py +86 -0
  125. kiln_ai/utils/test_uuid.py +111 -0
  126. kiln_ai/utils/test_validation.py +524 -0
  127. kiln_ai/utils/uuid.py +9 -0
  128. kiln_ai/utils/validation.py +90 -0
  129. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
  130. kiln_ai-0.22.0.dist-info/RECORD +213 -0
  131. kiln_ai-0.20.1.dist-info/RECORD +0 -138
  132. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
  133. {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,317 @@
1
+ import logging
2
+ from enum import Enum
3
+ from typing import TYPE_CHECKING, Any, List, Union
4
+
5
+ import anyio
6
+ from pydantic import (
7
+ BaseModel,
8
+ Field,
9
+ SerializationInfo,
10
+ ValidationInfo,
11
+ computed_field,
12
+ field_serializer,
13
+ field_validator,
14
+ model_validator,
15
+ )
16
+ from typing_extensions import Self
17
+
18
+ from kiln_ai.datamodel.basemodel import (
19
+ ID_TYPE,
20
+ FilenameString,
21
+ KilnAttachmentModel,
22
+ KilnParentedModel,
23
+ KilnParentModel,
24
+ )
25
+ from kiln_ai.datamodel.chunk import ChunkedDocument
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ if TYPE_CHECKING:
30
+ from kiln_ai.datamodel.project import Project
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
class Kind(str, Enum):
    """Broad family of file types that can be handled in a similar way.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    DOCUMENT = "document"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
40
+
41
+
42
class OutputFormat(str, Enum):
    """Output formats, expressed as MIME type strings."""

    TEXT = "text/plain"
    MARKDOWN = "text/markdown"
45
+
46
+
47
class ExtractorType(str, Enum):
    """Identifies the type of extractor to use."""

    LITELLM = "litellm"
49
+
50
+
51
# Accepted MIME types, grouped by the Kind they belong to.
SUPPORTED_MIME_TYPES: dict[Kind, set[str]] = {
    Kind.DOCUMENT: {
        "application/pdf",
        "text/plain",
        "text/markdown",
        "text/html",
        # NOTE(review): "text/md" is not a registered MIME type; presumably
        # accepted as an alias for Markdown -- confirm against callers.
        "text/md",
    },
    Kind.IMAGE: {
        "image/png",
        "image/jpeg",
    },
    Kind.VIDEO: {
        "video/mp4",
        "video/quicktime",
    },
    Kind.AUDIO: {
        "audio/wav",
        "audio/mpeg",
        "audio/ogg",
    },
}
73
+
74
+
75
class ExtractionModel(BaseModel):
    """A model entry made of a ``name`` and a ``label``, both plain strings.

    NOTE(review): how the two fields are consumed is not visible in this
    module -- presumably ``label`` is the display text; confirm with callers.
    """

    name: str
    label: str
78
+
79
+
80
def validate_prompt(prompt: Any, name: str) -> None:
    """Check that ``prompt`` is a non-empty string.

    Args:
        prompt: The value to validate.
        name: Label for the value; used as the prefix of the error message.

    Raises:
        ValueError: If ``prompt`` is not a ``str`` or is the empty string.
            Whitespace-only strings are accepted.
    """
    if not isinstance(prompt, str):
        raise ValueError(f"{name} must be a string.")
    if prompt == "":
        raise ValueError(f"{name} cannot be empty.")
85
+
86
+
87
class ExtractionSource(str, Enum):
    """The source of an extraction."""

    # NOTE(review): presumably PROCESSED means an extractor produced the
    # output and PASSTHROUGH means the file content was used as is -- confirm.
    PROCESSED = "processed"
    PASSTHROUGH = "passthrough"
90
+
91
+
92
class Extraction(
    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
):
    """An extraction output stored as an attachment.

    Declared as the parent of ``ChunkedDocument`` children
    (``chunked_documents``).
    """

    source: ExtractionSource = Field(
        description="The source of the extraction.",
    )
    extractor_config_id: ID_TYPE = Field(
        description="The ID of the extractor config used to extract the data.",
    )
    output: KilnAttachmentModel = Field(
        description="The extraction output.",
    )

    def parent_document(self) -> Union["Document", None]:
        """Return the parent when it is a ``Document``, otherwise ``None``."""
        # Compared by class name: Document is declared further down the module.
        if self.parent is None or self.parent.__class__.__name__ != "Document":
            return None
        return self.parent  # type: ignore

    async def output_content(self) -> str | None:
        """Read the extraction output attachment as UTF-8 text.

        Returns:
            The text content of the output attachment.

        Raises:
            ValueError: If this extraction has no ``path``, or if reading the
                attachment fails (the original error is chained as the cause).
        """
        if not self.path:
            raise ValueError(
                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
            )

        # The attachment is resolved relative to the directory containing
        # this model's own file.
        full_path = self.output.resolve_path(self.path.parent)

        try:
            return await anyio.Path(full_path).read_text(encoding="utf-8")
        except Exception as e:
            logger.error(
                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
            )
            # Chain the cause so the underlying I/O error stays in the traceback.
            raise ValueError(f"Failed to read extraction output: {e}") from e

    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
        """Typed wrapper around the parent-model accessor for child chunked documents."""
        return super().chunked_documents(readonly=readonly)  # type: ignore
128
+
129
+
130
class ExtractorConfig(KilnParentedModel):
    """Configuration for an extractor: model, output format and per-kind prompts.

    ``properties`` must contain a non-empty string for each of
    ``prompt_document``, ``prompt_image``, ``prompt_video`` and
    ``prompt_audio``; validation fails otherwise.
    """

    name: FilenameString = Field(
        description="A name to identify the extractor config.",
    )
    is_archived: bool = Field(
        default=False,
        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
    )
    description: str | None = Field(
        default=None, description="The description of the extractor config"
    )
    model_provider_name: str = Field(
        description="The name of the model provider to use for the extractor config.",
    )
    model_name: str = Field(
        description="The name of the model to use for the extractor config.",
    )
    output_format: OutputFormat = Field(
        default=OutputFormat.MARKDOWN,
        description="The format to use for the output.",
    )
    passthrough_mimetypes: list[OutputFormat] = Field(
        default_factory=list,
        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
    )
    extractor_type: ExtractorType = Field(
        description="This is used to determine the type of extractor to use.",
    )
    properties: dict[str, str | int | float | bool | dict[str, str] | None] = Field(
        default_factory=dict,
        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
    )

    @field_validator("properties")
    @classmethod
    def validate_properties(
        cls, properties: dict[str, Any], info: ValidationInfo
    ) -> dict[str, Any]:
        """Require the four prompt properties and return only those keys.

        Any other keys present in ``properties`` are not carried over into
        the returned dict.

        Raises:
            ValueError: If any required prompt is missing, empty, or not a
                string.
        """

        def get_property(key: str) -> str:
            value = properties.get(key)
            if value is None or value == "" or not isinstance(value, str):
                raise ValueError(f"Prompt for {key} must be a string")
            return value

        return {
            "prompt_document": get_property(
                "prompt_document",
            ),
            "prompt_image": get_property(
                "prompt_image",
            ),
            "prompt_video": get_property(
                "prompt_video",
            ),
            "prompt_audio": get_property(
                "prompt_audio",
            ),
        }

    def prompt_document(self) -> str | None:
        """Return the ``prompt_document`` property, or ``None`` if unset.

        Raises:
            ValueError: If the stored value is not a string.
        """
        prompt = self.properties.get("prompt_document")
        if prompt is None:
            return None
        if not isinstance(prompt, str):
            raise ValueError(
                "Invalid prompt_document. prompt_document must be a string."
            )
        return prompt

    def prompt_video(self) -> str | None:
        """Return the ``prompt_video`` property, or ``None`` if unset.

        Raises:
            ValueError: If the stored value is not a string.
        """
        prompt = self.properties.get("prompt_video")
        if prompt is None:
            return None
        if not isinstance(prompt, str):
            raise ValueError("Invalid prompt_video. prompt_video must be a string.")
        return prompt

    def prompt_audio(self) -> str | None:
        """Return the ``prompt_audio`` property, or ``None`` if unset.

        Raises:
            ValueError: If the stored value is not a string.
        """
        prompt = self.properties.get("prompt_audio")
        if prompt is None:
            return None
        if not isinstance(prompt, str):
            raise ValueError("Invalid prompt_audio. prompt_audio must be a string.")
        return prompt

    def prompt_image(self) -> str | None:
        """Return the ``prompt_image`` property, or ``None`` if unset.

        Raises:
            ValueError: If the stored value is not a string.
        """
        prompt = self.properties.get("prompt_image")
        if prompt is None:
            return None
        if not isinstance(prompt, str):
            raise ValueError("Invalid prompt_image. prompt_image must be a string.")
        return prompt

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when it is a ``Project``, otherwise ``None``."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore
228
+
229
+
230
class FileInfo(BaseModel):
    """Metadata about a file plus the attachment holding it."""

    filename: str = Field(description="The filename of the file")

    size: int = Field(description="The size of the file in bytes")

    mime_type: str = Field(description="The MIME type of the file")

    attachment: KilnAttachmentModel = Field(
        description="The attachment to the file",
    )

    @field_serializer("attachment")
    def serialize_attachment(
        self, attachment: KilnAttachmentModel, info: SerializationInfo
    ) -> dict:
        """Serialize the attachment with ``filename_prefix`` set to ``"attachment"``.

        The incoming serialization context is copied before the prefix is
        added, so the caller's context object is not modified and the prefix
        cannot leak into the serialization of other fields sharing it.
        """
        context = dict(info.context or {})
        context["filename_prefix"] = "attachment"
        return attachment.model_dump(mode="json", context=context)

    @field_validator("mime_type")
    @classmethod
    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
        """Accept ``mime_type`` only if it is listed in ``SUPPORTED_MIME_TYPES``.

        Raises:
            ValueError: If the MIME type is not supported; the message includes
                the filename when it has already been validated.
        """
        # "filename" is declared before "mime_type"; it is absent from
        # info.data only if its own validation failed.
        filename = info.data.get("filename") or ""

        for mime_types in SUPPORTED_MIME_TYPES.values():
            if mime_type in mime_types:
                return mime_type
        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
258
+
259
+
260
class Document(
    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
):
    """A document with its original file, kind and tags.

    Declared as the parent of ``Extraction`` children (``extractions``).
    """

    # this field should not be changed after creation
    name: FilenameString = Field(
        description="A name to identify the document.",
    )

    # this field can be changed after creation
    name_override: str | None = Field(
        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
        default=None,
    )

    description: str = Field(description="A description for the file")

    original_file: FileInfo = Field(description="The original file")

    kind: Kind = Field(
        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
    )

    tags: List[str] = Field(
        default_factory=list,
        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
    )

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject empty tags and tags containing a space character.

        Raises:
            ValueError: On the first offending tag.
        """
        for tag in self.tags:
            if not tag:
                raise ValueError("Tags cannot be empty strings")
            if " " in tag:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when it is a ``Project``, otherwise ``None``."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    def extractions(self, readonly: bool = False) -> list[Extraction]:
        """Typed wrapper around the parent-model accessor for child extractions."""
        return super().extractions(readonly=readonly)  # type: ignore

    @computed_field
    @property
    def friendly_name(self) -> str:
        """Display name: ``name_override`` when set and non-empty, else ``name``."""
        # backward compatibility: old documents did not have name_override
        return self.name_override or self.name
311
+
312
+
313
def get_kind_from_mime_type(mime_type: str) -> Kind | None:
    """Return the ``Kind`` whose supported set contains ``mime_type``.

    Args:
        mime_type: The MIME type string to look up (matched exactly).

    Returns:
        The first matching ``Kind`` in ``SUPPORTED_MIME_TYPES``, or ``None``
        when no kind lists the MIME type.
    """
    for kind, mime_types in SUPPORTED_MIME_TYPES.items():
        if mime_type in mime_types:
            return kind
    return None
@@ -1,13 +1,27 @@
1
1
  from pydantic import Field
2
2
 
3
3
  from kiln_ai.datamodel.basemodel import FilenameString, KilnParentModel
4
+ from kiln_ai.datamodel.chunk import ChunkerConfig
5
+ from kiln_ai.datamodel.embedding import EmbeddingConfig
4
6
  from kiln_ai.datamodel.external_tool_server import ExternalToolServer
7
+ from kiln_ai.datamodel.extraction import Document, ExtractorConfig
8
+ from kiln_ai.datamodel.rag import RagConfig
5
9
  from kiln_ai.datamodel.task import Task
10
+ from kiln_ai.datamodel.vector_store import VectorStoreConfig
6
11
 
7
12
 
8
13
  class Project(
9
14
  KilnParentModel,
10
- parent_of={"tasks": Task, "external_tool_servers": ExternalToolServer},
15
+ parent_of={
16
+ "tasks": Task,
17
+ "documents": Document,
18
+ "extractor_configs": ExtractorConfig,
19
+ "chunker_configs": ChunkerConfig,
20
+ "embedding_configs": EmbeddingConfig,
21
+ "rag_configs": RagConfig,
22
+ "vector_store_configs": VectorStoreConfig,
23
+ "external_tool_servers": ExternalToolServer,
24
+ },
11
25
  ):
12
26
  """
13
27
  A collection of related tasks.
@@ -26,5 +40,23 @@ class Project(
26
40
  def tasks(self) -> list[Task]:
27
41
  return super().tasks() # type: ignore
28
42
 
43
+ def documents(self, readonly: bool = False) -> list[Document]:
44
+ return super().documents(readonly=readonly) # type: ignore
45
+
46
+ def extractor_configs(self, readonly: bool = False) -> list[ExtractorConfig]:
47
+ return super().extractor_configs(readonly=readonly) # type: ignore
48
+
49
+ def chunker_configs(self, readonly: bool = False) -> list[ChunkerConfig]:
50
+ return super().chunker_configs(readonly=readonly) # type: ignore
51
+
52
+ def embedding_configs(self, readonly: bool = False) -> list[EmbeddingConfig]:
53
+ return super().embedding_configs(readonly=readonly) # type: ignore
54
+
55
+ def vector_store_configs(self, readonly: bool = False) -> list[VectorStoreConfig]:
56
+ return super().vector_store_configs(readonly=readonly) # type: ignore
57
+
58
+ def rag_configs(self, readonly: bool = False) -> list[RagConfig]:
59
+ return super().rag_configs(readonly=readonly) # type: ignore
60
+
29
61
  def external_tool_servers(self, readonly: bool = False) -> list[ExternalToolServer]:
30
62
  return super().external_tool_servers(readonly=readonly) # type: ignore
@@ -0,0 +1,79 @@
1
+ from typing import TYPE_CHECKING, Union
2
+
3
+ from pydantic import Field, model_validator
4
+
5
+ from kiln_ai.datamodel.basemodel import ID_TYPE, FilenameString, KilnParentedModel
6
+ from kiln_ai.utils.validation import ToolNameString
7
+
8
+ if TYPE_CHECKING:
9
+ from kiln_ai.datamodel.project import Project
10
+
11
+
12
class RagConfig(KilnParentedModel):
    """A RAG configuration: search-tool naming plus the IDs of the extractor,
    chunker, embedding and vector store configs it references.
    """

    name: FilenameString = Field(
        description="A name to identify this RAG configuration for your own reference.",
    )

    is_archived: bool = Field(
        default=False,
        description="Whether the RAG configuration is archived. Archived RAG configurations are not shown in the UI and are not available for use.",
    )

    description: str | None = Field(
        default=None,
        description="A description of the RAG configuration for you and your team. Will not be used in prompts/training/validation.",
    )

    tool_name: ToolNameString = Field(
        description="A name for the model to identify the Search Tool in conversations.",
    )

    tool_description: str = Field(
        description="A description of the purpose of the tool. The model will use this description to understand the tool's capabilities.",
        max_length=128,
    )

    extractor_config_id: ID_TYPE = Field(
        description="The ID of the extractor config used to extract the documents.",
    )

    chunker_config_id: ID_TYPE = Field(
        description="The ID of the chunker config used to chunk the documents.",
    )

    embedding_config_id: ID_TYPE = Field(
        description="The ID of the embedding config used to embed the documents.",
    )

    vector_store_config_id: ID_TYPE = Field(
        description="The ID of the vector store config used to store the documents.",
    )

    tags: list[str] | None = Field(
        default=None,
        description="List of document tags to filter by. If None, all documents in the project are used.",
    )

    # Workaround to return typed parent without importing Project
    def parent_project(self) -> Union["Project", None]:
        """Return the parent when it is a ``Project``, otherwise ``None``."""
        if self.parent is None or self.parent.__class__.__name__ != "Project":
            return None
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_tags(self):
        """Validate ``tags``, ``tool_name`` and ``tool_description``.

        ``tags`` may be ``None`` (no filtering) but not an empty list, and no
        tag may be empty or contain a space. The tool name and description
        must contain at least one non-whitespace character.

        Raises:
            ValueError: On the first rule violated.
        """
        if self.tags is not None:
            if not self.tags:
                raise ValueError("Tags cannot be an empty list.")
            for tag in self.tags:
                if not tag:
                    raise ValueError("Tags cannot be empty.")
                if " " in tag:
                    raise ValueError("Tags cannot contain spaces. Try underscores.")

        if self.tool_name.strip() == "":
            raise ValueError("Tool name cannot be empty.")
        if self.tool_description.strip() == "":
            raise ValueError("Tool description cannot be empty.")

        return self
kiln_ai/datamodel/task.py CHANGED
@@ -131,6 +131,11 @@ class Task(
131
131
  description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
132
132
  )
133
133
 
134
+ default_run_config_id: ID_TYPE | None = Field(
135
+ default=None,
136
+ description="ID of the run config to use for this task by default. Must exist in saved run configs for this task.",
137
+ )
138
+
134
139
  def output_schema(self) -> Dict | None:
135
140
  if self.output_json_schema is None:
136
141
  return None
@@ -171,6 +171,7 @@ class DataSourceType(str, Enum):
171
171
  human = "human"
172
172
  synthetic = "synthetic"
173
173
  file_import = "file_import"
174
+ tool_call = "tool_call"
174
175
 
175
176
 
176
177
  class DataSourceProperty(BaseModel):
@@ -189,16 +190,17 @@ class DataSourceProperty(BaseModel):
189
190
 
190
191
  class DataSource(BaseModel):
191
192
  """
192
- Represents the origin of data, either human or synthetic, with associated properties.
193
+ Represents the origin of data, either human, synthetic, file import, or tool call, with associated properties.
193
194
 
194
- Properties vary based on the source type - for synthetic sources this includes
195
- model information, for human sources this includes creator information.
195
+ Properties vary based on the source type - for synthetic/tool_call sources this includes
196
+ model information, for human sources this includes creator information, for file imports
197
+ this includes file information.
196
198
  """
197
199
 
198
200
  type: DataSourceType
199
201
  properties: Dict[str, str | int | float] = Field(
200
202
  default={},
201
- description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
203
+ description="Properties describing the data source. For synthetic things like model. For human: the human's name. For file_import: file information.",
202
204
  )
203
205
  run_config: Optional[RunConfigProperties] = Field(
204
206
  default=None,
@@ -210,43 +212,71 @@ class DataSource(BaseModel):
210
212
  name="created_by",
211
213
  type=str,
212
214
  required_for=[DataSourceType.human],
213
- not_allowed_for=[DataSourceType.synthetic, DataSourceType.file_import],
215
+ not_allowed_for=[
216
+ DataSourceType.synthetic,
217
+ DataSourceType.file_import,
218
+ DataSourceType.tool_call,
219
+ ],
214
220
  ),
215
221
  DataSourceProperty(
216
222
  name="model_name",
217
223
  type=str,
218
224
  required_for=[DataSourceType.synthetic],
219
- not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
225
+ not_allowed_for=[
226
+ DataSourceType.human,
227
+ DataSourceType.file_import,
228
+ DataSourceType.tool_call,
229
+ ],
220
230
  ),
221
231
  DataSourceProperty(
222
232
  name="model_provider",
223
233
  type=str,
224
234
  required_for=[DataSourceType.synthetic],
225
- not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
235
+ not_allowed_for=[
236
+ DataSourceType.human,
237
+ DataSourceType.file_import,
238
+ DataSourceType.tool_call,
239
+ ],
226
240
  ),
227
241
  DataSourceProperty(
228
242
  name="adapter_name",
229
243
  type=str,
230
244
  required_for=[DataSourceType.synthetic],
231
- not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
245
+ not_allowed_for=[
246
+ DataSourceType.human,
247
+ DataSourceType.file_import,
248
+ DataSourceType.tool_call,
249
+ ],
232
250
  ),
233
251
  DataSourceProperty(
234
252
  # Legacy field -- allow loading from old runs, but we shouldn't be setting it.
235
253
  name="prompt_builder_name",
236
254
  type=str,
237
- not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
255
+ not_allowed_for=[
256
+ DataSourceType.human,
257
+ DataSourceType.file_import,
258
+ DataSourceType.tool_call,
259
+ ],
238
260
  ),
239
261
  DataSourceProperty(
240
262
  # The PromptId of the prompt. Can be a saved prompt, fine-tune, generator name, etc. See PromptId type for more details.
241
263
  name="prompt_id",
242
264
  type=str,
243
- not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
265
+ not_allowed_for=[
266
+ DataSourceType.human,
267
+ DataSourceType.file_import,
268
+ DataSourceType.tool_call,
269
+ ],
244
270
  ),
245
271
  DataSourceProperty(
246
272
  name="file_name",
247
273
  type=str,
248
274
  required_for=[DataSourceType.file_import],
249
- not_allowed_for=[DataSourceType.human, DataSourceType.synthetic],
275
+ not_allowed_for=[
276
+ DataSourceType.human,
277
+ DataSourceType.synthetic,
278
+ DataSourceType.tool_call,
279
+ ],
250
280
  ),
251
281
  ]
252
282