kiln-ai 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/__init__.py +6 -0
- kiln_ai/adapters/adapter_registry.py +43 -226
- kiln_ai/adapters/chunkers/__init__.py +13 -0
- kiln_ai/adapters/chunkers/base_chunker.py +42 -0
- kiln_ai/adapters/chunkers/chunker_registry.py +16 -0
- kiln_ai/adapters/chunkers/fixed_window_chunker.py +39 -0
- kiln_ai/adapters/chunkers/helpers.py +23 -0
- kiln_ai/adapters/chunkers/test_base_chunker.py +63 -0
- kiln_ai/adapters/chunkers/test_chunker_registry.py +28 -0
- kiln_ai/adapters/chunkers/test_fixed_window_chunker.py +346 -0
- kiln_ai/adapters/chunkers/test_helpers.py +75 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +9 -3
- kiln_ai/adapters/embedding/__init__.py +0 -0
- kiln_ai/adapters/embedding/base_embedding_adapter.py +44 -0
- kiln_ai/adapters/embedding/embedding_registry.py +32 -0
- kiln_ai/adapters/embedding/litellm_embedding_adapter.py +199 -0
- kiln_ai/adapters/embedding/test_base_embedding_adapter.py +283 -0
- kiln_ai/adapters/embedding/test_embedding_registry.py +166 -0
- kiln_ai/adapters/embedding/test_litellm_embedding_adapter.py +1149 -0
- kiln_ai/adapters/eval/eval_runner.py +6 -2
- kiln_ai/adapters/eval/test_base_eval.py +1 -3
- kiln_ai/adapters/eval/test_g_eval.py +1 -1
- kiln_ai/adapters/extractors/__init__.py +18 -0
- kiln_ai/adapters/extractors/base_extractor.py +72 -0
- kiln_ai/adapters/extractors/encoding.py +20 -0
- kiln_ai/adapters/extractors/extractor_registry.py +44 -0
- kiln_ai/adapters/extractors/extractor_runner.py +112 -0
- kiln_ai/adapters/extractors/litellm_extractor.py +406 -0
- kiln_ai/adapters/extractors/test_base_extractor.py +244 -0
- kiln_ai/adapters/extractors/test_encoding.py +54 -0
- kiln_ai/adapters/extractors/test_extractor_registry.py +181 -0
- kiln_ai/adapters/extractors/test_extractor_runner.py +181 -0
- kiln_ai/adapters/extractors/test_litellm_extractor.py +1290 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +2 -2
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +2 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +2 -6
- kiln_ai/adapters/ml_embedding_model_list.py +494 -0
- kiln_ai/adapters/ml_model_list.py +876 -18
- kiln_ai/adapters/model_adapters/litellm_adapter.py +40 -75
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +79 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter_tools.py +119 -5
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +9 -3
- kiln_ai/adapters/model_adapters/test_structured_output.py +9 -10
- kiln_ai/adapters/ollama_tools.py +69 -12
- kiln_ai/adapters/provider_tools.py +190 -46
- kiln_ai/adapters/rag/deduplication.py +49 -0
- kiln_ai/adapters/rag/progress.py +252 -0
- kiln_ai/adapters/rag/rag_runners.py +844 -0
- kiln_ai/adapters/rag/test_deduplication.py +195 -0
- kiln_ai/adapters/rag/test_progress.py +785 -0
- kiln_ai/adapters/rag/test_rag_runners.py +2376 -0
- kiln_ai/adapters/remote_config.py +80 -8
- kiln_ai/adapters/test_adapter_registry.py +579 -86
- kiln_ai/adapters/test_ml_embedding_model_list.py +239 -0
- kiln_ai/adapters/test_ml_model_list.py +202 -0
- kiln_ai/adapters/test_ollama_tools.py +340 -1
- kiln_ai/adapters/test_prompt_builders.py +1 -1
- kiln_ai/adapters/test_provider_tools.py +199 -8
- kiln_ai/adapters/test_remote_config.py +551 -56
- kiln_ai/adapters/vector_store/__init__.py +1 -0
- kiln_ai/adapters/vector_store/base_vector_store_adapter.py +83 -0
- kiln_ai/adapters/vector_store/lancedb_adapter.py +389 -0
- kiln_ai/adapters/vector_store/test_base_vector_store.py +160 -0
- kiln_ai/adapters/vector_store/test_lancedb_adapter.py +1841 -0
- kiln_ai/adapters/vector_store/test_vector_store_registry.py +199 -0
- kiln_ai/adapters/vector_store/vector_store_registry.py +33 -0
- kiln_ai/datamodel/__init__.py +16 -13
- kiln_ai/datamodel/basemodel.py +201 -4
- kiln_ai/datamodel/chunk.py +158 -0
- kiln_ai/datamodel/datamodel_enums.py +27 -0
- kiln_ai/datamodel/embedding.py +64 -0
- kiln_ai/datamodel/external_tool_server.py +206 -54
- kiln_ai/datamodel/extraction.py +317 -0
- kiln_ai/datamodel/project.py +33 -1
- kiln_ai/datamodel/rag.py +79 -0
- kiln_ai/datamodel/task.py +5 -0
- kiln_ai/datamodel/task_output.py +41 -11
- kiln_ai/datamodel/test_attachment.py +649 -0
- kiln_ai/datamodel/test_basemodel.py +270 -14
- kiln_ai/datamodel/test_chunk_models.py +317 -0
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_datasource.py +50 -0
- kiln_ai/datamodel/test_embedding_models.py +448 -0
- kiln_ai/datamodel/test_eval_model.py +6 -6
- kiln_ai/datamodel/test_external_tool_server.py +534 -152
- kiln_ai/datamodel/test_extraction_chunk.py +206 -0
- kiln_ai/datamodel/test_extraction_model.py +501 -0
- kiln_ai/datamodel/test_rag.py +641 -0
- kiln_ai/datamodel/test_task.py +35 -1
- kiln_ai/datamodel/test_tool_id.py +187 -1
- kiln_ai/datamodel/test_vector_store.py +320 -0
- kiln_ai/datamodel/tool_id.py +58 -0
- kiln_ai/datamodel/vector_store.py +141 -0
- kiln_ai/tools/base_tool.py +12 -3
- kiln_ai/tools/built_in_tools/math_tools.py +12 -4
- kiln_ai/tools/kiln_task_tool.py +158 -0
- kiln_ai/tools/mcp_server_tool.py +2 -2
- kiln_ai/tools/mcp_session_manager.py +51 -22
- kiln_ai/tools/rag_tools.py +164 -0
- kiln_ai/tools/test_kiln_task_tool.py +527 -0
- kiln_ai/tools/test_mcp_server_tool.py +4 -15
- kiln_ai/tools/test_mcp_session_manager.py +187 -227
- kiln_ai/tools/test_rag_tools.py +929 -0
- kiln_ai/tools/test_tool_registry.py +290 -7
- kiln_ai/tools/tool_registry.py +69 -16
- kiln_ai/utils/__init__.py +3 -0
- kiln_ai/utils/async_job_runner.py +62 -17
- kiln_ai/utils/config.py +2 -2
- kiln_ai/utils/env.py +15 -0
- kiln_ai/utils/filesystem.py +14 -0
- kiln_ai/utils/filesystem_cache.py +60 -0
- kiln_ai/utils/litellm.py +94 -0
- kiln_ai/utils/lock.py +100 -0
- kiln_ai/utils/mime_type.py +38 -0
- kiln_ai/utils/open_ai_types.py +19 -2
- kiln_ai/utils/pdf_utils.py +59 -0
- kiln_ai/utils/test_async_job_runner.py +151 -35
- kiln_ai/utils/test_env.py +142 -0
- kiln_ai/utils/test_filesystem_cache.py +316 -0
- kiln_ai/utils/test_litellm.py +206 -0
- kiln_ai/utils/test_lock.py +185 -0
- kiln_ai/utils/test_mime_type.py +66 -0
- kiln_ai/utils/test_open_ai_types.py +88 -12
- kiln_ai/utils/test_pdf_utils.py +86 -0
- kiln_ai/utils/test_uuid.py +111 -0
- kiln_ai/utils/test_validation.py +524 -0
- kiln_ai/utils/uuid.py +9 -0
- kiln_ai/utils/validation.py +90 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/METADATA +9 -1
- kiln_ai-0.22.0.dist-info/RECORD +213 -0
- kiln_ai-0.20.1.dist-info/RECORD +0 -138
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.20.1.dist-info → kiln_ai-0.22.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,317 @@
+import logging
+from enum import Enum
+from typing import TYPE_CHECKING, Any, List, Union
+
+import anyio
+from pydantic import (
+    BaseModel,
+    Field,
+    SerializationInfo,
+    ValidationInfo,
+    computed_field,
+    field_serializer,
+    field_validator,
+    model_validator,
+)
+from typing_extensions import Self
+
+from kiln_ai.datamodel.basemodel import (
+    ID_TYPE,
+    FilenameString,
+    KilnAttachmentModel,
+    KilnParentedModel,
+    KilnParentModel,
+)
+from kiln_ai.datamodel.chunk import ChunkedDocument
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from kiln_ai.datamodel.project import Project
+
+logger = logging.getLogger(__name__)
+
+
+class Kind(str, Enum):
+    DOCUMENT = "document"
+    IMAGE = "image"
+    VIDEO = "video"
+    AUDIO = "audio"
+
+
+class OutputFormat(str, Enum):
+    TEXT = "text/plain"
+    MARKDOWN = "text/markdown"
+
+
+class ExtractorType(str, Enum):
+    LITELLM = "litellm"
+
+
+SUPPORTED_MIME_TYPES = {
+    Kind.DOCUMENT: {
+        "application/pdf",
+        "text/plain",
+        "text/markdown",
+        "text/html",
+        "text/md",
+    },
+    Kind.IMAGE: {
+        "image/png",
+        "image/jpeg",
+    },
+    Kind.VIDEO: {
+        "video/mp4",
+        "video/quicktime",
+    },
+    Kind.AUDIO: {
+        "audio/wav",
+        "audio/mpeg",
+        "audio/ogg",
+    },
+}
+
+
+class ExtractionModel(BaseModel):
+    name: str
+    label: str
+
+
+def validate_prompt(prompt: Any, name: str):
+    if not isinstance(prompt, str):
+        raise ValueError(f"{name} must be a string.")
+    if prompt == "":
+        raise ValueError(f"{name} cannot be empty.")
+
+
+class ExtractionSource(str, Enum):
+    PROCESSED = "processed"
+    PASSTHROUGH = "passthrough"
+
+
+class Extraction(
+    KilnParentedModel, KilnParentModel, parent_of={"chunked_documents": ChunkedDocument}
+):
+    source: ExtractionSource = Field(
+        description="The source of the extraction.",
+    )
+    extractor_config_id: ID_TYPE = Field(
+        description="The ID of the extractor config used to extract the data.",
+    )
+    output: KilnAttachmentModel = Field(
+        description="The extraction output.",
+    )
+
+    def parent_document(self) -> Union["Document", None]:
+        if self.parent is None or self.parent.__class__.__name__ != "Document":
+            return None
+        return self.parent  # type: ignore
+
+    async def output_content(self) -> str | None:
+        if not self.path:
+            raise ValueError(
+                "Failed to resolve the path of extraction output attachment because the extraction does not have a path."
+            )
+
+        full_path = self.output.resolve_path(self.path.parent)
+
+        try:
+            return await anyio.Path(full_path).read_text(encoding="utf-8")
+        except Exception as e:
+            logger.error(
+                f"Failed to read extraction output for {full_path}: {e}", exc_info=True
+            )
+            raise ValueError(f"Failed to read extraction output: {e}")
+
+    def chunked_documents(self, readonly: bool = False) -> list[ChunkedDocument]:
+        return super().chunked_documents(readonly=readonly)  # type: ignore
+
+
+class ExtractorConfig(KilnParentedModel):
+    name: FilenameString = Field(
+        description="A name to identify the extractor config.",
+    )
+    is_archived: bool = Field(
+        default=False,
+        description="Whether the extractor config is archived. Archived extractor configs are not shown in the UI and are not available for use.",
+    )
+    description: str | None = Field(
+        default=None, description="The description of the extractor config"
+    )
+    model_provider_name: str = Field(
+        description="The name of the model provider to use for the extractor config.",
+    )
+    model_name: str = Field(
+        description="The name of the model to use for the extractor config.",
+    )
+    output_format: OutputFormat = Field(
+        default=OutputFormat.MARKDOWN,
+        description="The format to use for the output.",
+    )
+    passthrough_mimetypes: list[OutputFormat] = Field(
+        default_factory=list,
+        description="If the mimetype is in this list, the extractor will not be used and the text content of the file will be returned as is.",
+    )
+    extractor_type: ExtractorType = Field(
+        description="This is used to determine the type of extractor to use.",
+    )
+    properties: dict[str, str | int | float | bool | dict[str, str] | None] = Field(
+        default_factory=dict,
+        description="Properties to be used to execute the extractor config. This is extractor_type specific and should serialize to a json dict.",
+    )
+
+    @field_validator("properties")
+    @classmethod
+    def validate_properties(
+        cls, properties: dict[str, Any], info: ValidationInfo
+    ) -> dict[str, Any]:
+        def get_property(key: str) -> str:
+            value = properties.get(key)
+            if value is None or value == "" or not isinstance(value, str):
+                raise ValueError(f"Prompt for {key} must be a string")
+            return value
+
+        return {
+            "prompt_document": get_property(
+                "prompt_document",
+            ),
+            "prompt_image": get_property(
+                "prompt_image",
+            ),
+            "prompt_video": get_property(
+                "prompt_video",
+            ),
+            "prompt_audio": get_property(
+                "prompt_audio",
+            ),
+        }
+
+    def prompt_document(self) -> str | None:
+        prompt = self.properties.get("prompt_document")
+        if prompt is None:
+            return None
+        if not isinstance(prompt, str):
+            raise ValueError(
+                "Invalid prompt_document. prompt_document must be a string."
+            )
+        return prompt
+
+    def prompt_video(self) -> str | None:
+        prompt = self.properties.get("prompt_video")
+        if prompt is None:
+            return None
+        if not isinstance(prompt, str):
+            raise ValueError("Invalid prompt_video. prompt_video must be a string.")
+        return prompt
+
+    def prompt_audio(self) -> str | None:
+        prompt = self.properties.get("prompt_audio")
+        if prompt is None:
+            return None
+        if not isinstance(prompt, str):
+            raise ValueError("Invalid prompt_audio. prompt_audio must be a string.")
+        return prompt
+
+    def prompt_image(self) -> str | None:
+        prompt = self.properties.get("prompt_image")
+        if prompt is None:
+            return None
+        if not isinstance(prompt, str):
+            raise ValueError("Invalid prompt_image. prompt_image must be a string.")
+        return prompt
+
+    # Workaround to return typed parent without importing Project
+    def parent_project(self) -> Union["Project", None]:
+        if self.parent is None or self.parent.__class__.__name__ != "Project":
+            return None
+        return self.parent  # type: ignore
+
+
+class FileInfo(BaseModel):
+    filename: str = Field(description="The filename of the file")
+
+    size: int = Field(description="The size of the file in bytes")
+
+    mime_type: str = Field(description="The MIME type of the file")
+
+    attachment: KilnAttachmentModel = Field(
+        description="The attachment to the file",
+    )
+
+    @field_serializer("attachment")
+    def serialize_attachment(
+        self, attachment: KilnAttachmentModel, info: SerializationInfo
+    ) -> dict:
+        context = info.context or {}
+        context["filename_prefix"] = "attachment"
+        return attachment.model_dump(mode="json", context=context)
+
+    @field_validator("mime_type")
+    @classmethod
+    def validate_mime_type(cls, mime_type: str, info: ValidationInfo) -> str:
+        filename = info.data.get("filename") or ""
+
+        for mime_types in SUPPORTED_MIME_TYPES.values():
+            if mime_type in mime_types:
+                return mime_type
+        raise ValueError(f"MIME type is not supported: {mime_type} (for {filename})")
+
+
+class Document(
+    KilnParentedModel, KilnParentModel, parent_of={"extractions": Extraction}
+):
+    # this field should not be changed after creation
+    name: FilenameString = Field(
+        description="A name to identify the document.",
+    )
+
+    # this field can be changed after creation
+    name_override: str | None = Field(
+        description="A friendly name to identify the document. This is used for display purposes and can be different from the name.",
+        default=None,
+    )
+
+    description: str = Field(description="A description for the file")
+
+    original_file: FileInfo = Field(description="The original file")
+
+    kind: Kind = Field(
+        description="The kind of document. The kind is a broad family of filetypes that can be handled in a similar way"
+    )
+
+    tags: List[str] = Field(
+        default_factory=list,
+        description="Tags for the document. Tags are used to categorize documents for filtering and reporting.",
+    )
+
+    @model_validator(mode="after")
+    def validate_tags(self) -> Self:
+        for tag in self.tags:
+            if not tag:
+                raise ValueError("Tags cannot be empty strings")
+            if " " in tag:
+                raise ValueError("Tags cannot contain spaces. Try underscores.")
+
+        return self
+
+    # Workaround to return typed parent without importing Project
+    def parent_project(self) -> Union["Project", None]:
+        if self.parent is None or self.parent.__class__.__name__ != "Project":
+            return None
+        return self.parent  # type: ignore
+
+    def extractions(self, readonly: bool = False) -> list[Extraction]:
+        return super().extractions(readonly=readonly)  # type: ignore
+
+    @computed_field
+    @property
+    def friendly_name(self) -> str:
+        # backward compatibility: old documents did not have name_override
+        return self.name_override or self.name
+
+
+def get_kind_from_mime_type(mime_type: str) -> Kind | None:
+    for kind, mime_types in SUPPORTED_MIME_TYPES.items():
+        if mime_type in mime_types:
+            return kind
+    return None
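The extraction datamodel above is plain Pydantic, so an extractor config can be built directly from the fields it declares. A minimal, untested sketch with illustrative model and prompt values (the validate_properties validator above requires all four prompt_* keys to be non-empty strings):

from kiln_ai.datamodel.extraction import (
    ExtractorConfig,
    ExtractorType,
    OutputFormat,
    get_kind_from_mime_type,
)

# Illustrative values only; swap in the provider/model you actually use.
config = ExtractorConfig(
    name="pdf_extractor",
    model_provider_name="openai",
    model_name="gpt-4o-mini",
    extractor_type=ExtractorType.LITELLM,
    output_format=OutputFormat.MARKDOWN,
    properties={
        "prompt_document": "Extract the text of this document as markdown.",
        "prompt_image": "Describe this image.",
        "prompt_video": "Describe this video.",
        "prompt_audio": "Transcribe this audio.",
    },
)

# Kind lookup mirrors SUPPORTED_MIME_TYPES above.
assert get_kind_from_mime_type("application/pdf") is not None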
kiln_ai/datamodel/project.py
CHANGED
@@ -1,13 +1,27 @@
 from pydantic import Field
 
 from kiln_ai.datamodel.basemodel import FilenameString, KilnParentModel
+from kiln_ai.datamodel.chunk import ChunkerConfig
+from kiln_ai.datamodel.embedding import EmbeddingConfig
 from kiln_ai.datamodel.external_tool_server import ExternalToolServer
+from kiln_ai.datamodel.extraction import Document, ExtractorConfig
+from kiln_ai.datamodel.rag import RagConfig
 from kiln_ai.datamodel.task import Task
+from kiln_ai.datamodel.vector_store import VectorStoreConfig
 
 
 class Project(
     KilnParentModel,
-    parent_of={
+    parent_of={
+        "tasks": Task,
+        "documents": Document,
+        "extractor_configs": ExtractorConfig,
+        "chunker_configs": ChunkerConfig,
+        "embedding_configs": EmbeddingConfig,
+        "rag_configs": RagConfig,
+        "vector_store_configs": VectorStoreConfig,
+        "external_tool_servers": ExternalToolServer,
+    },
 ):
     """
     A collection of related tasks.
@@ -26,5 +40,23 @@ class Project(
     def tasks(self) -> list[Task]:
         return super().tasks()  # type: ignore
 
+    def documents(self, readonly: bool = False) -> list[Document]:
+        return super().documents(readonly=readonly)  # type: ignore
+
+    def extractor_configs(self, readonly: bool = False) -> list[ExtractorConfig]:
+        return super().extractor_configs(readonly=readonly)  # type: ignore
+
+    def chunker_configs(self, readonly: bool = False) -> list[ChunkerConfig]:
+        return super().chunker_configs(readonly=readonly)  # type: ignore
+
+    def embedding_configs(self, readonly: bool = False) -> list[EmbeddingConfig]:
+        return super().embedding_configs(readonly=readonly)  # type: ignore
+
+    def vector_store_configs(self, readonly: bool = False) -> list[VectorStoreConfig]:
+        return super().vector_store_configs(readonly=readonly)  # type: ignore
+
+    def rag_configs(self, readonly: bool = False) -> list[RagConfig]:
+        return super().rag_configs(readonly=readonly)  # type: ignore
+
     def external_tool_servers(self, readonly: bool = False) -> list[ExternalToolServer]:
         return super().external_tool_servers(readonly=readonly)  # type: ignore
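Each new parent_of entry gets a typed accessor mirroring the existing tasks() helper, so the new child collections are read the same way. A small sketch using only the accessors and fields shown in this diff:

from kiln_ai.datamodel.project import Project
from kiln_ai.datamodel.rag import RagConfig


def active_rag_configs(project: Project) -> list[RagConfig]:
    # rag_configs() is one of the accessors added above; is_archived is defined on RagConfig.
    return [config for config in project.rag_configs() if not config.is_archived]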
kiln_ai/datamodel/rag.py
ADDED
@@ -0,0 +1,79 @@
+from typing import TYPE_CHECKING, Union
+
+from pydantic import Field, model_validator
+
+from kiln_ai.datamodel.basemodel import ID_TYPE, FilenameString, KilnParentedModel
+from kiln_ai.utils.validation import ToolNameString
+
+if TYPE_CHECKING:
+    from kiln_ai.datamodel.project import Project
+
+
+class RagConfig(KilnParentedModel):
+    name: FilenameString = Field(
+        description="A name to identify this RAG configuration for your own reference.",
+    )
+
+    is_archived: bool = Field(
+        default=False,
+        description="Whether the RAG configuration is archived. Archived RAG configurations are not shown in the UI and are not available for use.",
+    )
+
+    description: str | None = Field(
+        default=None,
+        description="A description of the RAG configuration for you and your team. Will not be used in prompts/training/validation.",
+    )
+
+    tool_name: ToolNameString = Field(
+        description="A name for the model to identify the Search Tool in conversations.",
+    )
+
+    tool_description: str = Field(
+        description="A description of the purpose of the tool. The model will use this description to understand the tool's capabilities.",
+        max_length=128,
+    )
+
+    extractor_config_id: ID_TYPE = Field(
+        description="The ID of the extractor config used to extract the documents.",
+    )
+
+    chunker_config_id: ID_TYPE = Field(
+        description="The ID of the chunker config used to chunk the documents.",
+    )
+
+    embedding_config_id: ID_TYPE = Field(
+        description="The ID of the embedding config used to embed the documents.",
+    )
+
+    vector_store_config_id: ID_TYPE = Field(
+        description="The ID of the vector store config used to store the documents.",
+    )
+
+    tags: list[str] | None = Field(
+        default=None,
+        description="List of document tags to filter by. If None, all documents in the project are used.",
+    )
+
+    # Workaround to return typed parent without importing Project
+    def parent_project(self) -> Union["Project", None]:
+        if self.parent is None or self.parent.__class__.__name__ != "Project":
+            return None
+        return self.parent  # type: ignore
+
+    @model_validator(mode="after")
+    def validate_tags(self):
+        if self.tags is not None:
+            if len(self.tags) == 0:
+                raise ValueError("Tags cannot be an empty list.")
+            for tag in self.tags:
+                if not tag:
+                    raise ValueError("Tags cannot be empty.")
+                if " " in tag:
+                    raise ValueError("Tags cannot contain spaces. Try underscores.")
+
+        if self.tool_name.strip() == "":
+            raise ValueError("Tool name cannot be empty.")
+        if self.tool_description.strip() == "":
+            raise ValueError("Tool description cannot be empty.")
+
+        return self
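A RagConfig ties together previously saved extractor, chunker, embedding, and vector store configs by ID. A rough, untested sketch with placeholder IDs (in practice these come from configs already saved on the project):

from kiln_ai.datamodel.rag import RagConfig

rag_config = RagConfig(
    name="docs_search",
    tool_name="search_docs",
    tool_description="Searches project documents and returns the most relevant chunks.",
    extractor_config_id="123456789012",     # placeholder ID
    chunker_config_id="123456789012",       # placeholder ID
    embedding_config_id="123456789012",     # placeholder ID
    vector_store_config_id="123456789012",  # placeholder ID
    tags=["knowledge_base"],  # optional; tags must be non-empty and contain no spaces
)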
kiln_ai/datamodel/task.py
CHANGED
@@ -131,6 +131,11 @@ class Task(
         description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
     )
 
+    default_run_config_id: ID_TYPE | None = Field(
+        default=None,
+        description="ID of the run config to use for this task by default. Must exist in saved run configs for this task.",
+    )
+
     def output_schema(self) -> Dict | None:
         if self.output_json_schema is None:
             return None
kiln_ai/datamodel/task_output.py
CHANGED
@@ -171,6 +171,7 @@ class DataSourceType(str, Enum):
     human = "human"
     synthetic = "synthetic"
     file_import = "file_import"
+    tool_call = "tool_call"
 
 
 class DataSourceProperty(BaseModel):
@@ -189,16 +190,17 @@ class DataSourceProperty(BaseModel):
 
 class DataSource(BaseModel):
     """
-    Represents the origin of data, either human or
+    Represents the origin of data, either human, synthetic, file import, or tool call, with associated properties.
 
-    Properties vary based on the source type - for synthetic sources this includes
-    model information, for human sources this includes creator information
+    Properties vary based on the source type - for synthetic/tool_call sources this includes
+    model information, for human sources this includes creator information, for file imports
+    this includes file information.
     """
 
     type: DataSourceType
     properties: Dict[str, str | int | float] = Field(
         default={},
-        description="Properties describing the data source. For synthetic things like model. For human
+        description="Properties describing the data source. For synthetic things like model. For human: the human's name. For file_import: file information.",
     )
     run_config: Optional[RunConfigProperties] = Field(
         default=None,
@@ -210,43 +212,71 @@ class DataSource(BaseModel):
             name="created_by",
             type=str,
             required_for=[DataSourceType.human],
-            not_allowed_for=[
+            not_allowed_for=[
+                DataSourceType.synthetic,
+                DataSourceType.file_import,
+                DataSourceType.tool_call,
+            ],
         ),
         DataSourceProperty(
             name="model_name",
             type=str,
            required_for=[DataSourceType.synthetic],
-            not_allowed_for=[
+            not_allowed_for=[
+                DataSourceType.human,
+                DataSourceType.file_import,
+                DataSourceType.tool_call,
+            ],
        ),
        DataSourceProperty(
            name="model_provider",
            type=str,
            required_for=[DataSourceType.synthetic],
-            not_allowed_for=[
+            not_allowed_for=[
+                DataSourceType.human,
+                DataSourceType.file_import,
+                DataSourceType.tool_call,
+            ],
        ),
        DataSourceProperty(
            name="adapter_name",
            type=str,
            required_for=[DataSourceType.synthetic],
-            not_allowed_for=[
+            not_allowed_for=[
+                DataSourceType.human,
+                DataSourceType.file_import,
+                DataSourceType.tool_call,
+            ],
        ),
        DataSourceProperty(
            # Legacy field -- allow loading from old runs, but we shouldn't be setting it.
            name="prompt_builder_name",
            type=str,
-            not_allowed_for=[
+            not_allowed_for=[
+                DataSourceType.human,
+                DataSourceType.file_import,
+                DataSourceType.tool_call,
+            ],
        ),
        DataSourceProperty(
            # The PromptId of the prompt. Can be a saved prompt, fine-tune, generator name, etc. See PromptId type for more details.
            name="prompt_id",
            type=str,
-            not_allowed_for=[
+            not_allowed_for=[
+                DataSourceType.human,
+                DataSourceType.file_import,
+                DataSourceType.tool_call,
+            ],
        ),
        DataSourceProperty(
            name="file_name",
            type=str,
            required_for=[DataSourceType.file_import],
-            not_allowed_for=[
+            not_allowed_for=[
+                DataSourceType.human,
+                DataSourceType.synthetic,
+                DataSourceType.tool_call,
+            ],
        ),
    ]
 
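Per the property rules above, the model, prompt, and file properties are not allowed for tool_call sources, while synthetic sources still require model information. A rough sketch of the difference, with illustrative values, assuming DataSource enforces these rules at construction time:

from kiln_ai.datamodel.task_output import DataSource, DataSourceType

# A synthetic source still requires model information (illustrative values).
synthetic = DataSource(
    type=DataSourceType.synthetic,
    properties={
        "model_name": "gpt-4o-mini",
        "model_provider": "openai",
        "adapter_name": "example_adapter",
    },
)

# The new tool_call type forbids those keys, so a bare source is valid.
tool_call = DataSource(type=DataSourceType.tool_call)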