kiln-ai 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic.
- kiln_ai/adapters/adapter_registry.py +12 -13
- kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
- kiln_ai/adapters/ml_model_list.py +141 -29
- kiln_ai/adapters/model_adapters/base_adapter.py +50 -35
- kiln_ai/adapters/model_adapters/langchain_adapters.py +27 -20
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -1
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +93 -50
- kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
- kiln_ai/adapters/model_adapters/test_langchain_adapter.py +7 -14
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +55 -64
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
- kiln_ai/adapters/model_adapters/test_structured_output.py +36 -30
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/prompt_builders.py +80 -42
- kiln_ai/adapters/repair/repair_task.py +9 -21
- kiln_ai/adapters/repair/test_repair_task.py +3 -3
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +10 -10
- kiln_ai/adapters/test_generate_docs.py +6 -6
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +17 -14
- kiln_ai/adapters/test_prompt_builders.py +91 -31
- kiln_ai/datamodel/__init__.py +50 -952
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +6 -0
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +10 -11
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +32 -8
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +9 -13
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_models.py +2 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +37 -1
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai-0.11.1.dist-info/RECORD +0 -76
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/__init__.py CHANGED

@@ -1,50 +1,59 @@
 """
-See our docs for details about our datamodel:
-"""
+See our docs for details about our datamodel classes and hierarchy:

-
+Developer docs: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html

-
-
-import random
-from enum import Enum, IntEnum
-from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union
+User docs: https://docs.getkiln.ai/developers/kiln-datamodel
+"""

-
-import
-from pydantic import (
-    BaseModel,
-    Field,
-    ValidationInfo,
-    model_validator,
-)
-from typing_extensions import Self
+# This component uses "flat" imports so we don't have too much internal structure exposed in the API.
+# for example you can just `from datamodel import Task, Project` instead of `from datamodel.task import Task; from datamodel.project import Project`

-from
+from __future__ import annotations

-from .
-
-
-
-
-
-
+from kiln_ai.datamodel import dataset_split, eval, strict_mode
+from kiln_ai.datamodel.datamodel_enums import (
+    FinetuneDataStrategy,
+    FineTuneStatusType,
+    Priority,
+    StructuredOutputMode,
+    TaskOutputRatingType,
+)
+from kiln_ai.datamodel.dataset_split import (
+    DatasetSplit,
+    DatasetSplitDefinition,
+)
+from kiln_ai.datamodel.finetune import (
+    Finetune,
+)
+from kiln_ai.datamodel.project import Project
+from kiln_ai.datamodel.prompt import BasePrompt, Prompt
+from kiln_ai.datamodel.prompt_id import (
+    PromptGenerators,
+    PromptId,
+    prompt_generator_values,
+)
+from kiln_ai.datamodel.task import Task, TaskRequirement
+from kiln_ai.datamodel.task_output import (
+    DataSource,
+    DataSourceProperty,
+    DataSourceType,
+    RequirementRating,
+    TaskOutput,
+    TaskOutputRating,
+)
+from kiln_ai.datamodel.task_run import (
+    TaskRun,
 )
-from .json_schema import validate_schema
-
-if TYPE_CHECKING:
-    from . import Task
-

 __all__ = [
-    "
-    "
+    "strict_mode",
+    "dataset_split",
+    "eval",
     "Task",
     "Project",
     "TaskRun",
     "TaskOutput",
-    "TaskOutputRating",
     "Priority",
     "DataSource",
     "DataSourceType",
@@ -53,927 +62,16 @@ __all__ = [
     "FineTuneStatusType",
     "TaskOutputRatingType",
     "TaskRequirement",
-    "TaskDeterminism",
     "DatasetSplitDefinition",
     "DatasetSplit",
     "RequirementRating",
     "TaskRequirement",
-    "
-    "set_strict_mode",
+    "BasePrompt",
     "Prompt",
+    "TaskOutputRating",
+    "StructuredOutputMode",
+    "FinetuneDataStrategy",
+    "PromptId",
+    "PromptGenerators",
+    "prompt_generator_values",
 ]
-
-
-# We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library.
-# Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in)
-_strict_mode: bool = False
-
-
-def strict_mode() -> bool:
-    return _strict_mode
-
-
-def set_strict_mode(value: bool) -> None:
-    global _strict_mode
-    _strict_mode = value
-
-
-class Priority(IntEnum):
-    """Defines priority levels for tasks and requirements, where P0 is highest priority."""
-
-    p0 = 0
-    p1 = 1
-    p2 = 2
-    p3 = 3
-
-
-# Only one rating type for now, but this allows for extensibility if we want to add more in the future
-class TaskOutputRatingType(str, Enum):
-    """Defines the types of rating systems available for task outputs."""
-
-    five_star = "five_star"
-    pass_fail = "pass_fail"
-    pass_fail_critical = "pass_fail_critical"
-    custom = "custom"
-
-
-class RequirementRating(BaseModel):
-    """Rating for a specific requirement within a task output."""
-
-    value: float = Field(
-        description="The rating value. Interpretation depends on rating type"
-    )
-    type: TaskOutputRatingType = Field(description="The type of rating")
-
-
-class TaskOutputRating(KilnBaseModel):
-    """
-    A rating for a task output, including an overall rating and ratings for each requirement.
-
-    Supports:
-    - five_star: 1-5 star ratings
-    - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
-    - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
-    """
-
-    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
-    value: float | None = Field(
-        description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
-        default=None,
-    )
-    requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
-        default={},
-        description="The ratings of the requirements of the task.",
-    )
-
-    # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
-    @model_validator(mode="before")
-    def upgrade_old_format(cls, data: dict) -> dict:
-        if not isinstance(data, dict):
-            return data
-
-        # Check if we have the old format (dict of floats)
-        req_ratings = data.get("requirement_ratings", {})
-        if req_ratings and all(
-            isinstance(v, (int, float)) for v in req_ratings.values()
-        ):
-            # Convert each float to a RequirementRating object
-            # all ratings are five star at the point we used this format
-            data["requirement_ratings"] = {
-                k: {"value": v, "type": TaskOutputRatingType.five_star}
-                for k, v in req_ratings.items()
-            }
-
-        return data
-
-    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
-    def is_high_quality(self) -> bool:
-        if self.value is None:
-            return False
-
-        if self.type == TaskOutputRatingType.five_star:
-            return self.value >= 4
-        elif self.type == TaskOutputRatingType.pass_fail:
-            return self.value == 1.0
-        elif self.type == TaskOutputRatingType.pass_fail_critical:
-            return self.value == 1.0
-        return False
-
-    @model_validator(mode="after")
-    def validate_rating(self) -> Self:
-        if self.type not in TaskOutputRatingType:
-            raise ValueError(f"Invalid rating type: {self.type}")
-
-        # Overall rating is optional
-        if self.value is not None:
-            self._validate_rating(self.type, self.value, "overall rating")
-
-        for req_id, req_rating in self.requirement_ratings.items():
-            self._validate_rating(
-                req_rating.type,
-                req_rating.value,
-                f"requirement rating for req ID: {req_id}",
-            )
-
-        return self
-
-    def _validate_rating(
-        self, type: TaskOutputRatingType, rating: float | None, rating_name: str
-    ) -> None:
-        if type == TaskOutputRatingType.five_star:
-            self._validate_five_star(rating, rating_name)
-        elif type == TaskOutputRatingType.pass_fail:
-            self._validate_pass_fail(rating, rating_name)
-        elif type == TaskOutputRatingType.pass_fail_critical:
-            self._validate_pass_fail_critical(rating, rating_name)
-
-    def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
-        if rating is None or not isinstance(rating, float) or not rating.is_integer():
-            raise ValueError(
-                f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
-            )
-        if rating < 1 or rating > 5:
-            raise ValueError(
-                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
-            )
-
-    def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
-        if rating is None or not isinstance(rating, float) or not rating.is_integer():
-            raise ValueError(
-                f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
-            )
-        if rating not in [0, 1]:
-            raise ValueError(
-                f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
-            )
-
-    def _validate_pass_fail_critical(
-        self, rating: float | None, rating_name: str
-    ) -> None:
-        if rating is None or not isinstance(rating, float) or not rating.is_integer():
-            raise ValueError(
-                f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
-            )
-        if rating not in [-1, 0, 1]:
-            raise ValueError(
-                f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
-            )
-
-
-class TaskOutput(KilnBaseModel):
-    """
-    An output for a specific task run.
-
-    Contains the actual output content, its source (human or synthetic),
-    and optional rating information.
-    """
-
-    output: str = Field(
-        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
-    )
-    source: DataSource | None = Field(
-        description="The source of the output: human or synthetic.",
-        default=None,
-    )
-    rating: TaskOutputRating | None = Field(
-        default=None, description="The rating of the output"
-    )
-
-    def validate_output_format(self, task: Task) -> Self:
-        # validate output
-        if task.output_json_schema is not None:
-            try:
-                validate_schema(json.loads(self.output), task.output_json_schema)
-            except json.JSONDecodeError:
-                raise ValueError("Output is not a valid JSON object")
-            except jsonschema.exceptions.ValidationError as e:
-                raise ValueError(f"Output does not match task output schema: {e}")
-        return self
-
-    @model_validator(mode="after")
-    def validate_output_source(self, info: ValidationInfo) -> Self:
-        # On strict mode and not loaded from file, we validate output_source is not None.
-        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
-        if not strict_mode():
-            return self
-        if self.loaded_from_file(info):
-            return self
-        if self.source is None:
-            raise ValueError("Output source is required when strict mode is enabled")
-        return self
-
-
-class FineTuneStatusType(str, Enum):
-    """
-    The status type of a fine-tune (running, completed, failed, etc).
-    """
-
-    unknown = "unknown" # server error
-    pending = "pending"
-    running = "running"
-    completed = "completed"
-    failed = "failed"
-
-
-class StructuredOutputMode(str, Enum):
-    """
-    Enumeration of supported structured output modes.
-
-    - default: let the adapter decide
-    - json_schema: request json using API capabilities for json_schema
-    - function_calling: request json using API capabilities for function calling
-    - json_mode: request json using API's JSON mode, which should return valid JSON, but isn't checking/passing the schema
-    - json_instructions: append instructions to the prompt to request json matching the schema. No API capabilities are used. You should have a custom parser on these models as they will be returning strings.
-    - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries).
-    """
-
-    default = "default"
-    json_schema = "json_schema"
-    function_calling = "function_calling"
-    json_mode = "json_mode"
-    json_instructions = "json_instructions"
-    json_instruction_and_object = "json_instruction_and_object"
-
-
-class FinetuneDataStrategy(str, Enum):
-    final_only = "final_only"
-    final_and_intermediate = "final_and_intermediate"
-
-
-class Finetune(KilnParentedModel):
-    """
-    The Kiln fine-tune datamodel.
-
-    Initially holds a reference to a training job, with needed identifiers to update the status. When complete, contains the new model ID.
-    """
-
-    name: str = NAME_FIELD
-    description: str | None = Field(
-        default=None,
-        description="A description of the fine-tune for you and your team. Not used in training.",
-    )
-    structured_output_mode: StructuredOutputMode | None = Field(
-        default=None,
-        description="The mode to use to train the model for structured output, if it was trained with structured output. Will determine how we call the tuned model, so we call with the matching mode.",
-    )
-    provider: str = Field(
-        description="The provider to use for the fine-tune (e.g. 'openai')."
-    )
-    base_model_id: str = Field(
-        description="The id of the base model to use for the fine-tune. This string relates to the provider's IDs for their own models, not Kiln IDs."
-    )
-    provider_id: str | None = Field(
-        default=None,
-        description="The ID of the fine-tune job on the provider's side. May not be the same as the fine_tune_model_id.",
-    )
-    fine_tune_model_id: str | None = Field(
-        default=None,
-        description="The ID of the fine-tuned model on the provider's side. May not be the same as the provider_id.",
-    )
-    dataset_split_id: str = Field(
-        description="The ID of the dataset split to use for this fine-tune.",
-    )
-    train_split_name: str = Field(
-        default="train",
-        description="The name of the training split to use for this fine-tune.",
-    )
-    validation_split_name: str | None = Field(
-        default=None,
-        description="The name of the validation split to use for this fine-tune. Optional.",
-    )
-    parameters: dict[str, str | int | float | bool] = Field(
-        default={},
-        description="The parameters to use for this fine-tune. These are provider-specific.",
-    )
-    # These two fields are saved exactly used for training. Even if they map exactly to a custom prompt or generator, those can change, so we want to keep a record of the training prompt.
-    system_message: str = Field(
-        description="The system message to use for this fine-tune.",
-    )
-    thinking_instructions: str | None = Field(
-        default=None,
-        description="The thinking instructions to use for this fine-tune. Only used when data_strategy is final_and_intermediate.",
-    )
-    latest_status: FineTuneStatusType = Field(
-        default=FineTuneStatusType.unknown,
-        description="The latest known status of this fine-tune. Not updated in real time.",
-    )
-    properties: Dict[str, str | int | float] = Field(
-        default={},
-        description="Properties of the fine-tune. Different providers may use different properties.",
-    )
-    data_strategy: FinetuneDataStrategy = Field(
-        default=FinetuneDataStrategy.final_only,
-        description="The strategy to use for training the model. 'final_only' will only train on the final response. 'final_and_intermediate' will train on the final response and intermediate outputs (chain of thought or reasoning).",
-    )
-
-    def parent_task(self) -> Task | None:
-        if not isinstance(self.parent, Task):
-            return None
-        return self.parent
-
-    @model_validator(mode="after")
-    def validate_thinking_instructions(self) -> Self:
-        if (
-            self.thinking_instructions is not None
-            and self.data_strategy != FinetuneDataStrategy.final_and_intermediate
-        ):
-            raise ValueError(
-                "Thinking instructions can only be used when data_strategy is final_and_intermediate"
-            )
-        if (
-            self.thinking_instructions is None
-            and self.data_strategy == FinetuneDataStrategy.final_and_intermediate
-        ):
-            raise ValueError(
-                "Thinking instructions are required when data_strategy is final_and_intermediate"
-            )
-        return self
-
-
-class DataSourceType(str, Enum):
-    """
-    The source type of a piece of data.
-
-    Human: a human created the data
-    Synthetic: a model created the data
-    """
-
-    human = "human"
-    synthetic = "synthetic"
-
-
-class DataSourceProperty(BaseModel):
-    """
-    Defines a property that can be associated with a data source.
-
-    Includes validation rules for when properties are required or not allowed
-    based on the data source type.
-    """
-
-    name: str
-    type: Type[Union[str, int, float]]
-    required_for: List[DataSourceType] = []
-    not_allowed_for: List[DataSourceType] = []
-
-
-class DataSource(BaseModel):
-    """
-    Represents the origin of data, either human or synthetic, with associated properties.
-
-    Properties vary based on the source type - for synthetic sources this includes
-    model information, for human sources this includes creator information.
-    """
-
-    type: DataSourceType
-    properties: Dict[str, str | int | float] = Field(
-        default={},
-        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
-    )
-
-    _data_source_properties = [
-        DataSourceProperty(
-            name="created_by",
-            type=str,
-            required_for=[DataSourceType.human],
-            not_allowed_for=[DataSourceType.synthetic],
-        ),
-        DataSourceProperty(
-            name="model_name",
-            type=str,
-            required_for=[DataSourceType.synthetic],
-            not_allowed_for=[DataSourceType.human],
-        ),
-        DataSourceProperty(
-            name="model_provider",
-            type=str,
-            required_for=[DataSourceType.synthetic],
-            not_allowed_for=[DataSourceType.human],
-        ),
-        DataSourceProperty(
-            name="adapter_name",
-            type=str,
-            required_for=[DataSourceType.synthetic],
-            not_allowed_for=[DataSourceType.human],
-        ),
-        DataSourceProperty(
-            name="prompt_builder_name",
-            type=str,
-            not_allowed_for=[DataSourceType.human],
-        ),
-        DataSourceProperty(
-            # Optional: an ID within the scope of the prompt_builder_name.
-            # Used for prompt builders with IDs (like saved prompts, fine-tune prompts)
-            name="prompt_id",
-            type=str,
-            not_allowed_for=[DataSourceType.human],
-        ),
-    ]
-
-    @model_validator(mode="after")
-    def validate_type(self) -> "DataSource":
-        if self.type not in DataSourceType:
-            raise ValueError(f"Invalid data source type: {self.type}")
-        return self
-
-    @model_validator(mode="after")
-    def validate_properties(self) -> "DataSource":
-        for prop in self._data_source_properties:
-            # Check the property type is correct
-            if prop.name in self.properties:
-                if not isinstance(self.properties[prop.name], prop.type):
-                    raise ValueError(
-                        f"'{prop.name}' must be of type {prop.type.__name__} for {self.type} data source"
-                    )
-            # Check the property is required for the data source type
-            if self.type in prop.required_for:
-                if prop.name not in self.properties:
-                    raise ValueError(
-                        f"'{prop.name}' is required for {self.type} data source"
-                    )
-            # Check the property is not allowed for the data source type
-            elif self.type in prop.not_allowed_for and prop.name in self.properties:
-                raise ValueError(
-                    f"'{prop.name}' is not allowed for {self.type} data source"
-                )
-        return self
-
-    @model_validator(mode="after")
-    def validate_no_empty_properties(self) -> Self:
-        for prop, value in self.properties.items():
-            if isinstance(value, str) and value == "":
-                raise ValueError(
-                    f"Property '{prop}' must be a non-empty string for {self.type} data source"
-                )
-        return self
-
-
-class TaskRun(KilnParentedModel):
-    """
-    Represents a single execution of a Task.
-
-    Contains the input used, its source, the output produced, and optional
-    repair information if the output needed correction.
-    """
-
-    input: str = Field(
-        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
-    )
-    input_source: DataSource | None = Field(
-        default=None, description="The source of the input: human or synthetic."
-    )
-
-    output: TaskOutput = Field(description="The output of the task run.")
-    repair_instructions: str | None = Field(
-        default=None,
-        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
-    )
-    repaired_output: TaskOutput | None = Field(
-        default=None,
-        description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
-    )
-    intermediate_outputs: Dict[str, str] | None = Field(
-        default=None,
-        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
-    )
-    tags: List[str] = Field(
-        default=[],
-        description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
-    )
-
-    def has_thinking_training_data(self) -> bool:
-        """
-        Does this run have thinking data that we can use to train a thinking model?
-        """
-        if self.intermediate_outputs is None:
-            return False
-        return (
-            "chain_of_thought" in self.intermediate_outputs
-            or "reasoning" in self.intermediate_outputs
-        )
-
-    def parent_task(self) -> Task | None:
-        if not isinstance(self.parent, Task):
-            return None
-        return self.parent
-
-    @model_validator(mode="after")
-    def validate_input_format(self, info: ValidationInfo) -> Self:
-        # Don't validate if loading from file (not new). Too slow.
-        # We don't allow changing task schema, so this is redundant validation.
-        # Note: we still validate if editing a loaded model
-        if self.loading_from_file(info):
-            # Consider loading an existing model as validated.
-            self._last_validated_input = self.input
-            return self
-
-        # Don't validate if input has not changed. Too slow to run this every time.
-        if (
-            hasattr(self, "_last_validated_input")
-            and self.input == self._last_validated_input
-        ):
-            return self
-
-        task = self.parent_task()
-        if task is None:
-            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
-            return self
-
-        # validate output
-        if task.input_json_schema is not None:
-            try:
-                validate_schema(json.loads(self.input), task.input_json_schema)
-            except json.JSONDecodeError:
-                raise ValueError("Input is not a valid JSON object")
-            except jsonschema.exceptions.ValidationError as e:
-                raise ValueError(f"Input does not match task input schema: {e}")
-        self._last_validated_input = self.input
-        return self
-
-    @model_validator(mode="after")
-    def validate_output_format(self, info: ValidationInfo) -> Self:
-        # Don't validate if loading from file (not new). Too slow.
-        # Note: we still validate if editing a loaded model's output.
-        if self.loading_from_file(info):
-            # Consider loading an existing model as validated.
-            self._last_validated_output = self.output.output if self.output else None
-            return self
-
-        # Don't validate unless output has changed since last validation.
-        # The validator is slow and costly, don't want it running when setting other fields.
-        if (
-            hasattr(self, "_last_validated_output")
-            and self.output is not None
-            and self.output.output == self._last_validated_output
-        ):
-            return self
-
-        task = self.parent_task()
-        if task is None:
-            return self
-
-        self.output.validate_output_format(task)
-        self._last_validated_output = self.output.output if self.output else None
-        return self
-
-    @model_validator(mode="after")
-    def validate_repaired_output(self) -> Self:
-        if self.repaired_output is not None:
-            if self.repaired_output.rating is not None:
-                raise ValueError(
-                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
-                )
-        if self.repair_instructions is None and self.repaired_output is not None:
-            raise ValueError(
-                "Repair instructions are required if providing a repaired output."
-            )
-        if self.repair_instructions is not None and self.repaired_output is None:
-            raise ValueError(
-                "A repaired output is required if providing repair instructions."
-            )
-        return self
-
-    @model_validator(mode="after")
-    def validate_input_source(self, info: ValidationInfo) -> Self:
-        # On strict mode and not loaded from file, we validate input_source is not None.
-        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
-        if not strict_mode():
-            return self
-        if self.loaded_from_file(info):
-            return self
-        if self.input_source is None:
-            raise ValueError("input_source is required when strict mode is enabled")
-        return self
-
-    @model_validator(mode="after")
-    def validate_tags(self) -> Self:
-        for tag in self.tags:
-            if not tag:
-                raise ValueError("Tags cannot be empty strings")
-            if " " in tag:
-                raise ValueError("Tags cannot contain spaces. Try underscores.")
-
-        return self
-
-
-# Define the type alias for clarity
-DatasetFilter = Callable[[TaskRun], bool]
-
-
-def AllDatasetFilter(_: TaskRun) -> bool:
-    return True
-
-
-def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
-    if task_run.output is None:
-        return False
-    if task_run.repaired_output is not None:
-        # Repairs always considered high quality
-        return True
-    if task_run.output.rating is None:
-        return False
-    return task_run.output.rating.is_high_quality()
-
-
-def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool:
-    """
-    A filter that returns True if the task has intermediate outputs we can training a 'thinking' model on (reasoning or chain of thought)
-    """
-    return task_run.has_thinking_training_data()
-
-
-def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool:
-    """
-    A filter that returns True if the task has thinking data and the output is high quality
-    """
-    return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run)
-
-
-class DatasetFilterType(str, Enum):
-    """Dataset filter names."""
-
-    ALL = "all"
-    HIGH_RATING = "high_rating"
-    THINKING_MODEL = "thinking_model"
-    THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated"
-
-
-dataset_filters = {
-    DatasetFilterType.ALL: AllDatasetFilter,
-    DatasetFilterType.HIGH_RATING: HighRatingDatasetFilter,
-    DatasetFilterType.THINKING_MODEL: ThinkingModelDatasetFilter,
-    DatasetFilterType.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter,
-}
-
-
-class DatasetSplitDefinition(BaseModel):
-    """
-    A definition of a split in a dataset.
-
-    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
-    """
-
-    name: str = NAME_FIELD
-    description: str | None = Field(
-        default=None,
-        description="A description of the dataset for you and your team. Not used in training.",
-    )
-    percentage: float = Field(
-        ge=0.0,
-        le=1.0,
-        description="The percentage of the dataset that this split represents (between 0 and 1).",
-    )
-
-
-AllSplitDefinition: list[DatasetSplitDefinition] = [
-    DatasetSplitDefinition(name="all", percentage=1.0)
-]
-Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
-    DatasetSplitDefinition(name="train", percentage=0.8),
-    DatasetSplitDefinition(name="test", percentage=0.2),
-]
-Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
-    DatasetSplitDefinition(name="train", percentage=0.6),
-    DatasetSplitDefinition(name="test", percentage=0.2),
-    DatasetSplitDefinition(name="val", percentage=0.2),
-]
-Train80Test10Val10SplitDefinition: list[DatasetSplitDefinition] = [
-    DatasetSplitDefinition(name="train", percentage=0.8),
-    DatasetSplitDefinition(name="test", percentage=0.1),
-    DatasetSplitDefinition(name="val", percentage=0.1),
-]
-
-
-class DatasetSplit(KilnParentedModel):
-    """
-    A collection of task runs, with optional splits (train, test, validation).
-
-    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.
-
-    Maintains a list of IDs for each split, to avoid data duplication.
-    """
-
-    name: str = NAME_FIELD
-    description: str | None = Field(
-        default=None,
-        description="A description of the dataset for you and your team. Not used in training.",
-    )
-    splits: list[DatasetSplitDefinition] = Field(
-        default_factory=list,
-        description="The splits in the dataset.",
-    )
-    split_contents: dict[str, list[str]] = Field(
-        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
-    )
-    filter: DatasetFilterType | None = Field(
-        default=None,
-        description="The filter used to build the dataset.",
-    )
-
-    @model_validator(mode="after")
-    def validate_split_percentages(self) -> "DatasetSplit":
-        total = sum(split.percentage for split in self.splits)
-        if not math.isclose(total, 1.0, rel_tol=1e-9):
-            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
-        return self
-
-    @classmethod
-    def from_task(
-        cls,
-        name: str,
-        task: "Task",
-        splits: list[DatasetSplitDefinition],
-        filter_type: DatasetFilterType = DatasetFilterType.ALL,
-        description: str | None = None,
-    ):
-        """
-        Build a dataset split from a task.
-        """
-        filter = dataset_filters[filter_type]
-        split_contents = cls.build_split_contents(task, splits, filter)
-        return cls(
-            parent=task,
-            name=name,
-            description=description,
-            splits=splits,
-            split_contents=split_contents,
-            filter=filter_type,
-        )
-
-    @classmethod
-    def build_split_contents(
-        cls,
-        task: "Task",
-        splits: list[DatasetSplitDefinition],
-        filter: DatasetFilter,
-    ) -> dict[str, list[str]]:
-        valid_ids = []
-        for task_run in task.runs():
-            if filter(task_run):
-                valid_ids.append(task_run.id)
-
-        # Shuffle and split by split percentage
-        random.shuffle(valid_ids)
-        split_contents = {}
-        start_idx = 0
-        remaining_items = len(valid_ids)
-
-        # Handle all splits except the last one
-        for split in splits[:-1]:
-            split_size = round(len(valid_ids) * split.percentage)
-            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
-            start_idx += split_size
-            remaining_items -= split_size
-
-        # Last split gets all remaining items (for rounding)
-        if splits:
-            split_contents[splits[-1].name] = valid_ids[start_idx:]
-
-        return split_contents
-
-    def parent_task(self) -> "Task | None":
-        # inline import to avoid circular import
-        from kiln_ai.datamodel import Task
-
-        if not isinstance(self.parent, Task):
-            return None
-        return self.parent
-
-    def missing_count(self) -> int:
-        """
-        Returns:
-            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
-        """
-        parent = self.parent_task()
-        if parent is None:
-            raise ValueError("DatasetSplit has no parent task")
-
-        runs = parent.runs(readonly=True)
-        all_ids = set(run.id for run in runs)
-        all_ids_in_splits = set()
-        for ids in self.split_contents.values():
-            all_ids_in_splits.update(ids)
-        missing = all_ids_in_splits - all_ids
-        return len(missing)
-
-
-class Prompt(KilnParentedModel):
-    """
-    A prompt for a task.
-    """
-
-    name: str = NAME_FIELD
-    prompt: str = Field(
-        description="The prompt for the task.",
-        min_length=1,
-    )
-    chain_of_thought_instructions: str | None = Field(
-        default=None,
-        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided.",
-    )
-
-
-class TaskRequirement(BaseModel):
-    """
-    Defines a specific requirement that should be met by task outputs.
-
-    Includes an identifier, name, description, instruction for meeting the requirement,
-    priority level, and rating type (five_star, pass_fail, pass_fail_critical, custom).
-    """
-
-    id: ID_TYPE = ID_FIELD
-    name: str = SHORT_NAME_FIELD
-    description: str | None = Field(default=None)
-    instruction: str = Field(min_length=1)
-    priority: Priority = Field(default=Priority.p2)
-    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
-
-
-class TaskDeterminism(str, Enum):
-    """
-    Defines how strictly task outputs should match expected results.
-
-    - deterministic: Requires exact matches
-    - semantic_match: Allows different wording with same meaning
-    - flexible: Allows variation in both wording and meaning within requirements
-    """
-
-    deterministic = "deterministic" # Expect exact match
-    semantic_match = "semantic_match" # Expect same meaning, but flexible on expression of the meaning
-    flexible = "flexible" # Flexible on semantic output. Eval should be custom based on parsing requirements.
-
-
-class Task(
-    KilnParentedModel,
-    KilnParentModel,
-    parent_of={
-        "runs": TaskRun,
-        "dataset_splits": DatasetSplit,
-        "finetunes": Finetune,
-        "prompts": Prompt,
-    },
-):
-    """
-    Represents a specific task to be performed, with associated requirements and validation rules.
-
-    Contains the task definition, requirements, input/output schemas, and maintains
-    a collection of task runs.
-    """
-
-    name: str = NAME_FIELD
-    description: str | None = Field(
-        default=None,
-        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
-    )
-    instruction: str = Field(
-        min_length=1,
-        description="The instructions for the task. Will be used in prompts/training/validation.",
-    )
-    requirements: List[TaskRequirement] = Field(default=[])
-    output_json_schema: JsonObjectSchema | None = None
-    input_json_schema: JsonObjectSchema | None = None
-    thinking_instruction: str | None = Field(
-        default=None,
-        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
-    )
-
-    def output_schema(self) -> Dict | None:
-        if self.output_json_schema is None:
-            return None
-        return schema_from_json_str(self.output_json_schema)
-
-    def input_schema(self) -> Dict | None:
-        if self.input_json_schema is None:
-            return None
-        return schema_from_json_str(self.input_json_schema)
-
-    # These wrappers help for typechecking. TODO P2: fix this in KilnParentModel
-    def runs(self, readonly: bool = False) -> list[TaskRun]:
-        return super().runs(readonly=readonly) # type: ignore
-
-    def dataset_splits(self, readonly: bool = False) -> list[DatasetSplit]:
-        return super().dataset_splits(readonly=readonly) # type: ignore
-
-    def finetunes(self, readonly: bool = False) -> list[Finetune]:
-        return super().finetunes(readonly=readonly) # type: ignore
-
-    def prompts(self, readonly: bool = False) -> list[Prompt]:
-        return super().prompts(readonly=readonly) # type: ignore
-
-
-class Project(KilnParentModel, parent_of={"tasks": Task}):
-    """
-    A collection of related tasks.
-
-    Projects organize tasks into logical groups and provide high-level descriptions
-    of the overall goals.
-    """
-
-    name: str = NAME_FIELD
-    description: str | None = Field(
-        default=None,
-        description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
-    )
-
-    # Needed for typechecking. TODO P2: fix this in KilnParentModel
-    def tasks(self) -> list[Task]:
-        return super().tasks() # type: ignore