kiln-ai 0.11.1__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +4 -0
- kiln_ai/adapters/adapter_registry.py +163 -39
- kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
- kiln_ai/adapters/eval/__init__.py +28 -0
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +270 -0
- kiln_ai/adapters/eval/g_eval.py +368 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +325 -0
- kiln_ai/adapters/eval/test_eval_runner.py +641 -0
- kiln_ai/adapters/eval/test_g_eval.py +498 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +16 -2
- kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_together_finetune.py +531 -0
- kiln_ai/adapters/fine_tune/together_finetune.py +325 -0
- kiln_ai/adapters/ml_model_list.py +758 -163
- kiln_ai/adapters/model_adapters/__init__.py +2 -4
- kiln_ai/adapters/model_adapters/base_adapter.py +61 -43
- kiln_ai/adapters/model_adapters/litellm_adapter.py +391 -0
- kiln_ai/adapters/model_adapters/litellm_config.py +13 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -0
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
- kiln_ai/adapters/model_adapters/test_structured_output.py +59 -35
- kiln_ai/adapters/ollama_tools.py +3 -3
- kiln_ai/adapters/parsers/r1_parser.py +19 -14
- kiln_ai/adapters/parsers/test_r1_parser.py +17 -5
- kiln_ai/adapters/prompt_builders.py +80 -42
- kiln_ai/adapters/provider_tools.py +50 -58
- kiln_ai/adapters/repair/repair_task.py +9 -21
- kiln_ai/adapters/repair/test_repair_task.py +6 -6
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +26 -29
- kiln_ai/adapters/test_generate_docs.py +4 -4
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +47 -33
- kiln_ai/adapters/test_prompt_builders.py +91 -31
- kiln_ai/adapters/test_provider_tools.py +26 -81
- kiln_ai/datamodel/__init__.py +50 -952
- kiln_ai/datamodel/basemodel.py +2 -0
- kiln_ai/datamodel/datamodel_enums.py +60 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +7 -1
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +328 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +19 -11
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +32 -8
- kiln_ai/datamodel/test_datasource.py +22 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +9 -13
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_models.py +2 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +43 -1
- kiln_ai/utils/dataset_import.py +232 -0
- kiln_ai/utils/test_dataset_import.py +596 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/METADATA +86 -6
- kiln_ai-0.13.0.dist-info/RECORD +103 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +0 -302
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -11
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +0 -246
- kiln_ai/adapters/model_adapters/test_langchain_adapter.py +0 -350
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +0 -225
- kiln_ai-0.11.1.dist-info/RECORD +0 -76
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import TYPE_CHECKING, Dict, List, Type, Union
|
|
4
|
+
|
|
5
|
+
import jsonschema
|
|
6
|
+
import jsonschema.exceptions
|
|
7
|
+
from pydantic import BaseModel, Field, ValidationInfo, model_validator
|
|
8
|
+
from typing_extensions import Self
|
|
9
|
+
|
|
10
|
+
from kiln_ai.datamodel.basemodel import ID_TYPE, KilnBaseModel
|
|
11
|
+
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
|
|
12
|
+
from kiln_ai.datamodel.json_schema import validate_schema
|
|
13
|
+
from kiln_ai.datamodel.strict_mode import strict_mode
|
|
14
|
+
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from kiln_ai.datamodel.task import Task
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RequirementRating(BaseModel):
    """
    The score given to one requirement of a task output.

    Pairs the numeric score with the rating scale it was given on; the scale
    decides how the number should be read.
    """

    # Numeric score; its meaning depends on `type`.
    value: float = Field(
        description="The rating value. Interpretation depends on rating type",
    )
    # Rating scale the score belongs to.
    type: TaskOutputRatingType = Field(
        description="The type of rating",
    )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def normalize_rating(rating: float, rating_type: TaskOutputRatingType) -> float:
    """
    Linearly rescale a rating onto the 0-1 range (plain scaling, not a z-score).

    Args:
        rating: The raw rating value.
        rating_type: The scale the rating was given on.

    Returns:
        The rating mapped onto 0-1.

    Raises:
        ValueError: If the rating is outside the range of its scale, or the
            scale is `custom` (which has no defined range to normalize against).
    """
    if rating_type == TaskOutputRatingType.five_star:
        # 1..5 stars -> 0..1
        if rating < 1 or rating > 5:
            raise ValueError("Five star rating must be between 1 and 5")
        return (rating - 1) / 4

    if rating_type == TaskOutputRatingType.pass_fail:
        # Already on a 0..1 scale
        if rating < 0 or rating > 1:
            raise ValueError("Pass fail rating must 0 to 1")
        return rating

    if rating_type == TaskOutputRatingType.pass_fail_critical:
        # -1..1 -> 0..1
        if rating < -1 or rating > 1:
            raise ValueError("Pass fail critical rating must -1 to 1")
        return (rating + 1) / 2

    if rating_type == TaskOutputRatingType.custom:
        raise ValueError("Custom rating type can not be normalized")

    # Any other value means the enum grew without this function being updated.
    raise_exhaustive_enum_error(rating_type)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class TaskOutputRating(KilnBaseModel):
    """
    A rating for a task output, including an overall rating and ratings for each requirement.

    Supports:
    - five_star: 1-5 star ratings
    - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
    - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
    """

    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
    value: float | None = Field(
        description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
        default=None,
    )
    requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
        default={},
        description="The ratings of the requirements of the task.",
    )

    # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
    @model_validator(mode="before")
    @classmethod
    def upgrade_old_format(cls, data: dict) -> dict:
        """
        Upgrade the legacy `requirement_ratings` layout before field validation.

        The legacy layout maps requirement IDs directly to numbers. When every
        value is a number, each one is wrapped as a five_star rating. The input
        mapping is never modified; an upgraded copy is returned instead.

        Args:
            data: The raw input being validated (returned untouched if not a dict).

        Returns:
            Either `data` itself (nothing to upgrade) or a shallow copy with
            `requirement_ratings` converted to the current layout.
        """
        if not isinstance(data, dict):
            return data

        req_ratings = data.get("requirement_ratings")
        # Leave anything that isn't a non-empty dict for field validation to
        # report, rather than failing here with an AttributeError.
        if not isinstance(req_ratings, dict) or not req_ratings:
            return data

        # Check if we have the old format (dict of floats)
        if not all(isinstance(v, (int, float)) for v in req_ratings.values()):
            return data

        # Convert each float to a RequirementRating-shaped dict.
        # All ratings are five star at the point we used this format.
        upgraded = {
            k: {"value": v, "type": TaskOutputRatingType.five_star}
            for k, v in req_ratings.items()
        }
        # Build a new dict so the caller's object is not mutated.
        return {**data, "requirement_ratings": upgraded}

    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
    def is_high_quality(self) -> bool:
        """
        Report whether the overall rating counts as high quality.

        Returns:
            True for five_star ratings of 4 or more, or a pass (1.0) on either
            pass/fail scale. False when there is no overall rating or the type
            is anything else.
        """
        if self.value is None:
            return False

        if self.type == TaskOutputRatingType.five_star:
            return self.value >= 4
        elif self.type == TaskOutputRatingType.pass_fail:
            return self.value == 1.0
        elif self.type == TaskOutputRatingType.pass_fail_critical:
            return self.value == 1.0
        return False

    @model_validator(mode="after")
    def validate_rating(self) -> Self:
        """
        Validate the overall rating (if set) and every requirement rating.

        Raises:
            ValueError: If the type is not a known rating type, or any rating
                value is not valid for its scale.
        """
        if self.type not in TaskOutputRatingType:
            raise ValueError(f"Invalid rating type: {self.type}")

        # Overall rating is optional
        if self.value is not None:
            self._validate_rating(self.type, self.value, "overall rating")

        for req_id, req_rating in self.requirement_ratings.items():
            self._validate_rating(
                req_rating.type,
                req_rating.value,
                f"requirement rating for req ID: {req_id}",
            )

        return self

    def _validate_rating(
        self, type: TaskOutputRatingType, rating: float | None, rating_name: str
    ) -> None:
        """Dispatch to the validator for `type`; other types are not range-checked here."""
        if type == TaskOutputRatingType.five_star:
            self._validate_five_star(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail:
            self._validate_pass_fail(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail_critical:
            self._validate_pass_fail_critical(rating, rating_name)

    def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
        """Require a whole-number float between 1 and 5, else raise ValueError."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
            )
        if rating < 1 or rating > 5:
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
            )

    def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
        """Require a whole-number float equal to 0 or 1, else raise ValueError."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
            )
        if rating not in (0, 1):
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
            )

    def _validate_pass_fail_critical(
        self, rating: float | None, rating_name: str
    ) -> None:
        """Require a whole-number float equal to -1, 0 or 1, else raise ValueError."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
            )
        if rating not in (-1, 0, 1):
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
            )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class DataSourceType(str, Enum):
    """
    Where a piece of data came from.

    human: a human created the data
    synthetic: a model created the data
    file_import: the data came from an imported file
    """

    human = "human"
    synthetic = "synthetic"
    file_import = "file_import"
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class DataSourceProperty(BaseModel):
    """
    Describes one named property that a data source may carry.

    Besides the property's name and expected Python type, it records which
    data source types must supply it and which must not.
    """

    # Key of the property.
    name: str
    # Python type the property's value must be an instance of.
    type: Type[Union[str, int, float]]
    # Data source types for which the property is mandatory.
    required_for: List[DataSourceType] = []
    # Data source types for which the property is forbidden.
    not_allowed_for: List[DataSourceType] = []
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class DataSource(BaseModel):
    """
    The origin of a piece of data, plus properties describing that origin.

    Which properties are required or forbidden depends on the source type:
    the rules live in `_data_source_properties` and are enforced by the
    validators below.
    """

    type: DataSourceType
    properties: Dict[str, str | int | float] = Field(
        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
        default={},
    )

    # Per-property rules: expected type, and which source types require / forbid it.
    _data_source_properties = [
        DataSourceProperty(
            name="created_by",
            type=str,
            required_for=[DataSourceType.human],
            not_allowed_for=[DataSourceType.synthetic, DataSourceType.file_import],
        ),
        DataSourceProperty(
            name="model_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
        ),
        DataSourceProperty(
            name="model_provider",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
        ),
        DataSourceProperty(
            name="adapter_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
        ),
        DataSourceProperty(
            # Legacy field -- allow loading from old runs, but we shouldn't be setting it.
            name="prompt_builder_name",
            type=str,
            not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
        ),
        DataSourceProperty(
            # The PromptId of the prompt. Can be a saved prompt, fine-tune, generator name, etc. See PromptId type for more details.
            name="prompt_id",
            type=str,
            not_allowed_for=[DataSourceType.human, DataSourceType.file_import],
        ),
        DataSourceProperty(
            name="file_name",
            type=str,
            required_for=[DataSourceType.file_import],
            not_allowed_for=[DataSourceType.human, DataSourceType.synthetic],
        ),
    ]

    @model_validator(mode="after")
    def validate_type(self) -> "DataSource":
        """Reject a `type` that is not a member of DataSourceType."""
        if self.type not in DataSourceType:
            raise ValueError(f"Invalid data source type: {self.type}")
        return self

    @model_validator(mode="after")
    def validate_properties(self) -> "DataSource":
        """
        Check `properties` against every rule in `_data_source_properties`.

        For each rule, in order: a supplied value must have the expected type;
        a required property must be present; a forbidden property must be absent.

        Raises:
            ValueError: On the first rule that is violated.
        """
        for spec in self._data_source_properties:
            supplied = spec.name in self.properties

            # A supplied value must have the declared type.
            if supplied and not isinstance(self.properties[spec.name], spec.type):
                raise ValueError(
                    f"'{spec.name}' must be of type {spec.type.__name__} for {self.type} data source"
                )

            if self.type in spec.required_for:
                # Required for this source type: must be present.
                if not supplied:
                    raise ValueError(
                        f"'{spec.name}' is required for {self.type} data source"
                    )
            elif supplied and self.type in spec.not_allowed_for:
                # Forbidden for this source type: must be absent.
                raise ValueError(
                    f"'{spec.name}' is not allowed for {self.type} data source"
                )
        return self

    @model_validator(mode="after")
    def validate_no_empty_properties(self) -> Self:
        """Reject any property whose value is an empty string."""
        for key, val in self.properties.items():
            if isinstance(val, str) and not val:
                raise ValueError(
                    f"Property '{key}' must be a non-empty string for {self.type} data source"
                )
        return self
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
class TaskOutput(KilnBaseModel):
    """
    The output produced for one task run.

    Holds the output text, where it came from (its data source), and an
    optional rating.
    """

    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    source: DataSource | None = Field(
        default=None,
        description="The source of the output: human or synthetic.",
    )
    rating: TaskOutputRating | None = Field(
        description="The rating of the output",
        default=None,
    )

    def validate_output_format(self, task: "Task") -> Self:
        """
        Check the output against the task's output JSON schema, if it has one.

        Args:
            task: The task whose `output_json_schema` the output must satisfy.

        Returns:
            This instance.

        Raises:
            ValueError: If the output is not parseable JSON, or does not match
                the schema.
        """
        schema = task.output_json_schema
        if schema is None:
            # No schema on the task: nothing to check.
            return self

        try:
            parsed = json.loads(self.output)
            validate_schema(parsed, schema)
        except json.JSONDecodeError:
            raise ValueError("Output is not a valid JSON object")
        except jsonschema.exceptions.ValidationError as err:
            raise ValueError(f"Output does not match task output schema: {err}")
        return self

    @model_validator(mode="after")
    def validate_output_source(self, info: ValidationInfo) -> Self:
        """
        Require a source when strict mode is on and the model was not loaded from file.

        We want to be able to load any data, even if it's not perfect, but
        create perfect data when adding new data.
        """
        if not strict_mode() or self.loaded_from_file(info):
            return self

        if self.source is None:
            raise ValueError("Output source is required when strict mode is enabled")
        return self
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, List, Union
|
|
3
|
+
|
|
4
|
+
import jsonschema
|
|
5
|
+
import jsonschema.exceptions
|
|
6
|
+
from pydantic import Field, ValidationInfo, model_validator
|
|
7
|
+
from typing_extensions import Self
|
|
8
|
+
|
|
9
|
+
from kiln_ai.datamodel.basemodel import KilnParentedModel
|
|
10
|
+
from kiln_ai.datamodel.json_schema import validate_schema
|
|
11
|
+
from kiln_ai.datamodel.strict_mode import strict_mode
|
|
12
|
+
from kiln_ai.datamodel.task_output import DataSource, TaskOutput
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from kiln_ai.datamodel.task import Task
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TaskRun(KilnParentedModel):
    """
    One execution of a Task.

    Records the input and where it came from, the output that was produced,
    and, when the output had to be corrected, the repair instructions and
    the repaired output.
    """

    input: str = Field(
        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    input_source: DataSource | None = Field(
        description="The source of the input: human or synthetic.",
        default=None,
    )

    output: TaskOutput = Field(description="The output of the task run.")
    repair_instructions: str | None = Field(
        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
        default=None,
    )
    repaired_output: TaskOutput | None = Field(
        description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
        default=None,
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
        default=None,
    )
    tags: List[str] = Field(
        description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
        default=[],
    )

    def has_thinking_training_data(self) -> bool:
        """
        Report whether this run carries thinking data usable for training a thinking model.

        True when the intermediate outputs contain a "chain_of_thought" or
        "reasoning" entry.
        """
        steps = self.intermediate_outputs
        if steps is None:
            return False
        return any(key in steps for key in ("chain_of_thought", "reasoning"))

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        """Return the parent if its class is named "Task", otherwise None."""
        if self.parent is not None and self.parent.__class__.__name__ == "Task":
            return self.parent  # type: ignore
        return None

    @model_validator(mode="after")
    def validate_input_format(self, info: ValidationInfo) -> Self:
        """
        Check the input against the parent task's input JSON schema.

        Skipped while loading from file (too slow, and the task schema can't
        change), when the input is unchanged since the last successful check,
        and when no parent task is attached yet. Editing a loaded model still
        triggers validation.

        Raises:
            ValueError: If the input is not parseable JSON or does not match
                the schema.
        """
        if self.loading_from_file(info):
            # Consider loading an existing model as validated.
            self._last_validated_input = self.input
            return self

        # Too slow to re-run when the input has not changed.
        already_checked = (
            hasattr(self, "_last_validated_input")
            and self.input == self._last_validated_input
        )
        if already_checked:
            return self

        task = self.parent_task()
        if task is None:
            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
            return self

        # validate input
        if task.input_json_schema is not None:
            try:
                validate_schema(json.loads(self.input), task.input_json_schema)
            except json.JSONDecodeError:
                raise ValueError("Input is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as err:
                raise ValueError(f"Input does not match task input schema: {err}")

        self._last_validated_input = self.input
        return self

    @model_validator(mode="after")
    def validate_output_format(self, info: ValidationInfo) -> Self:
        """
        Check the output against the parent task's output schema.

        Skipped while loading from file, when the output text is unchanged
        since the last successful check, and when no parent task is attached.
        The check is slow and costly, so it should not run when other fields
        are set. Editing a loaded model's output still triggers validation.
        """
        if self.loading_from_file(info):
            # Consider loading an existing model as validated.
            self._last_validated_output = self.output.output if self.output else None
            return self

        # Don't validate unless output has changed since last validation.
        already_checked = (
            hasattr(self, "_last_validated_output")
            and self.output is not None
            and self.output.output == self._last_validated_output
        )
        if already_checked:
            return self

        task = self.parent_task()
        if task is None:
            return self

        self.output.validate_output_format(task)
        self._last_validated_output = self.output.output if self.output else None
        return self

    @model_validator(mode="after")
    def validate_repaired_output(self) -> Self:
        """
        Enforce that repair instructions and a repaired output come as a pair.

        Raises:
            ValueError: If the repaired output has a rating, or if only one of
                repair instructions / repaired output is provided.
        """
        instructions_given = self.repair_instructions is not None
        repaired = self.repaired_output

        if repaired is not None:
            if repaired.rating is not None:
                raise ValueError(
                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
                )
            if not instructions_given:
                raise ValueError(
                    "Repair instructions are required if providing a repaired output."
                )
        elif instructions_given:
            raise ValueError(
                "A repaired output is required if providing repair instructions."
            )
        return self

    @model_validator(mode="after")
    def validate_input_source(self, info: ValidationInfo) -> Self:
        """
        Require an input source when strict mode is on and the model was not loaded from file.

        We want to be able to load any data, even if it's not perfect, but
        create perfect data when adding new data.
        """
        if not strict_mode() or self.loaded_from_file(info):
            return self

        if self.input_source is None:
            raise ValueError("input_source is required when strict mode is enabled")
        return self

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject tags that are empty or contain a space."""
        for label in self.tags:
            if not label:
                raise ValueError("Tags cannot be empty strings")
            if " " in label:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self
|
|
@@ -6,7 +6,7 @@ from unittest.mock import MagicMock, patch
|
|
|
6
6
|
|
|
7
7
|
import pytest
|
|
8
8
|
|
|
9
|
-
from kiln_ai.adapters.model_adapters.base_adapter import
|
|
9
|
+
from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter
|
|
10
10
|
from kiln_ai.adapters.run_output import RunOutput
|
|
11
11
|
from kiln_ai.datamodel import Task, TaskRun
|
|
12
12
|
from kiln_ai.datamodel.basemodel import (
|
|
@@ -15,6 +15,7 @@ from kiln_ai.datamodel.basemodel import (
|
|
|
15
15
|
string_to_valid_name,
|
|
16
16
|
)
|
|
17
17
|
from kiln_ai.datamodel.model_cache import ModelCache
|
|
18
|
+
from kiln_ai.datamodel.task import RunConfig
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
@pytest.fixture
|
|
@@ -484,13 +485,8 @@ class MockAdapter(BaseAdapter):
|
|
|
484
485
|
async def _run(self, input):
|
|
485
486
|
return RunOutput(output="test output", intermediate_outputs=None)
|
|
486
487
|
|
|
487
|
-
def
|
|
488
|
-
return
|
|
489
|
-
adapter_name="test",
|
|
490
|
-
model_name=self.model_name,
|
|
491
|
-
model_provider=self.model_provider_name,
|
|
492
|
-
prompt_builder_name="test",
|
|
493
|
-
)
|
|
488
|
+
def adapter_name(self) -> str:
|
|
489
|
+
return "test"
|
|
494
490
|
|
|
495
491
|
|
|
496
492
|
@pytest.fixture
|
|
@@ -501,9 +497,12 @@ def base_task():
|
|
|
501
497
|
@pytest.fixture
|
|
502
498
|
def adapter(base_task):
|
|
503
499
|
return MockAdapter(
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
500
|
+
run_config=RunConfig(
|
|
501
|
+
task=base_task,
|
|
502
|
+
model_name="test_model",
|
|
503
|
+
model_provider_name="test_provider",
|
|
504
|
+
prompt_id="simple_prompt_builder",
|
|
505
|
+
),
|
|
507
506
|
)
|
|
508
507
|
|
|
509
508
|
|
|
@@ -511,6 +510,7 @@ async def test_invoke_parsing_flow(adapter):
|
|
|
511
510
|
# Mock dependencies
|
|
512
511
|
mock_provider = MagicMock()
|
|
513
512
|
mock_provider.parser = "test_parser"
|
|
513
|
+
mock_provider.reasoning_capable = False
|
|
514
514
|
|
|
515
515
|
mock_parser = MagicMock()
|
|
516
516
|
mock_parser.parse_output.return_value = RunOutput(
|
|
@@ -548,3 +548,11 @@ async def test_invoke_parsing_flow(adapter):
|
|
|
548
548
|
assert result.output.output == "parsed test output"
|
|
549
549
|
assert result.intermediate_outputs == {"key": "value"}
|
|
550
550
|
assert result.input == "test input"
|
|
551
|
+
|
|
552
|
+
# Test with reasoning required, that we error if no reasoning is returned
|
|
553
|
+
mock_provider.reasoning_capable = True
|
|
554
|
+
with pytest.raises(
|
|
555
|
+
RuntimeError,
|
|
556
|
+
match="Reasoning is required for this model, but no reasoning was returned.",
|
|
557
|
+
):
|
|
558
|
+
await adapter.invoke("test input")
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from pydantic import BaseModel
|
|
3
|
+
|
|
4
|
+
from kiln_ai.datamodel.dataset_filters import (
|
|
5
|
+
AllDatasetFilter,
|
|
6
|
+
DatasetFilterId,
|
|
7
|
+
HighRatingDatasetFilter,
|
|
8
|
+
StaticDatasetFilters,
|
|
9
|
+
TagFilter,
|
|
10
|
+
ThinkingModelDatasetFilter,
|
|
11
|
+
ThinkingModelHighRatedFilter,
|
|
12
|
+
dataset_filter_from_id,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Note: Many more filter tests in test_dataset_split.py
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_all_dataset_filter_from_id():
    """The "all" ID resolves to AllDatasetFilter."""
    resolved = dataset_filter_from_id("all")
    assert resolved == AllDatasetFilter
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_high_rating_dataset_filter_from_id():
    """The "high_rating" ID resolves to HighRatingDatasetFilter."""
    resolved = dataset_filter_from_id("high_rating")
    assert resolved == HighRatingDatasetFilter
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_thinking_model_dataset_filter_from_id():
    """The "thinking_model" ID resolves to ThinkingModelDatasetFilter."""
    resolved = dataset_filter_from_id("thinking_model")
    assert resolved == ThinkingModelDatasetFilter
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_thinking_model_high_rated_dataset_filter_from_id():
    """The "thinking_model_high_rated" ID resolves to ThinkingModelHighRatedFilter."""
    resolved = dataset_filter_from_id("thinking_model_high_rated")
    assert resolved == ThinkingModelHighRatedFilter
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_all_static_dataset_filters():
    """Every member of StaticDatasetFilters resolves to some filter."""
    for static_id in StaticDatasetFilters:
        resolved = dataset_filter_from_id(static_id)
        assert resolved is not None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ModelTester(BaseModel):
    """Minimal model used to run DatasetFilterId's validation on a single field."""

    # Field under test.
    dsid: DatasetFilterId
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.mark.parametrize(
    "tag,expected_error,expected_tag",
    [
        ("tag::test", False, "test"),
        ("tag::other", False, "other"),
        ("tag::", True, None),
        ("tag", True, None),
        ("", True, None),
    ],
)
def test_tag_filter(tag, expected_error, expected_tag):
    """
    Tag filter IDs are checked both as a model field and via the factory.

    Valid IDs must be accepted by the model and produce a TagFilter carrying
    the expected tag; invalid IDs must be rejected by both.
    """
    # Check our model validators
    if not expected_error:
        ModelTester(dsid=tag)
    else:
        with pytest.raises(ValueError):
            ModelTester(dsid=tag)

    # Check the constructor
    if expected_tag is not None:
        tag_filter = dataset_filter_from_id(tag)
        assert isinstance(tag_filter, TagFilter)
        assert tag_filter.tag == expected_tag
    else:
        with pytest.raises(ValueError, match="Invalid dataset filter ID:"):
            dataset_filter_from_id(tag)
|