kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic. Click here for more details.

Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -0,0 +1,181 @@
1
+ from typing import TYPE_CHECKING, Dict, List, Union
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from kiln_ai.datamodel import Finetune
6
+ from kiln_ai.datamodel.basemodel import (
7
+ ID_FIELD,
8
+ ID_TYPE,
9
+ NAME_FIELD,
10
+ SHORT_NAME_FIELD,
11
+ KilnParentedModel,
12
+ KilnParentModel,
13
+ )
14
+ from kiln_ai.datamodel.datamodel_enums import Priority, TaskOutputRatingType
15
+ from kiln_ai.datamodel.dataset_split import DatasetSplit
16
+ from kiln_ai.datamodel.eval import Eval
17
+ from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
18
+ from kiln_ai.datamodel.prompt import BasePrompt, Prompt
19
+ from kiln_ai.datamodel.prompt_id import PromptId
20
+ from kiln_ai.datamodel.task_run import TaskRun
21
+
22
+ if TYPE_CHECKING:
23
+ from kiln_ai.datamodel.project import Project
24
+
25
+
26
class TaskRequirement(BaseModel):
    """
    A single criterion that the outputs of a task are expected to satisfy.

    Each requirement carries an ID, a short name, an optional description,
    the instruction explaining how to satisfy it, a priority (p2 unless
    stated otherwise) and the rating type used to score it (five_star unless
    stated otherwise; pass_fail, pass_fail_critical and custom also exist).
    """

    # Identity
    id: ID_TYPE = ID_FIELD
    name: str = SHORT_NAME_FIELD
    description: str | None = None

    # What must be done to meet the requirement; must not be empty.
    instruction: str = Field(min_length=1)

    # How important the requirement is, and how it gets rated.
    priority: Priority = Priority.p2
    type: TaskOutputRatingType = TaskOutputRatingType.five_star
40
+
41
+
42
class RunConfigProperties(BaseModel):
    """
    The model, provider and prompt selection used to run a task.

    Holds everything required for a run apart from the input and the task ID.
    Two runs with the same properties and the same input should issue the same
    calls to the model (the output itself may still differ, since models are
    non-deterministic).
    """

    model_name: str = Field(
        description="The model to use for this run config.",
    )
    model_provider_name: str = Field(
        description="The provider to use for this run config.",
    )
    # NOTE: no default is declared on this field, so it is required here.
    prompt_id: PromptId = Field(
        description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
    )
56
+
57
+
58
class RunConfig(RunConfigProperties):
    """
    A complete, runnable configuration: the inherited run properties plus the task.

    Only the input is missing. Re-running the same RunConfig with the same
    input should issue the same calls to the model (the output may still vary
    because models are non-deterministic).

    In other words: task, model, provider, prompt, etc.
    """

    # Forward reference: Task is declared further down in this module.
    task: "Task" = Field(
        description="The task to run.",
    )
68
+
69
+
70
class TaskRunConfig(KilnParentedModel):
    """
    A persisted run config, stored in a Kiln Project underneath a task.

    Mostly used to record one way of running a task so it can be evaluated.

    It covers everything needed to run the task other than the input. Running
    the same config with the same input should produce identical model calls
    (outputs may vary as models are non-deterministic).
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        description="The description of the task run config.",
        default=None,
    )
    run_config_properties: RunConfigProperties = Field(
        description="The run config properties to use for this task run.",
    )
    # run_config_properties.prompt_id names the prompt to use for the run, but
    # some prompt IDs are dynamic and we want the prompt to stay exactly the
    # same. A "frozen" copy of the prompt can be stored here, with prompt_id
    # then pointing at this frozen prompt.
    prompt: BasePrompt | None = Field(
        description="A prompt to use for run config.",
        default=None,
    )

    def parent_task(self) -> Union["Task", None]:
        """Return the parent typed as a Task, or None when there is no parent or it is not a Task.

        The class is matched by name so Task does not have to be imported/resolved here.
        """
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Task":
            return owner  # type: ignore
        return None

    def run_config(self) -> RunConfig:
        """Build a RunConfig from the stored properties and the parent task.

        Raises:
            ValueError: if this config is not parented to a task.
        """
        task = self.parent_task()
        if task is None:
            raise ValueError("Run config must be parented to a task")

        props = self.run_config_properties
        return RunConfig(
            task=task,
            model_name=props.model_name,
            model_provider_name=props.model_provider_name,
            prompt_id=props.prompt_id,
        )
110
+
111
+
112
class Task(
    KilnParentedModel,
    KilnParentModel,
    parent_of={
        "runs": TaskRun,
        "dataset_splits": DatasetSplit,
        "finetunes": Finetune,
        "prompts": Prompt,
        "evals": Eval,
        "run_configs": TaskRunConfig,
    },
):
    """
    A task to be performed, together with its requirements and input/output schemas.

    A task is the parent of its runs, dataset splits, fine-tunes, prompts,
    evals and run configs.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
        default=None,
    )
    instruction: str = Field(
        description="The instructions for the task. Will be used in prompts/training/validation.",
        min_length=1,
    )
    requirements: List[TaskRequirement] = Field(default=[])
    output_json_schema: JsonObjectSchema | None = None
    input_json_schema: JsonObjectSchema | None = None
    thinking_instruction: str | None = Field(
        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
        default=None,
    )

    def output_schema(self) -> Dict | None:
        """Return the output JSON schema parsed from its string form, or None if no schema is set."""
        raw_schema = self.output_json_schema
        return None if raw_schema is None else schema_from_json_str(raw_schema)

    def input_schema(self) -> Dict | None:
        """Return the input JSON schema parsed from its string form, or None if no schema is set."""
        raw_schema = self.input_json_schema
        return None if raw_schema is None else schema_from_json_str(raw_schema)

    # The accessors below only forward to the parent class; they exist to give
    # the child collections precise return types for type checkers.
    # TODO P2: fix this in KilnParentModel
    def runs(self, readonly: bool = False) -> list[TaskRun]:
        """Typed accessor for the task's child runs."""
        return super().runs(readonly=readonly)  # type: ignore

    def dataset_splits(self, readonly: bool = False) -> list[DatasetSplit]:
        """Typed accessor for the task's child dataset splits."""
        return super().dataset_splits(readonly=readonly)  # type: ignore

    def finetunes(self, readonly: bool = False) -> list[Finetune]:
        """Typed accessor for the task's child fine-tunes."""
        return super().finetunes(readonly=readonly)  # type: ignore

    def prompts(self, readonly: bool = False) -> list[Prompt]:
        """Typed accessor for the task's child prompts."""
        return super().prompts(readonly=readonly)  # type: ignore

    def evals(self, readonly: bool = False) -> list[Eval]:
        """Typed accessor for the task's child evals."""
        return super().evals(readonly=readonly)  # type: ignore

    def run_configs(self, readonly: bool = False) -> list[TaskRunConfig]:
        """Typed accessor for the task's child run configs."""
        return super().run_configs(readonly=readonly)  # type: ignore

    def parent_project(self) -> Union["Project", None]:
        """Return the parent typed as a Project, or None when there is no parent or it is not a Project.

        The class is matched by name so Project does not have to be imported at runtime.
        """
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Project":
            return owner  # type: ignore
        return None
@@ -0,0 +1,321 @@
1
+ import json
2
+ from enum import Enum
3
+ from typing import TYPE_CHECKING, Dict, List, Type, Union
4
+
5
+ import jsonschema
6
+ import jsonschema.exceptions
7
+ from pydantic import BaseModel, Field, ValidationInfo, model_validator
8
+ from typing_extensions import Self
9
+
10
+ from kiln_ai.datamodel.basemodel import ID_TYPE, KilnBaseModel
11
+ from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
12
+ from kiln_ai.datamodel.json_schema import validate_schema
13
+ from kiln_ai.datamodel.strict_mode import strict_mode
14
+ from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
15
+
16
+ if TYPE_CHECKING:
17
+ from kiln_ai.datamodel.task import Task
18
+
19
+
20
class RequirementRating(BaseModel):
    """The score given to one requirement of a task output, along with its rating type."""

    # Numeric score; how to read it is determined by `type`.
    value: float = Field(
        description="The rating value. Interpretation depends on rating type",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating",
    )
27
+
28
+
29
def normalize_rating(rating: float, rating_type: TaskOutputRatingType) -> float:
    """
    Map a rating onto a 0-1 scale using plain linear scaling (not a z-score).

    - five_star: 1..5 is mapped to 0..1
    - pass_fail: 0..1 is returned unchanged
    - pass_fail_critical: -1..1 is mapped to 0..1
    - custom: cannot be normalized

    Raises:
        ValueError: if the rating lies outside the range of its type, or the
            type is custom.
    """
    if rating_type == TaskOutputRatingType.five_star:
        if rating < 1 or rating > 5:
            raise ValueError("Five star rating must be between 1 and 5")
        return (rating - 1) / 4

    if rating_type == TaskOutputRatingType.pass_fail:
        if rating < 0 or rating > 1:
            raise ValueError("Pass fail rating must 0 to 1")
        return rating

    if rating_type == TaskOutputRatingType.pass_fail_critical:
        if rating < -1 or rating > 1:
            raise ValueError("Pass fail critical rating must -1 to 1")
        # Shift -1..1 up to 0..2, then halve.
        return (rating + 1) / 2

    if rating_type == TaskOutputRatingType.custom:
        raise ValueError("Custom rating type can not be normalized")

    # No known rating type matched: hand over to the exhaustiveness helper.
    raise_exhaustive_enum_error(rating_type)
48
+
49
+
50
class TaskOutputRating(KilnBaseModel):
    """
    A rating for a task output, including an overall rating and ratings for each requirement.

    Supports:
    - five_star: 1-5 star ratings
    - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
    - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
    """

    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
    value: float | None = Field(
        description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
        default=None,
    )
    requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
        default={},
        description="The ratings of the requirements of the task.",
    )

    # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
    @model_validator(mode="before")
    @classmethod
    def upgrade_old_format(cls, data: dict) -> dict:
        """
        Convert the legacy `requirement_ratings` format (ID -> number) to the
        current one (ID -> {"value", "type"}).

        Input that is not a dict, or whose `requirement_ratings` is empty, not
        a dict, or not made up purely of numbers, is returned untouched so the
        normal field validation can accept or reject it. The caller's dict is
        never modified; an upgraded shallow copy is returned instead.
        """
        if not isinstance(data, dict):
            return data

        req_ratings = data.get("requirement_ratings", {})
        # Only a non-empty mapping can be in the old format. Anything else is
        # left for field validation to report, rather than failing here on
        # `.values()`.
        if not isinstance(req_ratings, dict) or not req_ratings:
            return data

        # Check if we have the old format (dict of floats)
        if not all(isinstance(v, (int, float)) for v in req_ratings.values()):
            return data

        # Convert each float to a RequirementRating-shaped dict.
        # All ratings were five star at the point we used this format.
        upgraded = {
            k: {"value": v, "type": TaskOutputRatingType.five_star}
            for k, v in req_ratings.items()
        }
        # Return a shallow copy instead of mutating the caller's dict.
        return {**data, "requirement_ratings": upgraded}

    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
    def is_high_quality(self) -> bool:
        """
        Return True if the overall rating marks the output as high quality:
        4 or more stars for five_star, exactly 1.0 for pass_fail and
        pass_fail_critical. Unrated outputs and other rating types return False.
        """
        if self.value is None:
            return False

        if self.type == TaskOutputRatingType.five_star:
            return self.value >= 4
        elif self.type == TaskOutputRatingType.pass_fail:
            return self.value == 1.0
        elif self.type == TaskOutputRatingType.pass_fail_critical:
            return self.value == 1.0
        return False

    @model_validator(mode="after")
    def validate_rating(self) -> Self:
        """
        Validate the overall rating (when set) and every requirement rating
        against the rules of its own rating type.

        Raises:
            ValueError: if the type is invalid or any rating value is not
                allowed for its type.
        """
        if self.type not in TaskOutputRatingType:
            raise ValueError(f"Invalid rating type: {self.type}")

        # Overall rating is optional
        if self.value is not None:
            self._validate_rating(self.type, self.value, "overall rating")

        for req_id, req_rating in self.requirement_ratings.items():
            self._validate_rating(
                req_rating.type,
                req_rating.value,
                f"requirement rating for req ID: {req_id}",
            )

        return self

    def _validate_rating(
        self, type: TaskOutputRatingType, rating: float | None, rating_name: str
    ) -> None:
        """
        Dispatch to the validator for the given rating type.

        Types other than five_star, pass_fail and pass_fail_critical (such as
        custom) are not checked here. `rating_name` is only used to build the
        error message.
        """
        if type == TaskOutputRatingType.five_star:
            self._validate_five_star(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail:
            self._validate_pass_fail(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail_critical:
            self._validate_pass_fail_critical(rating, rating_name)

    def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
        """Require a whole-number float between 1 and 5, else raise ValueError."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
            )
        if rating < 1 or rating > 5:
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
            )

    def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
        """Require a whole-number float equal to 0 or 1, else raise ValueError."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
            )
        if rating not in [0, 1]:
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
            )

    def _validate_pass_fail_critical(
        self, rating: float | None, rating_name: str
    ) -> None:
        """Require a whole-number float equal to -1, 0 or 1, else raise ValueError."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
            )
        if rating not in [-1, 0, 1]:
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
            )
162
+
163
+
164
class DataSourceType(str, Enum):
    """
    Where a piece of data came from.

    human: the data was created by a person
    synthetic: the data was created by a model
    """

    # Members are also plain strings, since the enum mixes in `str`.
    human = "human"
    synthetic = "synthetic"
174
+
175
+
176
class DataSourceProperty(BaseModel):
    """
    Describes one property that may be attached to a data source.

    Besides the property's name and expected Python type, it lists the data
    source types for which the property is mandatory and those for which it
    is forbidden.
    """

    # Key of the property, and the Python type its value must have.
    name: str
    type: Type[Union[str, int, float]]

    # Source types that must / must not carry this property.
    required_for: List[DataSourceType] = []
    not_allowed_for: List[DataSourceType] = []
188
+
189
+
190
class DataSource(BaseModel):
    """
    Where a piece of data originated: a human or a model (synthetic).

    The accepted properties depend on the source type. Synthetic sources
    describe the model that produced the data; human sources record who
    created it.
    """

    type: DataSourceType
    properties: Dict[str, str | int | float] = Field(
        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
        default={},
    )

    # Rule table consulted by validate_properties: the expected type of each
    # known property and the source types that require or forbid it.
    _data_source_properties = [
        DataSourceProperty(
            name="created_by",
            type=str,
            required_for=[DataSourceType.human],
            not_allowed_for=[DataSourceType.synthetic],
        ),
        DataSourceProperty(
            name="model_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="model_provider",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="adapter_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        # Legacy field -- old runs may still carry it, but new data should not set it.
        DataSourceProperty(
            name="prompt_builder_name",
            type=str,
            not_allowed_for=[DataSourceType.human],
        ),
        # The PromptId of the prompt: a saved prompt, fine-tune, generator
        # name, etc. See the PromptId type for details.
        DataSourceProperty(
            name="prompt_id",
            type=str,
            not_allowed_for=[DataSourceType.human],
        ),
    ]

    @model_validator(mode="after")
    def validate_type(self) -> "DataSource":
        """Reject a type that is not a member of DataSourceType."""
        if self.type in DataSourceType:
            return self
        raise ValueError(f"Invalid data source type: {self.type}")

    @model_validator(mode="after")
    def validate_properties(self) -> "DataSource":
        """
        Check the supplied properties against the rule table.

        For each known property, in order: a present value must have the
        expected type; a property required for this source type must be
        present; a property forbidden for this source type must be absent.

        Raises:
            ValueError: on the first rule that is violated.
        """
        for spec in self._data_source_properties:
            present = spec.name in self.properties

            # A supplied value has to be of the declared type.
            if present and not isinstance(self.properties[spec.name], spec.type):
                raise ValueError(
                    f"'{spec.name}' must be of type {spec.type.__name__} for {self.type} data source"
                )

            if self.type in spec.required_for:
                # Mandatory for this kind of source.
                if not present:
                    raise ValueError(
                        f"'{spec.name}' is required for {self.type} data source"
                    )
            elif present and self.type in spec.not_allowed_for:
                # Forbidden for this kind of source.
                raise ValueError(
                    f"'{spec.name}' is not allowed for {self.type} data source"
                )
        return self

    @model_validator(mode="after")
    def validate_no_empty_properties(self) -> Self:
        """Reject any property whose value is an empty string."""
        for key, val in self.properties.items():
            if not isinstance(val, str):
                continue
            if val == "":
                raise ValueError(
                    f"Property '{key}' must be a non-empty string for {self.type} data source"
                )
        return self
279
+
280
+
281
class TaskOutput(KilnBaseModel):
    """
    An output for a specific task run.

    Contains the actual output content, its source (human or synthetic),
    and optional rating information.
    """

    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    source: DataSource | None = Field(
        description="The source of the output: human or synthetic.",
        default=None,
    )
    rating: TaskOutputRating | None = Field(
        default=None, description="The rating of the output"
    )

    def validate_output_format(self, task: "Task") -> Self:
        """
        Check this output against the task's output JSON schema, if it has one.

        Nothing is checked when the task has no output schema.

        Args:
            task: The task whose `output_json_schema` the output must match.

        Returns:
            This instance, unchanged.

        Raises:
            ValueError: if the output is not valid JSON or does not match the
                schema. The underlying error is attached as `__cause__`.
        """
        if task.output_json_schema is None:
            return self

        try:
            validate_schema(json.loads(self.output), task.output_json_schema)
        except json.JSONDecodeError as e:
            # Chain the cause so the position of the JSON error is not lost.
            raise ValueError("Output is not a valid JSON object") from e
        except jsonschema.exceptions.ValidationError as e:
            raise ValueError(f"Output does not match task output schema: {e}") from e
        return self

    @model_validator(mode="after")
    def validate_output_source(self, info: ValidationInfo) -> Self:
        """
        Require a source for newly created outputs when strict mode is enabled.

        Data loaded from file is accepted even without a source: we want to be
        able to load any data, even if it's not perfect, but create perfect
        data when adding new data.

        Raises:
            ValueError: in strict mode, for a model not loaded from file whose
                source is None.
        """
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.source is None:
            raise ValueError("Output source is required when strict mode is enabled")
        return self
@@ -0,0 +1,164 @@
1
+ import json
2
+ from typing import TYPE_CHECKING, Dict, List, Union
3
+
4
+ import jsonschema
5
+ import jsonschema.exceptions
6
+ from pydantic import Field, ValidationInfo, model_validator
7
+ from typing_extensions import Self
8
+
9
+ from kiln_ai.datamodel.basemodel import KilnParentedModel
10
+ from kiln_ai.datamodel.json_schema import validate_schema
11
+ from kiln_ai.datamodel.strict_mode import strict_mode
12
+ from kiln_ai.datamodel.task_output import DataSource, TaskOutput
13
+
14
+ if TYPE_CHECKING:
15
+ from kiln_ai.datamodel.task import Task
16
+
17
+
18
class TaskRun(KilnParentedModel):
    """
    Represents a single execution of a Task.

    Contains the input used, its source, the output produced, and optional
    repair information if the output needed correction.
    """

    input: str = Field(
        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    input_source: DataSource | None = Field(
        default=None, description="The source of the input: human or synthetic."
    )

    output: TaskOutput = Field(description="The output of the task run.")
    repair_instructions: str | None = Field(
        default=None,
        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
    )
    repaired_output: TaskOutput | None = Field(
        default=None,
        description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
    )
    tags: List[str] = Field(
        default=[],
        description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
    )

    def has_thinking_training_data(self) -> bool:
        """
        Does this run have thinking data that we can use to train a thinking model?

        True when the intermediate outputs contain a "chain_of_thought" or
        "reasoning" entry.
        """
        if self.intermediate_outputs is None:
            return False
        return (
            "chain_of_thought" in self.intermediate_outputs
            or "reasoning" in self.intermediate_outputs
        )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        """Return the parent typed as a Task, or None if there is no parent or it is not a Task."""
        if self.parent is None or self.parent.__class__.__name__ != "Task":
            return None
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_input_format(self, info: ValidationInfo) -> Self:
        """
        Check the input against the parent task's input JSON schema, if any.

        Validation is skipped while loading from file, when the input has not
        changed since it was last validated, and when there is no parent task
        yet.

        Raises:
            ValueError: if the input is not valid JSON or does not match the
                schema. The underlying error is attached as `__cause__`.
        """
        # Don't validate if loading from file (not new). Too slow.
        # We don't allow changing task schema, so this is redundant validation.
        # Note: we still validate if editing a loaded model
        if self.loading_from_file(info):
            # Consider loading an existing model as validated.
            self._last_validated_input = self.input
            return self

        # Don't validate if input has not changed. Too slow to run this every time.
        if (
            hasattr(self, "_last_validated_input")
            and self.input == self._last_validated_input
        ):
            return self

        task = self.parent_task()
        if task is None:
            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
            return self

        # validate input
        if task.input_json_schema is not None:
            try:
                validate_schema(json.loads(self.input), task.input_json_schema)
            except json.JSONDecodeError as e:
                # Chain the cause so the position of the JSON error is not lost.
                raise ValueError("Input is not a valid JSON object") from e
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Input does not match task input schema: {e}") from e
        self._last_validated_input = self.input
        return self

    @model_validator(mode="after")
    def validate_output_format(self, info: ValidationInfo) -> Self:
        """
        Check the output against the parent task via TaskOutput.validate_output_format.

        Validation is skipped while loading from file, when the output text
        has not changed since it was last validated, and when there is no
        parent task yet.
        """
        # Don't validate if loading from file (not new). Too slow.
        # Note: we still validate if editing a loaded model's output.
        if self.loading_from_file(info):
            # Consider loading an existing model as validated.
            self._last_validated_output = self.output.output if self.output else None
            return self

        # Don't validate unless output has changed since last validation.
        # The validator is slow and costly, don't want it running when setting other fields.
        if (
            hasattr(self, "_last_validated_output")
            and self.output is not None
            and self.output.output == self._last_validated_output
        ):
            return self

        task = self.parent_task()
        if task is None:
            return self

        self.output.validate_output_format(task)
        self._last_validated_output = self.output.output if self.output else None
        return self

    @model_validator(mode="after")
    def validate_repaired_output(self) -> Self:
        """
        Enforce the repair rules: a repaired output must not carry a rating,
        and repair instructions and a repaired output must be given together.

        Raises:
            ValueError: if any of those rules is broken.
        """
        has_repair = self.repaired_output is not None
        has_instructions = self.repair_instructions is not None

        if has_repair and self.repaired_output.rating is not None:
            raise ValueError(
                "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
            )
        if has_repair and not has_instructions:
            raise ValueError(
                "Repair instructions are required if providing a repaired output."
            )
        if has_instructions and not has_repair:
            raise ValueError(
                "A repaired output is required if providing repair instructions."
            )
        return self

    @model_validator(mode="after")
    def validate_input_source(self, info: ValidationInfo) -> Self:
        """
        Require an input source for newly created runs when strict mode is enabled.

        Data loaded from file is accepted even without one: we want to be able
        to load any data, even if it's not perfect, but create perfect data
        when adding new data.

        Raises:
            ValueError: in strict mode, for a model not loaded from file whose
                input_source is None.
        """
        if not strict_mode():
            return self
        if self.loaded_from_file(info):
            return self
        if self.input_source is None:
            raise ValueError("input_source is required when strict mode is enabled")
        return self

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """Reject tags that are empty or contain a space.

        Raises:
            ValueError: on the first offending tag.
        """
        for tag in self.tags:
            if not tag:
                raise ValueError("Tags cannot be empty strings")
            if " " in tag:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self