kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic. Click here for more details.
- kiln_ai/adapters/__init__.py +7 -7
- kiln_ai/adapters/adapter_registry.py +81 -10
- kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
- kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
- kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
- kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
- kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
- kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +114 -22
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
- kiln_ai/adapters/ml_model_list.py +434 -93
- kiln_ai/adapters/model_adapters/__init__.py +18 -0
- kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
- kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
- kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
- kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/parsers/__init__.py +10 -0
- kiln_ai/adapters/parsers/base_parser.py +12 -0
- kiln_ai/adapters/parsers/json_parser.py +37 -0
- kiln_ai/adapters/parsers/parser_registry.py +19 -0
- kiln_ai/adapters/parsers/r1_parser.py +69 -0
- kiln_ai/adapters/parsers/test_json_parser.py +81 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
- kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
- kiln_ai/adapters/prompt_builders.py +193 -49
- kiln_ai/adapters/provider_tools.py +91 -36
- kiln_ai/adapters/repair/repair_task.py +18 -19
- kiln_ai/adapters/repair/test_repair_task.py +7 -7
- kiln_ai/adapters/run_output.py +11 -0
- kiln_ai/adapters/test_adapter_registry.py +177 -0
- kiln_ai/adapters/test_generate_docs.py +69 -0
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +25 -18
- kiln_ai/adapters/test_prompt_builders.py +265 -44
- kiln_ai/adapters/test_provider_tools.py +268 -46
- kiln_ai/datamodel/__init__.py +51 -772
- kiln_ai/datamodel/basemodel.py +31 -11
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +14 -3
- kiln_ai/datamodel/model_cache.py +8 -3
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +80 -2
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +127 -6
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +34 -17
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_model_cache.py +24 -0
- kiln_ai/datamodel/test_model_perf.py +125 -0
- kiln_ai/datamodel/test_models.py +131 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- kiln_ai/utils/exhaustive_error.py +6 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai/adapters/base_adapter.py +0 -191
- kiln_ai/adapters/langchain_adapters.py +0 -256
- kiln_ai-0.8.1.dist-info/RECORD +0 -58
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Dict, List, Union
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from kiln_ai.datamodel import Finetune
|
|
6
|
+
from kiln_ai.datamodel.basemodel import (
|
|
7
|
+
ID_FIELD,
|
|
8
|
+
ID_TYPE,
|
|
9
|
+
NAME_FIELD,
|
|
10
|
+
SHORT_NAME_FIELD,
|
|
11
|
+
KilnParentedModel,
|
|
12
|
+
KilnParentModel,
|
|
13
|
+
)
|
|
14
|
+
from kiln_ai.datamodel.datamodel_enums import Priority, TaskOutputRatingType
|
|
15
|
+
from kiln_ai.datamodel.dataset_split import DatasetSplit
|
|
16
|
+
from kiln_ai.datamodel.eval import Eval
|
|
17
|
+
from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
|
|
18
|
+
from kiln_ai.datamodel.prompt import BasePrompt, Prompt
|
|
19
|
+
from kiln_ai.datamodel.prompt_id import PromptId
|
|
20
|
+
from kiln_ai.datamodel.task_run import TaskRun
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from kiln_ai.datamodel.project import Project
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TaskRequirement(BaseModel):
    """
    A single requirement that the outputs of a task are expected to meet.

    Carries an identifier, a short name, an optional description, the
    instruction explaining how to satisfy the requirement, a priority, and
    the rating type (five_star, pass_fail, pass_fail_critical, custom) used
    when scoring outputs against it.
    """

    id: ID_TYPE = ID_FIELD
    name: str = SHORT_NAME_FIELD
    # Optional free-form description; absent by default.
    description: str | None = Field(
        default=None,
    )
    # The instruction must be a non-empty string.
    instruction: str = Field(
        min_length=1,
    )
    # Priority defaults to p2.
    priority: Priority = Field(
        default=Priority.p2,
    )
    # Rating type defaults to five_star.
    type: TaskOutputRatingType = Field(
        default=TaskOutputRatingType.five_star,
    )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class RunConfigProperties(BaseModel):
    """
    The settings that describe how a task should be run.

    Holds everything needed to run a task apart from the input and the task
    ID. Two runs using the same properties and the same input should issue
    identical calls to the model (the outputs can still differ because models
    are non-deterministic).
    """

    model_name: str = Field(
        description="The model to use for this run config.",
    )
    model_provider_name: str = Field(
        description="The provider to use for this run config.",
    )
    prompt_id: PromptId = Field(
        description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class RunConfig(RunConfigProperties):
    """
    A complete, runnable configuration: RunConfigProperties plus the task.

    Contains everything needed to run a task except the input, for example
    the task, model, provider and prompt. Running the same RunConfig with the
    same input should make identical calls to the model (output may vary as
    models are non-deterministic).
    """

    # Forward reference: Task is declared later in this module.
    task: "Task" = Field(
        description="The task to run.",
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class TaskRunConfig(KilnParentedModel):
    """
    A persisted run config, stored in a Kiln Project nested under a task.

    Typically used to save a method of running a task so it can be evaluated.

    A run config includes everything needed to run a task except the input.
    Running the same RunConfig with the same input should make identical
    calls to the model (output may vary as models are non-deterministic).
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="The description of the task run config.",
    )
    run_config_properties: RunConfigProperties = Field(
        description="The run config properties to use for this task run.",
    )
    # run_config_properties.prompt_id names the prompt to use for this run.
    # Some prompt IDs are dynamic, yet the prompt must stay perfectly
    # consistent. When a prompt needs to be "frozen", the frozen copy is
    # stored here and prompt_id is pointed at it.
    prompt: BasePrompt | None = Field(
        default=None,
        description="A prompt to use for run config.",
    )

    def parent_task(self) -> Union["Task", None]:
        """
        Return the parent as a Task, or None when there is no Task parent.

        The parent's class is checked by name; this is a workaround so the
        typed parent can be returned without importing Task.
        """
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Task":
            return owner  # type: ignore
        return None

    def run_config(self) -> RunConfig:
        """
        Build a RunConfig from the stored properties and the parent task.

        Raises:
            ValueError: if this run config is not parented to a task.
        """
        task = self.parent_task()
        if task is None:
            raise ValueError("Run config must be parented to a task")

        props = self.run_config_properties
        return RunConfig(
            task=task,
            model_name=props.model_name,
            model_provider_name=props.model_provider_name,
            prompt_id=props.prompt_id,
        )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class Task(
    KilnParentedModel,
    KilnParentModel,
    parent_of={
        "runs": TaskRun,
        "dataset_splits": DatasetSplit,
        "finetunes": Finetune,
        "prompts": Prompt,
        "evals": Eval,
        "run_configs": TaskRunConfig,
    },
):
    """
    A specific task to be performed, with its requirements and schemas.

    Holds the task definition (name, description, instruction), the list of
    requirements, optional input/output JSON schemas, and acts as the parent
    of runs, dataset splits, finetunes, prompts, evals and run configs.
    """

    name: str = NAME_FIELD
    description: str | None = Field(
        default=None,
        description="A description of the task for you and your team. Will not be used in prompts/training/validation.",
    )
    instruction: str = Field(
        min_length=1,
        description="The instructions for the task. Will be used in prompts/training/validation.",
    )
    requirements: List[TaskRequirement] = Field(default=[])
    # The JSON schemas are optional; None means no schema is defined.
    output_json_schema: JsonObjectSchema | None = None
    input_json_schema: JsonObjectSchema | None = None
    thinking_instruction: str | None = Field(
        default=None,
        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting.",
    )

    def output_schema(self) -> Dict | None:
        """Return the parsed output schema, or None when no schema is set."""
        raw_schema = self.output_json_schema
        return None if raw_schema is None else schema_from_json_str(raw_schema)

    def input_schema(self) -> Dict | None:
        """Return the parsed input schema, or None when no schema is set."""
        raw_schema = self.input_json_schema
        return None if raw_schema is None else schema_from_json_str(raw_schema)

    # The accessors below only forward to the parent class; they exist so the
    # return values carry precise types. TODO P2: fix this in KilnParentModel
    def runs(self, readonly: bool = False) -> list[TaskRun]:
        """Typed wrapper around the parent class's `runs` accessor."""
        return super().runs(readonly=readonly)  # type: ignore

    def dataset_splits(self, readonly: bool = False) -> list[DatasetSplit]:
        """Typed wrapper around the parent class's `dataset_splits` accessor."""
        return super().dataset_splits(readonly=readonly)  # type: ignore

    def finetunes(self, readonly: bool = False) -> list[Finetune]:
        """Typed wrapper around the parent class's `finetunes` accessor."""
        return super().finetunes(readonly=readonly)  # type: ignore

    def prompts(self, readonly: bool = False) -> list[Prompt]:
        """Typed wrapper around the parent class's `prompts` accessor."""
        return super().prompts(readonly=readonly)  # type: ignore

    def evals(self, readonly: bool = False) -> list[Eval]:
        """Typed wrapper around the parent class's `evals` accessor."""
        return super().evals(readonly=readonly)  # type: ignore

    def run_configs(self, readonly: bool = False) -> list[TaskRunConfig]:
        """Typed wrapper around the parent class's `run_configs` accessor."""
        return super().run_configs(readonly=readonly)  # type: ignore

    def parent_project(self) -> Union["Project", None]:
        """
        Return the parent as a Project, or None when there is no Project parent.

        The parent's class is checked by name; this is a workaround so the
        typed parent can be returned without importing Project at runtime.
        """
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Project":
            return owner  # type: ignore
        return None
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import TYPE_CHECKING, Dict, List, Type, Union
|
|
4
|
+
|
|
5
|
+
import jsonschema
|
|
6
|
+
import jsonschema.exceptions
|
|
7
|
+
from pydantic import BaseModel, Field, ValidationInfo, model_validator
|
|
8
|
+
from typing_extensions import Self
|
|
9
|
+
|
|
10
|
+
from kiln_ai.datamodel.basemodel import ID_TYPE, KilnBaseModel
|
|
11
|
+
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
|
|
12
|
+
from kiln_ai.datamodel.json_schema import validate_schema
|
|
13
|
+
from kiln_ai.datamodel.strict_mode import strict_mode
|
|
14
|
+
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from kiln_ai.datamodel.task import Task
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RequirementRating(BaseModel):
    """The rating given to one specific requirement of a task output."""

    # Numeric value; how it is read depends on `type`.
    value: float = Field(
        description="The rating value. Interpretation depends on rating type",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating",
    )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def normalize_rating(rating: float, rating_type: TaskOutputRatingType) -> float:
    """
    Map a rating onto a 0-1 scale (plain linear rescaling, not a z-score).

    - five_star: 1..5 is mapped to 0..1
    - pass_fail: 0..1 is returned unchanged
    - pass_fail_critical: -1..1 is mapped to 0..1

    Raises:
        ValueError: if the rating is outside the range of its type, or the
            type is custom (custom ratings cannot be normalized).
    """
    if rating_type == TaskOutputRatingType.five_star:
        if rating < 1 or rating > 5:
            raise ValueError("Five star rating must be between 1 and 5")
        return (rating - 1) / 4

    if rating_type == TaskOutputRatingType.pass_fail:
        if rating < 0 or rating > 1:
            raise ValueError("Pass fail rating must 0 to 1")
        return rating

    if rating_type == TaskOutputRatingType.pass_fail_critical:
        if rating < -1 or rating > 1:
            raise ValueError("Pass fail critical rating must -1 to 1")
        # Shift -1..1 up to 0..2, then halve.
        return (rating + 1) / 2

    if rating_type == TaskOutputRatingType.custom:
        raise ValueError("Custom rating type can not be normalized")

    # Any other value is handed to the exhaustive-enum helper.
    raise_exhaustive_enum_error(rating_type)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class TaskOutputRating(KilnBaseModel):
    """
    A rating for a task output: an optional overall rating plus a rating for
    each requirement.

    Supports:
    - five_star: 1-5 star ratings
    - pass_fail: boolean pass/fail (1.0 = pass, 0.0 = fail)
    - pass_fail_critical: tri-state (1.0 = pass, 0.0 = fail, -1.0 = critical fail)
    """

    type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star)
    value: float | None = Field(
        description="The rating value. Interpretation depends on rating type:\n- five_star: 1-5 stars\n- pass_fail: 1.0 (pass) or 0.0 (fail)\n- pass_fail_critical: 1.0 (pass), 0.0 (fail), or -1.0 (critical fail)",
        default=None,
    )
    requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
        default={},
        description="The ratings of the requirements of the task.",
    )

    # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
    @model_validator(mode="before")
    @classmethod
    def upgrade_old_format(cls, data: dict) -> dict:
        """
        Upgrade the legacy `requirement_ratings` format before validation.

        When `requirement_ratings` is a non-empty dict whose values are all
        plain numbers, each number is wrapped as a five_star rating entry.
        Any other input is returned untouched so that normal field validation
        can accept or reject it.

        The caller's dict is never modified; an upgraded shallow copy is
        returned instead.
        """
        if not isinstance(data, dict):
            return data

        req_ratings = data.get("requirement_ratings")
        # Only a non-empty dict can be in the old format. Other shapes (None,
        # lists, ...) are left for field validation rather than failing here
        # on a missing `.values()`.
        if not isinstance(req_ratings, dict) or not req_ratings:
            return data

        # Check if we have the old format (dict of floats)
        if not all(isinstance(v, (int, float)) for v in req_ratings.values()):
            return data

        # Convert each float to a RequirementRating-shaped dict.
        # All ratings were five star at the point we used this format.
        upgraded = dict(data)
        upgraded["requirement_ratings"] = {
            k: {"value": v, "type": TaskOutputRatingType.five_star}
            for k, v in req_ratings.items()
        }
        return upgraded

    # Used to select high quality outputs for example selection (MultiShotPromptBuilder, etc)
    def is_high_quality(self) -> bool:
        """
        Return True when the overall rating marks the output as high quality.

        five_star needs a value of at least 4; pass_fail and
        pass_fail_critical need exactly 1.0. A missing value or any other
        rating type is never high quality.
        """
        if self.value is None:
            return False

        if self.type == TaskOutputRatingType.five_star:
            return self.value >= 4
        elif self.type == TaskOutputRatingType.pass_fail:
            return self.value == 1.0
        elif self.type == TaskOutputRatingType.pass_fail_critical:
            return self.value == 1.0
        return False

    @model_validator(mode="after")
    def validate_rating(self) -> Self:
        """
        Validate the overall rating (when present) and every requirement rating.

        Raises:
            ValueError: if the rating type is invalid or any rating value is
                not allowed for its type.
        """
        if self.type not in TaskOutputRatingType:
            raise ValueError(f"Invalid rating type: {self.type}")

        # Overall rating is optional
        if self.value is not None:
            self._validate_rating(self.type, self.value, "overall rating")

        for req_id, req_rating in self.requirement_ratings.items():
            self._validate_rating(
                req_rating.type,
                req_rating.value,
                f"requirement rating for req ID: {req_id}",
            )

        return self

    def _validate_rating(
        self, type: TaskOutputRatingType, rating: float | None, rating_name: str
    ) -> None:
        """
        Dispatch to the validator for the given rating type.

        Types other than five_star, pass_fail and pass_fail_critical are not
        checked here.
        """
        if type == TaskOutputRatingType.five_star:
            self._validate_five_star(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail:
            self._validate_pass_fail(rating, rating_name)
        elif type == TaskOutputRatingType.pass_fail_critical:
            self._validate_pass_fail_critical(rating, rating_name)

    def _validate_five_star(self, rating: float | None, rating_name: str) -> None:
        """Require a whole-number float between 1 and 5; raise ValueError otherwise."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be an integer value (1-5)"
            )
        if rating < 1 or rating > 5:
            raise ValueError(
                f"{rating_name.capitalize()} of type five_star must be between 1 and 5 stars"
            )

    def _validate_pass_fail(self, rating: float | None, rating_name: str) -> None:
        """Require a whole-number float equal to 0 or 1; raise ValueError otherwise."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be an integer value (0 or 1)"
            )
        if rating not in [0, 1]:
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail must be 0 (fail) or 1 (pass)"
            )

    def _validate_pass_fail_critical(
        self, rating: float | None, rating_name: str
    ) -> None:
        """Require a whole-number float equal to -1, 0 or 1; raise ValueError otherwise."""
        if rating is None or not isinstance(rating, float) or not rating.is_integer():
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be an integer value (-1, 0, or 1)"
            )
        if rating not in [-1, 0, 1]:
            raise ValueError(
                f"{rating_name.capitalize()} of type pass_fail_critical must be -1 (critical fail), 0 (fail), or 1 (pass)"
            )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class DataSourceType(str, Enum):
    """
    Where a piece of data came from.

    human: the data was created by a human
    synthetic: the data was created by a model
    """

    human = "human"
    synthetic = "synthetic"
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class DataSourceProperty(BaseModel):
    """
    Describes one property that may be attached to a data source.

    Records the property's name and expected Python type, together with the
    data source types for which the property is required and those for which
    it is not allowed.
    """

    name: str
    # The Python type (str, int or float) a value of this property must have.
    type: Type[Union[str, int, float]]
    # Source types that must supply this property.
    required_for: List[DataSourceType] = []
    # Source types that must not supply this property.
    not_allowed_for: List[DataSourceType] = []
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class DataSource(BaseModel):
    """
    The origin of a piece of data: human or synthetic, plus its properties.

    Which properties are required or forbidden depends on the source type:
    synthetic sources carry model information, human sources carry creator
    information.
    """

    type: DataSourceType
    properties: Dict[str, str | int | float] = Field(
        default={},
        description="Properties describing the data source. For synthetic things like model. For human, the human's name.",
    )

    # Rules checked by validate_properties: the expected type of each known
    # property and the source types it is required / not allowed for.
    _data_source_properties = [
        DataSourceProperty(
            name="created_by",
            type=str,
            required_for=[DataSourceType.human],
            not_allowed_for=[DataSourceType.synthetic],
        ),
        DataSourceProperty(
            name="model_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="model_provider",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            name="adapter_name",
            type=str,
            required_for=[DataSourceType.synthetic],
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            # Legacy field -- allow loading from old runs, but we shouldn't be setting it.
            name="prompt_builder_name",
            type=str,
            not_allowed_for=[DataSourceType.human],
        ),
        DataSourceProperty(
            # The PromptId of the prompt. Can be a saved prompt, fine-tune, generator name, etc. See PromptId type for more details.
            name="prompt_id",
            type=str,
            not_allowed_for=[DataSourceType.human],
        ),
    ]

    @model_validator(mode="after")
    def validate_type(self) -> "DataSource":
        """Reject a `type` that is not a member of DataSourceType."""
        if self.type not in DataSourceType:
            raise ValueError(f"Invalid data source type: {self.type}")
        return self

    @model_validator(mode="after")
    def validate_properties(self) -> "DataSource":
        """
        Check `properties` against the rules in `_data_source_properties`.

        For every known property: a supplied value must have the declared
        type, the property must be present when required for this source
        type, and it must be absent when not allowed for this source type.

        Raises:
            ValueError: on the first rule that is violated.
        """
        for rule in self._data_source_properties:
            supplied = rule.name in self.properties

            # A supplied value must have the declared type.
            if supplied and not isinstance(self.properties[rule.name], rule.type):
                raise ValueError(
                    f"'{rule.name}' must be of type {rule.type.__name__} for {self.type} data source"
                )

            if self.type in rule.required_for:
                # Required properties must be supplied.
                if not supplied:
                    raise ValueError(
                        f"'{rule.name}' is required for {self.type} data source"
                    )
            elif supplied and self.type in rule.not_allowed_for:
                # Forbidden properties must not be supplied.
                raise ValueError(
                    f"'{rule.name}' is not allowed for {self.type} data source"
                )
        return self

    @model_validator(mode="after")
    def validate_no_empty_properties(self) -> Self:
        """Reject any property whose value is an empty string."""
        blank_key = next(
            (
                key
                for key, value in self.properties.items()
                if isinstance(value, str) and value == ""
            ),
            None,
        )
        if blank_key is not None:
            raise ValueError(
                f"Property '{blank_key}' must be a non-empty string for {self.type} data source"
            )
        return self
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class TaskOutput(KilnBaseModel):
    """
    The output produced for a specific task run.

    Holds the output content itself, where it came from (human or
    synthetic), and an optional rating.
    """

    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output.",
    )
    source: DataSource | None = Field(
        description="The source of the output: human or synthetic.",
        default=None,
    )
    rating: TaskOutputRating | None = Field(
        default=None,
        description="The rating of the output",
    )

    def validate_output_format(self, task: "Task") -> Self:
        """
        Check the output against the task's output JSON schema, if it has one.

        Raises:
            ValueError: if the output is not valid JSON or does not match
                the schema.
        """
        schema = task.output_json_schema
        if schema is None:
            # Nothing to check when the task defines no output schema.
            return self

        try:
            validate_schema(json.loads(self.output), schema)
        except json.JSONDecodeError:
            raise ValueError("Output is not a valid JSON object")
        except jsonschema.exceptions.ValidationError as e:
            raise ValueError(f"Output does not match task output schema: {e}")
        return self

    @model_validator(mode="after")
    def validate_output_source(self, info: ValidationInfo) -> Self:
        """
        Require `source` for newly created data when strict mode is on.

        The check is skipped outside strict mode and for models loaded from
        file: we want to be able to load any data, even if it's not perfect,
        but we want to create perfect data when adding new data.

        Raises:
            ValueError: in strict mode, for non-file data without a source.
        """
        if not strict_mode() or self.loaded_from_file(info):
            return self
        if self.source is None:
            raise ValueError("Output source is required when strict mode is enabled")
        return self
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, List, Union
|
|
3
|
+
|
|
4
|
+
import jsonschema
|
|
5
|
+
import jsonschema.exceptions
|
|
6
|
+
from pydantic import Field, ValidationInfo, model_validator
|
|
7
|
+
from typing_extensions import Self
|
|
8
|
+
|
|
9
|
+
from kiln_ai.datamodel.basemodel import KilnParentedModel
|
|
10
|
+
from kiln_ai.datamodel.json_schema import validate_schema
|
|
11
|
+
from kiln_ai.datamodel.strict_mode import strict_mode
|
|
12
|
+
from kiln_ai.datamodel.task_output import DataSource, TaskOutput
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from kiln_ai.datamodel.task import Task
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TaskRun(KilnParentedModel):
    """
    One execution of a Task.

    Stores the input and where it came from, the output that was produced,
    and optional repair information for outputs that needed correcting.
    """

    input: str = Field(
        description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input.",
    )
    input_source: DataSource | None = Field(
        default=None,
        description="The source of the input: human or synthetic.",
    )

    output: TaskOutput = Field(description="The output of the task run.")
    repair_instructions: str | None = Field(
        default=None,
        description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
    )
    repaired_output: TaskOutput | None = Field(
        default=None,
        description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data.",
    )
    tags: List[str] = Field(
        default=[],
        description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
    )

    def has_thinking_training_data(self) -> bool:
        """
        Report whether this run has thinking data usable to train a thinking model.

        True when the intermediate outputs contain a "chain_of_thought" or
        "reasoning" entry.
        """
        steps = self.intermediate_outputs
        if steps is None:
            return False
        return any(key in steps for key in ("chain_of_thought", "reasoning"))

    def parent_task(self) -> Union["Task", None]:
        """
        Return the parent as a Task, or None when there is no Task parent.

        The parent's class is checked by name; this is a workaround so the
        typed parent can be returned without importing Task.
        """
        owner = self.parent
        if owner is not None and owner.__class__.__name__ == "Task":
            return owner  # type: ignore
        return None

    @model_validator(mode="after")
    def validate_input_format(self, info: ValidationInfo) -> Self:
        """
        Check the input against the parent task's input JSON schema.

        Validation is skipped while loading from file (too slow, and
        redundant because task schemas may not change), when the input has
        not changed since it was last validated, and when no parent task is
        available yet. Editing the input of a loaded model is still checked.

        Raises:
            ValueError: if the input is not valid JSON or does not match
                the task's input schema.
        """
        if self.loading_from_file(info):
            # Consider loading an existing model as validated.
            self._last_validated_input = self.input
            return self

        # Too slow to run this every time: skip when the input is unchanged.
        already_validated = (
            hasattr(self, "_last_validated_input")
            and self.input == self._last_validated_input
        )
        if already_validated:
            return self

        task = self.parent_task()
        if task is None:
            # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
            return self

        # validate input
        input_schema = task.input_json_schema
        if input_schema is not None:
            try:
                validate_schema(json.loads(self.input), input_schema)
            except json.JSONDecodeError:
                raise ValueError("Input is not a valid JSON object")
            except jsonschema.exceptions.ValidationError as e:
                raise ValueError(f"Input does not match task input schema: {e}")

        self._last_validated_input = self.input
        return self

    @model_validator(mode="after")
    def validate_output_format(self, info: ValidationInfo) -> Self:
        """
        Check the output against the parent task's output schema.

        Validation is skipped while loading from file, when the output text
        has not changed since it was last validated, and when no parent task
        is available. Editing the output of a loaded model is still checked.
        """
        if self.loading_from_file(info):
            # Consider loading an existing model as validated.
            self._last_validated_output = self.output.output if self.output else None
            return self

        # The validator is slow and costly; don't run it when only other
        # fields were set and the output is unchanged.
        already_validated = (
            hasattr(self, "_last_validated_output")
            and self.output is not None
            and self.output.output == self._last_validated_output
        )
        if already_validated:
            return self

        task = self.parent_task()
        if task is None:
            return self

        self.output.validate_output_format(task)
        self._last_validated_output = self.output.output if self.output else None
        return self

    @model_validator(mode="after")
    def validate_repaired_output(self) -> Self:
        """
        Enforce the rules linking `repaired_output` and `repair_instructions`.

        A repaired output must carry no rating, and the two fields must be
        provided together or not at all.

        Raises:
            ValueError: if any of these rules is broken.
        """
        repaired = self.repaired_output
        has_instructions = self.repair_instructions is not None

        if repaired is not None:
            if repaired.rating is not None:
                raise ValueError(
                    "Repaired output rating must be None. Repaired outputs are assumed to have a perfect rating, as they have been fixed."
                )
            if not has_instructions:
                raise ValueError(
                    "Repair instructions are required if providing a repaired output."
                )
        elif has_instructions:
            raise ValueError(
                "A repaired output is required if providing repair instructions."
            )
        return self

    @model_validator(mode="after")
    def validate_input_source(self, info: ValidationInfo) -> Self:
        """
        Require `input_source` for newly created data when strict mode is on.

        The check is skipped outside strict mode and for models loaded from
        file: we want to be able to load any data, even if it's not perfect,
        but we want to create perfect data when adding new data.

        Raises:
            ValueError: in strict mode, for non-file data without an input source.
        """
        if not strict_mode() or self.loaded_from_file(info):
            return self
        if self.input_source is None:
            raise ValueError("input_source is required when strict mode is enabled")
        return self

    @model_validator(mode="after")
    def validate_tags(self) -> Self:
        """
        Reject tags that are empty or contain a space.

        Raises:
            ValueError: for the first offending tag.
        """
        for label in self.tags:
            if not label:
                raise ValueError("Tags cannot be empty strings")
            if " " in label:
                raise ValueError("Tags cannot contain spaces. Try underscores.")

        return self
|