kiln-ai 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic; see the registry's advisory page for details.
- kiln_ai/adapters/adapter_registry.py +12 -13
- kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
- kiln_ai/adapters/ml_model_list.py +141 -29
- kiln_ai/adapters/model_adapters/base_adapter.py +50 -35
- kiln_ai/adapters/model_adapters/langchain_adapters.py +27 -20
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -1
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +93 -50
- kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
- kiln_ai/adapters/model_adapters/test_langchain_adapter.py +7 -14
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +55 -64
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
- kiln_ai/adapters/model_adapters/test_structured_output.py +36 -30
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/prompt_builders.py +80 -42
- kiln_ai/adapters/repair/repair_task.py +9 -21
- kiln_ai/adapters/repair/test_repair_task.py +3 -3
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +10 -10
- kiln_ai/adapters/test_generate_docs.py +6 -6
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +17 -14
- kiln_ai/adapters/test_prompt_builders.py +91 -31
- kiln_ai/datamodel/__init__.py +50 -952
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +6 -0
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +10 -11
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +32 -8
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +9 -13
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_models.py +2 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +37 -1
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai-0.11.1.dist-info/RECORD +0 -76
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/datamodel_enums.py
@@ -0,0 +1,58 @@
+from enum import Enum, IntEnum
+
+
+class Priority(IntEnum):
+    """Defines priority levels for tasks and requirements, where P0 is highest priority."""
+
+    p0 = 0
+    p1 = 1
+    p2 = 2
+    p3 = 3
+
+
+# Only one rating type for now, but this allows for extensibility if we want to add more in the future
+class TaskOutputRatingType(str, Enum):
+    """Defines the types of rating systems available for task outputs."""
+
+    five_star = "five_star"
+    pass_fail = "pass_fail"
+    pass_fail_critical = "pass_fail_critical"
+    custom = "custom"
+
+
+class StructuredOutputMode(str, Enum):
+    """
+    Enumeration of supported structured output modes.
+
+    - default: let the adapter decide
+    - json_schema: request json using API capabilities for json_schema
+    - function_calling: request json using API capabilities for function calling
+    - json_mode: request json using API's JSON mode, which should return valid JSON, but isn't checking/passing the schema
+    - json_instructions: append instructions to the prompt to request json matching the schema. No API capabilities are used. You should have a custom parser on these models as they will be returning strings.
+    - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries).
+    """
+
+    default = "default"
+    json_schema = "json_schema"
+    function_calling_weak = "function_calling_weak"
+    function_calling = "function_calling"
+    json_mode = "json_mode"
+    json_instructions = "json_instructions"
+    json_instruction_and_object = "json_instruction_and_object"
+
+
+class FineTuneStatusType(str, Enum):
+    """
+    The status type of a fine-tune (running, completed, failed, etc).
+    """
+
+    unknown = "unknown"  # server error
+    pending = "pending"
+    running = "running"
+    completed = "completed"
+    failed = "failed"
+
+
+class FinetuneDataStrategy(str, Enum):
+    final_only = "final_only"
+    final_and_intermediate = "final_and_intermediate"
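These enums mix in `str` and `IntEnum`, so members compare and serialize as plain primitives. A minimal sketch of how they behave (not part of the diff; the import path follows the new module above):

```python
from kiln_ai.datamodel.datamodel_enums import (
    FineTuneStatusType,
    Priority,
    StructuredOutputMode,
)

# IntEnum members order numerically, so P0 (highest priority) sorts first.
assert Priority.p0 < Priority.p3

# str-mixin members equal their string values, so they round-trip through JSON cleanly.
assert StructuredOutputMode.json_schema == "json_schema"
assert FineTuneStatusType("completed") is FineTuneStatusType.completed
```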
kiln_ai/datamodel/dataset_filters.py
@@ -0,0 +1,114 @@
+from enum import Enum
+from typing import Annotated, Protocol
+
+from pydantic import AfterValidator
+
+from kiln_ai.datamodel.task_run import TaskRun
+
+
+class DatasetFilter(Protocol):
+    """A protocol defining the interface for dataset filters.
+
+    This allows both stateless function-based filters and stateful class-based filters
+    to be used interchangeably, as long as they implement the __call__ method.
+    """
+
+    def __call__(self, task_run: TaskRun) -> bool:
+        """Return True if the task run should be included in the dataset."""
+        ...
+
+
+def AllDatasetFilter(_: TaskRun) -> bool:
+    return True
+
+
+def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
+    if task_run.output is None:
+        return False
+    if task_run.repaired_output is not None:
+        # Repairs always considered high quality
+        return True
+    if task_run.output.rating is None:
+        return False
+    return task_run.output.rating.is_high_quality()
+
+
+def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool:
+    """
+    A filter that returns True if the task has intermediate outputs we can training a 'thinking' model on (reasoning or chain of thought)
+    """
+    return task_run.has_thinking_training_data()
+
+
+def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool:
+    """
+    A filter that returns True if the task has thinking data and the output is high quality
+    """
+    return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run)
+
+
+class TagFilter:
+    """
+    A filter that returns True if the task has a tag matching the given tag.
+    """
+
+    def __init__(self, tag: str):
+        self.tag = tag
+
+    def __call__(self, task_run: TaskRun) -> bool:
+        return self.tag in task_run.tags
+
+
+class StaticDatasetFilters(str, Enum):
+    """Dataset filter names."""
+
+    ALL = "all"
+    HIGH_RATING = "high_rating"
+    THINKING_MODEL = "thinking_model"
+    THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated"
+
+
+static_dataset_filters = {
+    StaticDatasetFilters.ALL: AllDatasetFilter,
+    StaticDatasetFilters.HIGH_RATING: HighRatingDatasetFilter,
+    StaticDatasetFilters.THINKING_MODEL: ThinkingModelDatasetFilter,
+    StaticDatasetFilters.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter,
+}
+
+DatasetFilterId = Annotated[
+    str,
+    AfterValidator(lambda v: _check_dataset_filter_id(v)),
+]
+"""
+A pydantic type that validates strings containing a valid dataset filter ID.
+
+Dataset filter IDs can be one of:
+- A built-in dataset filter name
+- A tag::<tag> filter, where <tag> is a string
+"""
+
+
+def _check_dataset_filter_id(id: str) -> str:
+    """
+    Check that the dataset filter ID is valid.
+    """
+    if id in static_dataset_filters:
+        return id
+
+    if id.startswith("tag::") and len(id) > 5:
+        return id
+
+    raise ValueError(f"Invalid dataset filter ID: {id}")
+
+
+def dataset_filter_from_id(id: DatasetFilterId) -> DatasetFilter:
+    """
+    Get a dataset filter from an ID.
+    """
+    if id.startswith("tag::") and len(id) > 5:
+        return TagFilter(id[5:])
+
+    if id in static_dataset_filters:
+        return static_dataset_filters[id]
+
+    raise ValueError(f"Invalid dataset filter ID: {id}")
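A `DatasetFilterId` is either a built-in filter name or a `tag::<tag>` spec, and `dataset_filter_from_id` resolves it to a callable filter. A short usage sketch (not part of the diff; the tag name is a placeholder):

```python
from kiln_ai.datamodel.dataset_filters import TagFilter, dataset_filter_from_id

# Built-in filters resolve by name; "tag::<tag>" builds a stateful TagFilter.
high_rating_filter = dataset_filter_from_id("high_rating")
golden_filter = dataset_filter_from_id("tag::golden")
assert isinstance(golden_filter, TagFilter) and golden_filter.tag == "golden"

# Anything else is rejected, mirroring the DatasetFilterId validator.
try:
    dataset_filter_from_id("not_a_filter")
except ValueError as err:
    print(err)  # Invalid dataset filter ID: not_a_filter
```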
kiln_ai/datamodel/dataset_split.py
@@ -0,0 +1,170 @@
+"""
+Tools for splitting datasets into train/test/validation splits. Includes filters for selecting which task runs to include in each split.
+"""
+
+import math
+import random
+from typing import TYPE_CHECKING
+
+from pydantic import BaseModel, Field, model_validator
+
+from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel
+from kiln_ai.datamodel.dataset_filters import (
+    DatasetFilter,
+    DatasetFilterId,
+    dataset_filter_from_id,
+)
+
+if TYPE_CHECKING:
+    from kiln_ai.datamodel.task import Task
+
+
+class DatasetSplitDefinition(BaseModel):
+    """
+    A definition of a split in a dataset.
+
+    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
+    """
+
+    name: str = NAME_FIELD
+    description: str | None = Field(
+        default=None,
+        description="A description of the dataset for you and your team. Not used in training.",
+    )
+    percentage: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="The percentage of the dataset that this split represents (between 0 and 1).",
+    )
+
+
+AllSplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="all", percentage=1.0)
+]
+Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.8),
+    DatasetSplitDefinition(name="test", percentage=0.2),
+]
+Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.6),
+    DatasetSplitDefinition(name="test", percentage=0.2),
+    DatasetSplitDefinition(name="val", percentage=0.2),
+]
+Train80Test10Val10SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.8),
+    DatasetSplitDefinition(name="test", percentage=0.1),
+    DatasetSplitDefinition(name="val", percentage=0.1),
+]
+
+
+class DatasetSplit(KilnParentedModel):
+    """
+    A collection of task runs, with optional splits (train, test, validation).
+
+    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.
+
+    Maintains a list of IDs for each split, to avoid data duplication.
+    """
+
+    name: str = NAME_FIELD
+    description: str | None = Field(
+        default=None,
+        description="A description of the dataset for you and your team. Not used in training.",
+    )
+    splits: list[DatasetSplitDefinition] = Field(
+        default_factory=list,
+        description="The splits in the dataset.",
+    )
+    split_contents: dict[str, list[str]] = Field(
+        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
+    )
+    filter: DatasetFilterId | None = Field(
+        default=None,
+        description="The filter used to build the dataset.",
+    )
+
+    @model_validator(mode="after")
+    def validate_split_percentages(self) -> "DatasetSplit":
+        total = sum(split.percentage for split in self.splits)
+        if not math.isclose(total, 1.0, rel_tol=1e-9):
+            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
+        return self
+
+    @classmethod
+    def from_task(
+        cls,
+        name: str,
+        task: "Task",
+        splits: list[DatasetSplitDefinition],
+        filter_id: DatasetFilterId = "all",
+        description: str | None = None,
+    ):
+        """
+        Build a dataset split from a task.
+        """
+        filter = dataset_filter_from_id(filter_id)
+        split_contents = cls.build_split_contents(task, splits, filter)
+        return cls(
+            parent=task,
+            name=name,
+            description=description,
+            splits=splits,
+            split_contents=split_contents,
+            filter=filter_id,
+        )
+
+    @classmethod
+    def build_split_contents(
+        cls,
+        task: "Task",
+        splits: list[DatasetSplitDefinition],
+        filter: DatasetFilter,
+    ) -> dict[str, list[str]]:
+        valid_ids = []
+        for task_run in task.runs():
+            if filter(task_run):
+                valid_ids.append(task_run.id)
+
+        # Shuffle and split by split percentage
+        random.shuffle(valid_ids)
+        split_contents = {}
+        start_idx = 0
+        remaining_items = len(valid_ids)
+
+        # Handle all splits except the last one
+        for split in splits[:-1]:
+            split_size = round(len(valid_ids) * split.percentage)
+            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
+            start_idx += split_size
+            remaining_items -= split_size
+
+        # Last split gets all remaining items (for rounding)
+        if splits:
+            split_contents[splits[-1].name] = valid_ids[start_idx:]
+
+        return split_contents
+
+    def parent_task(self) -> "Task | None":
+        # inline import to avoid circular import
+        from kiln_ai.datamodel import Task
+
+        if not isinstance(self.parent, Task):
+            return None
+        return self.parent
+
+    def missing_count(self) -> int:
+        """
+        Returns:
+            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
+        """
+        parent = self.parent_task()
+        if parent is None:
+            raise ValueError("DatasetSplit has no parent task")
+
+        runs = parent.runs(readonly=True)
+        all_ids = set(run.id for run in runs)
+        all_ids_in_splits = set()
+        for ids in self.split_contents.values():
+            all_ids_in_splits.update(ids)
+        missing = all_ids_in_splits - all_ids
+        return len(missing)
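`DatasetSplit.from_task` filters a task's runs, shuffles them, and freezes the surviving run IDs into named splits. A sketch of the intended call, assuming `task` is an existing Kiln `Task` with saved runs (the split name and filter choice are illustrative):

```python
from kiln_ai.datamodel.dataset_split import (
    DatasetSplit,
    Train80Test20SplitDefinition,
)

# `task` is assumed to be a kiln_ai Task loaded elsewhere, with rated runs.
split = DatasetSplit.from_task(
    name="finetune_v1",
    task=task,
    splits=Train80Test20SplitDefinition,
    filter_id="high_rating",  # only keep highly rated (or repaired) runs
)

# Keys come from the split definitions; values are the frozen task run IDs.
print(sorted(split.split_contents.keys()))  # ['test', 'train']
```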
kiln_ai/datamodel/eval.py
@@ -0,0 +1,298 @@
+import json
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, List, Union
+
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Self
+
+from kiln_ai.datamodel.basemodel import (
+    ID_TYPE,
+    NAME_FIELD,
+    KilnParentedModel,
+    KilnParentModel,
+)
+from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
+from kiln_ai.datamodel.dataset_filters import DatasetFilterId
+from kiln_ai.datamodel.json_schema import string_to_json_key
+from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+if TYPE_CHECKING:
+    from kiln_ai.datamodel.task import Task
+
+EvalScores = Dict[str, float]
+
+
+class EvalTemplateId(str, Enum):
+    """
+    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
+    """
+
+    kiln_requirements = "kiln_requirements"
+    toxicity = "toxicity"
+    bias = "bias"
+    maliciousness = "maliciousness"
+    factual_correctness = "factual_correctness"
+    jailbreak = "jailbreak"
+
+
+class EvalConfigType(str, Enum):
+    g_eval = "g_eval"
+    llm_as_judge = "llm_as_judge"
+
+
+class EvalOutputScore(BaseModel):
+    """
+    A definition of a score that an evaluator will produce.
+
+    Very similar to TaskRequirement, but conceptually different keeping in a separate models.
+    """
+
+    name: str = Field(
+        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
+    )
+    instruction: str | None = Field(
+        default=None,
+        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
+    )
+    type: TaskOutputRatingType = Field(
+        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
+    )
+
+    def json_key(self) -> str:
+        """
+        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.
+
+        For example, "Overall Rating" -> "overall_rating"
+        """
+        return string_to_json_key(self.name)
+
+    @model_validator(mode="after")
+    def validate_type(self) -> Self:
+        if self.type == TaskOutputRatingType.custom:
+            raise ValueError(
+                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
+            )
+        return self
+
+
+class EvalRun(KilnParentedModel):
+    """
+    The results of running an eval on a single dataset item.
+
+    This is a child of an EvalConfig, which specifies how the scores were generated.
+
+    Eval runs can be one of 2 types:
+    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
+    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
+    """
+
+    dataset_id: ID_TYPE = Field(
+        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
+    )
+    task_run_config_id: ID_TYPE | None = Field(
+        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
+    )
+    eval_config_eval: bool = Field(
+        description="Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
+        default=False,
+    )
+    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
+    input: str = Field(
+        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
+    )
+    output: str = Field(
+        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
+    )
+    intermediate_outputs: Dict[str, str] | None = Field(
+        default=None,
+        description="The intermediate outputs of the task (example, eval thinking).",
+    )
+    scores: EvalScores = Field(
+        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
+    )
+
+    def parent_eval_config(self) -> Union["EvalConfig", None]:
+        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
+            raise ValueError("parent must be an EvalConfig")
+        return self.parent  # type: ignore
+
+    @model_validator(mode="after")
+    def validate_eval_run_types(self) -> Self:
+        if self.eval_config_eval and self.task_run_config_id is not None:
+            raise ValueError(
+                "task_run_config_id must be None if eval_config_eval is true"
+            )
+        if not self.eval_config_eval and self.task_run_config_id is None:
+            raise ValueError(
+                "task_run_config_id must be set if eval_config_eval is false"
+            )
+        return self
+
+    @model_validator(mode="after")
+    def validate_scores(self) -> Self:
+        # We're checking the scores have the expected keys from the grand-parent eval
+        if self.scores is None or len(self.scores) == 0:
+            raise ValueError("scores are required, and must have at least one score.")
+
+        parent_eval_config = self.parent_eval_config()
+        eval = parent_eval_config.parent_eval() if parent_eval_config else None
+        if not eval:
+            # Can't validate without the grand-parent eval, allow it to be validated later
+            return self
+
+        output_score_keys = [score.json_key() for score in eval.output_scores]
+        if set(output_score_keys) != set(self.scores.keys()):
+            raise ValueError(
+                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
+            )
+
+        # Check that each score is expected in this eval and the correct type
+        for output_score in eval.output_scores:
+            match output_score.type:
+                case TaskOutputRatingType.five_star:
+                    five_star_score = self.scores[output_score.json_key()]
+                    if (
+                        not isinstance(five_star_score, float)
+                        or five_star_score < 1.0
+                        or five_star_score > 5.0
+                    ):
+                        raise ValueError(
+                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
+                        )
+                case TaskOutputRatingType.pass_fail:
+                    pass_fail_score = self.scores[output_score.json_key()]
+                    if (
+                        not isinstance(pass_fail_score, float)
+                        or pass_fail_score < 0.0
+                        or pass_fail_score > 1.0
+                    ):
+                        raise ValueError(
+                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
+                        )
+                case TaskOutputRatingType.pass_fail_critical:
+                    pass_fail_critical_score = self.scores[output_score.json_key()]
+                    if (
+                        not isinstance(pass_fail_critical_score, float)
+                        or pass_fail_critical_score < -1.0
+                        or pass_fail_critical_score > 1.0
+                    ):
+                        raise ValueError(
+                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
+                        )
+                case TaskOutputRatingType.custom:
+                    raise ValueError(
+                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
+                    )
+                case _:
+                    # Catch missing cases
+                    raise_exhaustive_enum_error(output_score.type)
+        return self
+
+
+class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
+    """
+    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
+
+    A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
+    """
+
+    name: str = NAME_FIELD
+    model_name: str = Field(
+        description="The name of the model to use for this eval config. ",
+    )
+    model_provider: str = Field(
+        description="The provider of the model to use for this eval config.",
+    )
+    config_type: EvalConfigType = Field(
+        default=EvalConfigType.g_eval,
+        description="This is used to determine the type of eval to run.",
+    )
+    properties: dict[str, Any] = Field(
+        default={},
+        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
+    )
+
+    def parent_eval(self) -> Union["Eval", None]:
+        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
+            raise ValueError("parent must be an Eval")
+        return self.parent  # type: ignore
+
+    def runs(self, readonly: bool = False) -> list[EvalRun]:
+        return super().runs(readonly=readonly)  # type: ignore
+
+    @model_validator(mode="after")
+    def validate_properties(self) -> Self:
+        if (
+            self.config_type == EvalConfigType.g_eval
+            or self.config_type == EvalConfigType.llm_as_judge
+        ):
+            if "eval_steps" not in self.properties or not isinstance(
+                self.properties["eval_steps"], list
+            ):
+                raise ValueError("eval_steps is required and must be a list for g_eval")
+            if "task_description" in self.properties and not isinstance(
+                self.properties["task_description"], str
+            ):
+                raise ValueError(
+                    "task_description is optional, but if provided must be a string"
+                )
+            return self
+        else:
+            raise ValueError(f"Invalid eval config type: {self.config_type}")
+
+    @model_validator(mode="after")
+    def validate_json_serializable(self) -> "EvalConfig":
+        try:
+            # This will raise a TypeError if the dict contains non-JSON-serializable objects
+            json.dumps(self.properties)
+        except TypeError as e:
+            raise ValueError(f"Properties must be JSON serializable: {str(e)}")
+        return self
+
+
+class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
+    name: str = NAME_FIELD
+    description: str | None = Field(
+        default=None, description="The description of the eval"
+    )
+    template: EvalTemplateId | None = Field(
+        default=None,
+        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
+    )
+    current_config_id: ID_TYPE = Field(
+        default=None,
+        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
+    )
+    eval_set_filter_id: DatasetFilterId = Field(
+        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
+    )
+    eval_configs_filter_id: DatasetFilterId = Field(
+        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id."
+    )
+    output_scores: List[EvalOutputScore] = Field(
+        description="The scores this evaluator should produce."
+    )
+
+    # Workaround to return typed parent without importing Task
+    def parent_task(self) -> Union["Task", None]:
+        if self.parent is not None and self.parent.__class__.__name__ != "Task":
+            raise ValueError("parent must be a Task")
+        return self.parent  # type: ignore
+
+    def configs(self, readonly: bool = False) -> list[EvalConfig]:
+        return super().configs(readonly=readonly)  # type: ignore
+
+    @model_validator(mode="after")
+    def validate_scores(self) -> Self:
+        if self.output_scores is None or len(self.output_scores) == 0:
+            raise ValueError(
+                "output_scores are required, and must have at least one score."
+            )
+
+        # check for duplicate names (once transformed to JSON keys)
+        output_score_keys = [score.json_key() for score in self.output_scores]
+        if len(output_score_keys) != len(set(output_score_keys)):
+            raise ValueError(
+                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
+            )
+        return self
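Tying the new eval datamodel together: an `Eval` hangs off a `Task`, declares the scores it expects, and holds `EvalConfig` children describing how to judge; `EvalRun` results are then validated against those scores. A hedged sketch, assuming `task` is an existing Kiln `Task`; the names, tags, and model identifiers are placeholders:

```python
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore

evaluator = Eval(
    parent=task,  # assumed existing kiln_ai Task
    name="tone_eval",
    eval_set_filter_id="tag::eval_set",  # items to run the eval on
    eval_configs_filter_id="tag::golden",  # rated items for comparing configs
    output_scores=[EvalOutputScore(name="Overall Rating", type="five_star")],
)

config = EvalConfig(
    parent=evaluator,
    name="g_eval_gpt_4o",
    model_name="gpt_4o",  # placeholder model/provider identifiers
    model_provider="openai",
    properties={"eval_steps": ["Assess tone", "Check factual accuracy"]},
)

# An EvalRun saved under this config must report exactly the eval's score keys,
# e.g. {"overall_rating": 4.0}, since "Overall Rating" -> json_key() "overall_rating".
```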