kiln-ai 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of kiln-ai might be problematic.

Files changed (63)
  1. kiln_ai/adapters/adapter_registry.py +12 -13
  2. kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
  3. kiln_ai/adapters/eval/base_eval.py +164 -0
  4. kiln_ai/adapters/eval/eval_runner.py +267 -0
  5. kiln_ai/adapters/eval/g_eval.py +367 -0
  6. kiln_ai/adapters/eval/registry.py +16 -0
  7. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  8. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  9. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  10. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  11. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
  12. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
  13. kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
  14. kiln_ai/adapters/ml_model_list.py +141 -29
  15. kiln_ai/adapters/model_adapters/base_adapter.py +50 -35
  16. kiln_ai/adapters/model_adapters/langchain_adapters.py +27 -20
  17. kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -1
  18. kiln_ai/adapters/model_adapters/openai_model_adapter.py +93 -50
  19. kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
  20. kiln_ai/adapters/model_adapters/test_langchain_adapter.py +7 -14
  21. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +55 -64
  22. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
  23. kiln_ai/adapters/model_adapters/test_structured_output.py +36 -30
  24. kiln_ai/adapters/ollama_tools.py +0 -1
  25. kiln_ai/adapters/prompt_builders.py +80 -42
  26. kiln_ai/adapters/repair/repair_task.py +9 -21
  27. kiln_ai/adapters/repair/test_repair_task.py +3 -3
  28. kiln_ai/adapters/run_output.py +3 -0
  29. kiln_ai/adapters/test_adapter_registry.py +10 -10
  30. kiln_ai/adapters/test_generate_docs.py +6 -6
  31. kiln_ai/adapters/test_ollama_tools.py +0 -1
  32. kiln_ai/adapters/test_prompt_adaptors.py +17 -14
  33. kiln_ai/adapters/test_prompt_builders.py +91 -31
  34. kiln_ai/datamodel/__init__.py +50 -952
  35. kiln_ai/datamodel/datamodel_enums.py +58 -0
  36. kiln_ai/datamodel/dataset_filters.py +114 -0
  37. kiln_ai/datamodel/dataset_split.py +170 -0
  38. kiln_ai/datamodel/eval.py +298 -0
  39. kiln_ai/datamodel/finetune.py +105 -0
  40. kiln_ai/datamodel/json_schema.py +6 -0
  41. kiln_ai/datamodel/project.py +23 -0
  42. kiln_ai/datamodel/prompt.py +37 -0
  43. kiln_ai/datamodel/prompt_id.py +83 -0
  44. kiln_ai/datamodel/strict_mode.py +24 -0
  45. kiln_ai/datamodel/task.py +181 -0
  46. kiln_ai/datamodel/task_output.py +321 -0
  47. kiln_ai/datamodel/task_run.py +164 -0
  48. kiln_ai/datamodel/test_basemodel.py +10 -11
  49. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  50. kiln_ai/datamodel/test_dataset_split.py +32 -8
  51. kiln_ai/datamodel/test_datasource.py +3 -2
  52. kiln_ai/datamodel/test_eval_model.py +635 -0
  53. kiln_ai/datamodel/test_example_models.py +9 -13
  54. kiln_ai/datamodel/test_json_schema.py +23 -0
  55. kiln_ai/datamodel/test_models.py +2 -2
  56. kiln_ai/datamodel/test_prompt_id.py +129 -0
  57. kiln_ai/datamodel/test_task.py +159 -0
  58. kiln_ai/utils/config.py +6 -1
  59. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +37 -1
  60. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  61. kiln_ai-0.11.1.dist-info/RECORD +0 -76
  62. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  63. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/datamodel_enums.py
@@ -0,0 +1,58 @@
+from enum import Enum, IntEnum
+
+
+class Priority(IntEnum):
+    """Defines priority levels for tasks and requirements, where P0 is highest priority."""
+
+    p0 = 0
+    p1 = 1
+    p2 = 2
+    p3 = 3
+
+
+# Only one rating type for now, but this allows for extensibility if we want to add more in the future
+class TaskOutputRatingType(str, Enum):
+    """Defines the types of rating systems available for task outputs."""
+
+    five_star = "five_star"
+    pass_fail = "pass_fail"
+    pass_fail_critical = "pass_fail_critical"
+    custom = "custom"
+
+
+class StructuredOutputMode(str, Enum):
+    """
+    Enumeration of supported structured output modes.
+
+    - default: let the adapter decide
+    - json_schema: request json using API capabilities for json_schema
+    - function_calling: request json using API capabilities for function calling
+    - json_mode: request json using API's JSON mode, which should return valid JSON, but isn't checking/passing the schema
+    - json_instructions: append instructions to the prompt to request json matching the schema. No API capabilities are used. You should have a custom parser on these models as they will be returning strings.
+    - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries).
+    """
+
+    default = "default"
+    json_schema = "json_schema"
+    function_calling_weak = "function_calling_weak"
+    function_calling = "function_calling"
+    json_mode = "json_mode"
+    json_instructions = "json_instructions"
+    json_instruction_and_object = "json_instruction_and_object"
+
+
+class FineTuneStatusType(str, Enum):
+    """
+    The status type of a fine-tune (running, completed, failed, etc).
+    """
+
+    unknown = "unknown"  # server error
+    pending = "pending"
+    running = "running"
+    completed = "completed"
+    failed = "failed"
+
+
+class FinetuneDataStrategy(str, Enum):
+    final_only = "final_only"
+    final_and_intermediate = "final_and_intermediate"
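
The enums above are plain str/IntEnum values, so they round-trip through their serialized values and compare naturally. A minimal sketch (not part of the package; the import path is assumed from the new file location):

from kiln_ai.datamodel.datamodel_enums import Priority, StructuredOutputMode

# IntEnum ordering: P0 (0) sorts before P1 (1), matching "P0 is highest priority"
assert Priority.p0 < Priority.p1

# str enums can be reconstructed from their serialized string form
mode = StructuredOutputMode("json_schema")
assert mode == StructuredOutputMode.json_schema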
kiln_ai/datamodel/dataset_filters.py
@@ -0,0 +1,114 @@
+from enum import Enum
+from typing import Annotated, Protocol
+
+from pydantic import AfterValidator
+
+from kiln_ai.datamodel.task_run import TaskRun
+
+
+class DatasetFilter(Protocol):
+    """A protocol defining the interface for dataset filters.
+
+    This allows both stateless function-based filters and stateful class-based filters
+    to be used interchangeably, as long as they implement the __call__ method.
+    """
+
+    def __call__(self, task_run: TaskRun) -> bool:
+        """Return True if the task run should be included in the dataset."""
+        ...
+
+
+def AllDatasetFilter(_: TaskRun) -> bool:
+    return True
+
+
+def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
+    if task_run.output is None:
+        return False
+    if task_run.repaired_output is not None:
+        # Repairs always considered high quality
+        return True
+    if task_run.output.rating is None:
+        return False
+    return task_run.output.rating.is_high_quality()
+
+
+def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool:
+    """
+    A filter that returns True if the task has intermediate outputs we can train a 'thinking' model on (reasoning or chain of thought)
+    """
+    return task_run.has_thinking_training_data()
+
+
+def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool:
+    """
+    A filter that returns True if the task has thinking data and the output is high quality
+    """
+    return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run)
+
+
+class TagFilter:
+    """
+    A filter that returns True if the task has a tag matching the given tag.
+    """
+
+    def __init__(self, tag: str):
+        self.tag = tag
+
+    def __call__(self, task_run: TaskRun) -> bool:
+        return self.tag in task_run.tags
+
+
+class StaticDatasetFilters(str, Enum):
+    """Dataset filter names."""
+
+    ALL = "all"
+    HIGH_RATING = "high_rating"
+    THINKING_MODEL = "thinking_model"
+    THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated"
+
+
+static_dataset_filters = {
+    StaticDatasetFilters.ALL: AllDatasetFilter,
+    StaticDatasetFilters.HIGH_RATING: HighRatingDatasetFilter,
+    StaticDatasetFilters.THINKING_MODEL: ThinkingModelDatasetFilter,
+    StaticDatasetFilters.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter,
+}
+
+DatasetFilterId = Annotated[
+    str,
+    AfterValidator(lambda v: _check_dataset_filter_id(v)),
+]
+"""
+A pydantic type that validates strings containing a valid dataset filter ID.
+
+Dataset filter IDs can be one of:
+- A built-in dataset filter name
+- A tag::<tag> filter, where <tag> is a string
+"""
+
+
+def _check_dataset_filter_id(id: str) -> str:
+    """
+    Check that the dataset filter ID is valid.
+    """
+    if id in static_dataset_filters:
+        return id
+
+    if id.startswith("tag::") and len(id) > 5:
+        return id
+
+    raise ValueError(f"Invalid dataset filter ID: {id}")
+
+
+def dataset_filter_from_id(id: DatasetFilterId) -> DatasetFilter:
+    """
+    Get a dataset filter from an ID.
+    """
+    if id.startswith("tag::") and len(id) > 5:
+        return TagFilter(id[5:])
+
+    if id in static_dataset_filters:
+        return static_dataset_filters[id]
+
+    raise ValueError(f"Invalid dataset filter ID: {id}")
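
As a usage sketch (illustrative only; the "golden" tag is a made-up example), dataset_filter_from_id resolves both built-in names and tag:: IDs to callables that take a TaskRun and return a bool:

from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id

high_rated = dataset_filter_from_id("high_rating")  # built-in HighRatingDatasetFilter
golden = dataset_filter_from_id("tag::golden")      # TagFilter("golden")

# Both satisfy the DatasetFilter protocol, e.g. keep only tagged runs:
# kept_runs = [run for run in task.runs() if golden(run)]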
kiln_ai/datamodel/dataset_split.py
@@ -0,0 +1,170 @@
+"""
+Tools for splitting datasets into train/test/validation splits. Includes filters for selecting which task runs to include in each split.
+"""
+
+import math
+import random
+from typing import TYPE_CHECKING
+
+from pydantic import BaseModel, Field, model_validator
+
+from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel
+from kiln_ai.datamodel.dataset_filters import (
+    DatasetFilter,
+    DatasetFilterId,
+    dataset_filter_from_id,
+)
+
+if TYPE_CHECKING:
+    from kiln_ai.datamodel.task import Task
+
+
+class DatasetSplitDefinition(BaseModel):
+    """
+    A definition of a split in a dataset.
+
+    Example: name="train", description="The training set", percentage=0.8 (80% of the dataset)
+    """
+
+    name: str = NAME_FIELD
+    description: str | None = Field(
+        default=None,
+        description="A description of the dataset for you and your team. Not used in training.",
+    )
+    percentage: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="The percentage of the dataset that this split represents (between 0 and 1).",
+    )
+
+
+AllSplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="all", percentage=1.0)
+]
+Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.8),
+    DatasetSplitDefinition(name="test", percentage=0.2),
+]
+Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.6),
+    DatasetSplitDefinition(name="test", percentage=0.2),
+    DatasetSplitDefinition(name="val", percentage=0.2),
+]
+Train80Test10Val10SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.8),
+    DatasetSplitDefinition(name="test", percentage=0.1),
+    DatasetSplitDefinition(name="val", percentage=0.1),
+]
+
+
+class DatasetSplit(KilnParentedModel):
+    """
+    A collection of task runs, with optional splits (train, test, validation).
+
+    Used to freeze a dataset into train/test/validation splits for repeatable fine-tuning or other tasks.
+
+    Maintains a list of IDs for each split, to avoid data duplication.
+    """
+
+    name: str = NAME_FIELD
+    description: str | None = Field(
+        default=None,
+        description="A description of the dataset for you and your team. Not used in training.",
+    )
+    splits: list[DatasetSplitDefinition] = Field(
+        default_factory=list,
+        description="The splits in the dataset.",
+    )
+    split_contents: dict[str, list[str]] = Field(
+        description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
+    )
+    filter: DatasetFilterId | None = Field(
+        default=None,
+        description="The filter used to build the dataset.",
+    )
+
+    @model_validator(mode="after")
+    def validate_split_percentages(self) -> "DatasetSplit":
+        total = sum(split.percentage for split in self.splits)
+        if not math.isclose(total, 1.0, rel_tol=1e-9):
+            raise ValueError(f"The sum of split percentages must be 1.0 (got {total})")
+        return self
+
+    @classmethod
+    def from_task(
+        cls,
+        name: str,
+        task: "Task",
+        splits: list[DatasetSplitDefinition],
+        filter_id: DatasetFilterId = "all",
+        description: str | None = None,
+    ):
+        """
+        Build a dataset split from a task.
+        """
+        filter = dataset_filter_from_id(filter_id)
+        split_contents = cls.build_split_contents(task, splits, filter)
+        return cls(
+            parent=task,
+            name=name,
+            description=description,
+            splits=splits,
+            split_contents=split_contents,
+            filter=filter_id,
+        )
+
+    @classmethod
+    def build_split_contents(
+        cls,
+        task: "Task",
+        splits: list[DatasetSplitDefinition],
+        filter: DatasetFilter,
+    ) -> dict[str, list[str]]:
+        valid_ids = []
+        for task_run in task.runs():
+            if filter(task_run):
+                valid_ids.append(task_run.id)
+
+        # Shuffle and split by split percentage
+        random.shuffle(valid_ids)
+        split_contents = {}
+        start_idx = 0
+        remaining_items = len(valid_ids)
+
+        # Handle all splits except the last one
+        for split in splits[:-1]:
+            split_size = round(len(valid_ids) * split.percentage)
+            split_contents[split.name] = valid_ids[start_idx : start_idx + split_size]
+            start_idx += split_size
+            remaining_items -= split_size
+
+        # Last split gets all remaining items (for rounding)
+        if splits:
+            split_contents[splits[-1].name] = valid_ids[start_idx:]
+
+        return split_contents
+
+    def parent_task(self) -> "Task | None":
+        # inline import to avoid circular import
+        from kiln_ai.datamodel import Task
+
+        if not isinstance(self.parent, Task):
+            return None
+        return self.parent
+
+    def missing_count(self) -> int:
+        """
+        Returns:
+            int: the number of task runs that have an ID persisted in this dataset split, but no longer exist in the dataset
+        """
+        parent = self.parent_task()
+        if parent is None:
+            raise ValueError("DatasetSplit has no parent task")
+
+        runs = parent.runs(readonly=True)
+        all_ids = set(run.id for run in runs)
+        all_ids_in_splits = set()
+        for ids in self.split_contents.values():
+            all_ids_in_splits.update(ids)
+        missing = all_ids_in_splits - all_ids
+        return len(missing)
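
A rough sketch of freezing a split from an existing task, assuming task is a saved kiln_ai Task with runs; the split name is illustrative and persistence is omitted:

from kiln_ai.datamodel.dataset_split import DatasetSplit, Train80Test20SplitDefinition

split = DatasetSplit.from_task(
    name="finetune_v1",                   # illustrative name
    task=task,
    splits=Train80Test20SplitDefinition,  # 80% train / 20% test
    filter_id="high_rating",              # only include high-quality runs
)
train_ids = split.split_contents["train"]  # frozen task run IDs for the train split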
kiln_ai/datamodel/eval.py
@@ -0,0 +1,298 @@
+import json
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, List, Union
+
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Self
+
+from kiln_ai.datamodel.basemodel import (
+    ID_TYPE,
+    NAME_FIELD,
+    KilnParentedModel,
+    KilnParentModel,
+)
+from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
+from kiln_ai.datamodel.dataset_filters import DatasetFilterId
+from kiln_ai.datamodel.json_schema import string_to_json_key
+from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
+
+if TYPE_CHECKING:
+    from kiln_ai.datamodel.task import Task
+
+EvalScores = Dict[str, float]
+
+
+class EvalTemplateId(str, Enum):
+    """
+    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
+    """
+
+    kiln_requirements = "kiln_requirements"
+    toxicity = "toxicity"
+    bias = "bias"
+    maliciousness = "maliciousness"
+    factual_correctness = "factual_correctness"
+    jailbreak = "jailbreak"
+
+
+class EvalConfigType(str, Enum):
+    g_eval = "g_eval"
+    llm_as_judge = "llm_as_judge"
+
+
+class EvalOutputScore(BaseModel):
+    """
+    A definition of a score that an evaluator will produce.
+
+    Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
+    """
+
+    name: str = Field(
+        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
+    )
+    instruction: str | None = Field(
+        default=None,
+        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
+    )
+    type: TaskOutputRatingType = Field(
+        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
+    )
+
+    def json_key(self) -> str:
+        """
+        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.
+
+        For example, "Overall Rating" -> "overall_rating"
+        """
+        return string_to_json_key(self.name)
+
+    @model_validator(mode="after")
+    def validate_type(self) -> Self:
+        if self.type == TaskOutputRatingType.custom:
+            raise ValueError(
+                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
+            )
+        return self
+
+
+class EvalRun(KilnParentedModel):
+    """
+    The results of running an eval on a single dataset item.
+
+    This is a child of an EvalConfig, which specifies how the scores were generated.
+
+    Eval runs can be one of 2 types:
+    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
+    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
+    """
+
+    dataset_id: ID_TYPE = Field(
+        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
+    )
+    task_run_config_id: ID_TYPE | None = Field(
+        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
+    )
+    eval_config_eval: bool = Field(
+        description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
+        default=False,
+    )
+    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
+    input: str = Field(
+        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
+    )
+    output: str = Field(
+        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
+    )
+    intermediate_outputs: Dict[str, str] | None = Field(
+        default=None,
+        description="The intermediate outputs of the task (example, eval thinking).",
+    )
+    scores: EvalScores = Field(
+        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
+    )
+
+    def parent_eval_config(self) -> Union["EvalConfig", None]:
+        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
+            raise ValueError("parent must be an EvalConfig")
+        return self.parent  # type: ignore
+
+    @model_validator(mode="after")
+    def validate_eval_run_types(self) -> Self:
+        if self.eval_config_eval and self.task_run_config_id is not None:
+            raise ValueError(
+                "task_run_config_id must be None if eval_config_eval is true"
+            )
+        if not self.eval_config_eval and self.task_run_config_id is None:
+            raise ValueError(
+                "task_run_config_id must be set if eval_config_eval is false"
+            )
+        return self
+
+    @model_validator(mode="after")
+    def validate_scores(self) -> Self:
+        # We're checking the scores have the expected keys from the grand-parent eval
+        if self.scores is None or len(self.scores) == 0:
+            raise ValueError("scores are required, and must have at least one score.")
+
+        parent_eval_config = self.parent_eval_config()
+        eval = parent_eval_config.parent_eval() if parent_eval_config else None
+        if not eval:
+            # Can't validate without the grand-parent eval, allow it to be validated later
+            return self
+
+        output_score_keys = [score.json_key() for score in eval.output_scores]
+        if set(output_score_keys) != set(self.scores.keys()):
+            raise ValueError(
+                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
+            )
+
+        # Check that each score is expected in this eval and the correct type
+        for output_score in eval.output_scores:
+            match output_score.type:
+                case TaskOutputRatingType.five_star:
+                    five_star_score = self.scores[output_score.json_key()]
+                    if (
+                        not isinstance(five_star_score, float)
+                        or five_star_score < 1.0
+                        or five_star_score > 5.0
+                    ):
+                        raise ValueError(
+                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
+                        )
+                case TaskOutputRatingType.pass_fail:
+                    pass_fail_score = self.scores[output_score.json_key()]
+                    if (
+                        not isinstance(pass_fail_score, float)
+                        or pass_fail_score < 0.0
+                        or pass_fail_score > 1.0
+                    ):
+                        raise ValueError(
+                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
+                        )
+                case TaskOutputRatingType.pass_fail_critical:
+                    pass_fail_critical_score = self.scores[output_score.json_key()]
+                    if (
+                        not isinstance(pass_fail_critical_score, float)
+                        or pass_fail_critical_score < -1.0
+                        or pass_fail_critical_score > 1.0
+                    ):
+                        raise ValueError(
+                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
+                        )
+                case TaskOutputRatingType.custom:
+                    raise ValueError(
+                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
+                    )
+                case _:
+                    # Catch missing cases
+                    raise_exhaustive_enum_error(output_score.type)
+        return self
+
+
+class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
+    """
+    A configuration for running an eval. This includes anything needed to run the eval on a dataset, like the prompt, model, thresholds, etc.
+
+    An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
+    """
+
+    name: str = NAME_FIELD
+    model_name: str = Field(
+        description="The name of the model to use for this eval config.",
+    )
+    model_provider: str = Field(
+        description="The provider of the model to use for this eval config.",
+    )
+    config_type: EvalConfigType = Field(
+        default=EvalConfigType.g_eval,
+        description="This is used to determine the type of eval to run.",
+    )
+    properties: dict[str, Any] = Field(
+        default={},
+        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
+    )
+
+    def parent_eval(self) -> Union["Eval", None]:
+        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
+            raise ValueError("parent must be an Eval")
+        return self.parent  # type: ignore
+
+    def runs(self, readonly: bool = False) -> list[EvalRun]:
+        return super().runs(readonly=readonly)  # type: ignore
+
+    @model_validator(mode="after")
+    def validate_properties(self) -> Self:
+        if (
+            self.config_type == EvalConfigType.g_eval
+            or self.config_type == EvalConfigType.llm_as_judge
+        ):
+            if "eval_steps" not in self.properties or not isinstance(
+                self.properties["eval_steps"], list
+            ):
+                raise ValueError("eval_steps is required and must be a list for g_eval")
+            if "task_description" in self.properties and not isinstance(
+                self.properties["task_description"], str
+            ):
+                raise ValueError(
+                    "task_description is optional, but if provided must be a string"
+                )
+            return self
+        else:
+            raise ValueError(f"Invalid eval config type: {self.config_type}")
+
+    @model_validator(mode="after")
+    def validate_json_serializable(self) -> "EvalConfig":
+        try:
+            # This will raise a TypeError if the dict contains non-JSON-serializable objects
+            json.dumps(self.properties)
+        except TypeError as e:
+            raise ValueError(f"Properties must be JSON serializable: {str(e)}")
+        return self
+
+
+class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
+    name: str = NAME_FIELD
+    description: str | None = Field(
+        default=None, description="The description of the eval"
+    )
+    template: EvalTemplateId | None = Field(
+        default=None,
+        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
+    )
+    current_config_id: ID_TYPE = Field(
+        default=None,
+        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
+    )
+    eval_set_filter_id: DatasetFilterId = Field(
+        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
+    )
+    eval_configs_filter_id: DatasetFilterId = Field(
+        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id."
+    )
+    output_scores: List[EvalOutputScore] = Field(
+        description="The scores this evaluator should produce."
+    )
+
+    # Workaround to return typed parent without importing Task
+    def parent_task(self) -> Union["Task", None]:
+        if self.parent is not None and self.parent.__class__.__name__ != "Task":
+            raise ValueError("parent must be a Task")
+        return self.parent  # type: ignore
+
+    def configs(self, readonly: bool = False) -> list[EvalConfig]:
+        return super().configs(readonly=readonly)  # type: ignore
+
+    @model_validator(mode="after")
+    def validate_scores(self) -> Self:
+        if self.output_scores is None or len(self.output_scores) == 0:
+            raise ValueError(
+                "output_scores are required, and must have at least one score."
+            )
+
+        # check for duplicate names (once transformed to JSON keys)
+        output_score_keys = [score.json_key() for score in self.output_scores]
+        if len(output_score_keys) != len(set(output_score_keys)):
+            raise ValueError(
+                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
+            )
+        return self
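
To tie the new eval datamodel together, a hedged sketch of creating an Eval plus one EvalConfig under an existing Task; the names, tags, model/provider values, and eval_steps below are illustrative, not from the package:

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore

evaluator = Eval(
    parent=task,                           # an existing kiln_ai Task
    name="output_quality",
    eval_set_filter_id="tag::eval_set",    # dataset items to run the eval on
    eval_configs_filter_id="tag::golden",  # rated items for comparing eval configs
    output_scores=[
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
    ],
)

config = EvalConfig(
    parent=evaluator,
    name="g_eval_config",
    model_name="gpt_4o",                   # illustrative model and provider
    model_provider="openai",
    properties={"eval_steps": ["Does the output fully answer the input?"]},  # required list for g_eval
)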