kiln-ai 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
Potentially problematic release: this version of kiln-ai might be problematic.
- kiln_ai/adapters/adapter_registry.py +12 -13
- kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
- kiln_ai/adapters/ml_model_list.py +141 -29
- kiln_ai/adapters/model_adapters/base_adapter.py +50 -35
- kiln_ai/adapters/model_adapters/langchain_adapters.py +27 -20
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -1
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +93 -50
- kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
- kiln_ai/adapters/model_adapters/test_langchain_adapter.py +7 -14
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +55 -64
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
- kiln_ai/adapters/model_adapters/test_structured_output.py +36 -30
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/prompt_builders.py +80 -42
- kiln_ai/adapters/repair/repair_task.py +9 -21
- kiln_ai/adapters/repair/test_repair_task.py +3 -3
- kiln_ai/adapters/run_output.py +3 -0
- kiln_ai/adapters/test_adapter_registry.py +10 -10
- kiln_ai/adapters/test_generate_docs.py +6 -6
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +17 -14
- kiln_ai/adapters/test_prompt_builders.py +91 -31
- kiln_ai/datamodel/__init__.py +50 -952
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +6 -0
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +10 -11
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +32 -8
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +9 -13
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_models.py +2 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +37 -1
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai-0.11.1.dist-info/RECORD +0 -76
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
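The headline change in 0.12.0 is the new evals system: a kiln_ai.adapters.eval package (base_eval, eval_runner, g_eval, registry) plus a kiln_ai/datamodel/eval.py model, alongside the split of the previously monolithic kiln_ai/datamodel/__init__.py into focused modules (task, task_output, task_run, prompt, prompt_id, dataset_split, dataset_filters, finetune, project, strict_mode). The sketch below shows how the new eval datamodel fits together. It is a minimal sketch derived from the kiln_ai/datamodel/test_eval_model.py file added in this release (reproduced in full further down); the constructor arguments mirror that test file, while the specific names and values are illustrative only and defaults may differ.

# Minimal sketch of the 0.12.0 eval datamodel, based on the new
# test_eval_model.py included in this diff. Names and values are illustrative.
from kiln_ai.datamodel.eval import (
    Eval,
    EvalConfig,
    EvalConfigType,
    EvalOutputScore,
    EvalRun,
)
from kiln_ai.datamodel.task import Task
from kiln_ai.datamodel.task_output import TaskOutputRatingType

task = Task(name="Example Task", instruction="Answer the question.")

# An Eval hangs off a Task and declares the scores an evaluator must emit.
quality_eval = Eval(
    name="Answer quality",
    parent=task,
    eval_set_filter_id="tag::tag1",
    eval_configs_filter_id="tag::tag2",
    output_scores=[
        EvalOutputScore(name="accuracy", type=TaskOutputRatingType.five_star),
    ],
)

# An EvalConfig selects the judging method (G-Eval here) and the judge model.
config = EvalConfig(
    parent=quality_eval,
    name="G-Eval config",
    config_type=EvalConfigType.g_eval,
    properties={"eval_steps": ["Check that the answer is factually correct."]},
    model_name="gpt-4",
    model_provider="openai",
)

# Each EvalRun records one scored sample; its score keys must match the
# Eval's output_scores, and five_star scores must fall between 1.0 and 5.0.
run = EvalRun(
    parent=config,
    dataset_id="dataset123",
    task_run_config_id="config456",
    input="What is the capital of France?",
    output="The capital of France is Paris.",
    scores={"accuracy": 4.5},
)

The persistence pattern (save_to_file on each level, then task.evals(), eval.configs(), and config.runs() to read back) is exercised by test_eval_with_persisted_children in the test file below.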
--- a/kiln_ai/datamodel/test_datasource.py
+++ b/kiln_ai/datamodel/test_datasource.py
@@ -18,14 +18,14 @@ def test_valid_synthetic_data_source():
         properties={
             "model_name": "GPT-4",
             "model_provider": "OpenAI",
-            "prompt_builder_name": "simple_prompt_builder",
+            "prompt_id": "simple_prompt_builder",
             "adapter_name": "langchain",
         },
     )
     assert data_source.type == DataSourceType.synthetic
     assert data_source.properties["model_name"] == "GPT-4"
     assert data_source.properties["model_provider"] == "OpenAI"
-    assert data_source.properties["prompt_builder_name"] == "simple_prompt_builder"
+    assert data_source.properties["prompt_id"] == "simple_prompt_builder"
     assert data_source.properties["adapter_name"] == "langchain"

@@ -85,6 +85,7 @@ def test_prompt_type_optional_for_synthetic():
         },
     )
     assert "prompt_builder_name" not in data_source.properties
+    assert "prompt_id" not in data_source.properties


 def test_private_data_source_properties_not_serialized():
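The two hunks above rename the synthetic data source property prompt_builder_name to prompt_id, in line with the new kiln_ai/datamodel/prompt_id.py module listed in the manifest. A hedged sketch of the renamed property follows; the DataSource/DataSourceType import path is an assumption, while the property keys and values are taken directly from the updated test.

# Sketch of the renamed property; import path assumed, keys/values from the diff.
from kiln_ai.datamodel import DataSource, DataSourceType

data_source = DataSource(
    type=DataSourceType.synthetic,
    properties={
        "model_name": "GPT-4",
        "model_provider": "OpenAI",
        "prompt_id": "simple_prompt_builder",  # was "prompt_builder_name" in 0.11.x
        "adapter_name": "langchain",
    },
)
assert data_source.properties["prompt_id"] == "simple_prompt_builder"
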
--- /dev/null
+++ b/kiln_ai/datamodel/test_eval_model.py
@@ -0,0 +1,635 @@
+import pytest
+from pydantic import ValidationError
+
+from kiln_ai.datamodel import BasePrompt
+from kiln_ai.datamodel.basemodel import KilnParentModel
+from kiln_ai.datamodel.eval import (
+    Eval,
+    EvalConfig,
+    EvalConfigType,
+    EvalOutputScore,
+    EvalRun,
+)
+from kiln_ai.datamodel.task import Task
+from kiln_ai.datamodel.task_output import (
+    TaskOutputRatingType,
+)
+
+
+@pytest.fixture
+def mock_task():
+    return Task(name="Test Task", instruction="Test instruction")
+
+
+@pytest.fixture
+def valid_eval_config_data():
+    return {
+        "name": "Test Eval Config",
+        "config_type": EvalConfigType.g_eval,
+        "properties": {"eval_steps": ["step1", "step2"]},
+        "model_name": "gpt-4",
+        "model_provider": "openai",
+    }
+
+
+@pytest.fixture
+def valid_eval_config(valid_eval_config_data):
+    return EvalConfig(**valid_eval_config_data)
+
+
+def test_eval_config_valid(valid_eval_config):
+    assert valid_eval_config.name == "Test Eval Config"
+    assert valid_eval_config.config_type == EvalConfigType.g_eval
+    assert valid_eval_config.properties["eval_steps"] == ["step1", "step2"]
+    assert valid_eval_config.model_name == "gpt-4"
+    assert valid_eval_config.model_provider == "openai"
+
+
+def test_eval_config_missing_eval_steps(valid_eval_config):
+    with pytest.raises(
+        ValueError, match="eval_steps is required and must be a list for g_eval"
+    ):
+        valid_eval_config.properties = {}
+
+
+def test_eval_config_missing_task_description(valid_eval_config):
+    with pytest.raises(
+        ValueError,
+        match="task_description is optional, but if provided must be a string",
+    ):
+        valid_eval_config.properties = {"task_description": 123, "eval_steps": []}
+
+
+def test_eval_config_invalid_json(valid_eval_config):
+    class InvalidClass:
+        pass
+
+    with pytest.raises(ValueError, match="Properties must be JSON serializable"):
+        valid_eval_config.properties = {
+            "eval_steps": [],
+            "invalid_key": InvalidClass(),
+        }
+
+
+def test_eval_config_invalid_eval_steps_type(valid_eval_config):
+    with pytest.raises(
+        ValueError, match="eval_steps is required and must be a list for g_eval"
+    ):
+        valid_eval_config.properties = {"eval_steps": "not a list"}
+
+
+def test_eval_config_invalid_config_type(valid_eval_config):
+    # Create an invalid config type using string
+    with pytest.raises(ValueError):
+        valid_eval_config.config_type = "invalid_type"
+
+
+def test_eval_basic_properties():
+    eval = Eval(
+        name="Test Eval",
+        description="Test Description",
+        current_config_id="config123",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            )
+        ],
+    )
+
+    assert eval.name == "Test Eval"
+    assert eval.description == "Test Description"
+    assert eval.current_config_id == "config123"
+    assert eval.output_scores[0].name == "accuracy"
+    assert eval.output_scores[0].type == TaskOutputRatingType.five_star
+
+
+def test_eval_default_values():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="quality",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+
+    assert eval.description is None
+    assert eval.current_config_id is None
+
+
+def test_eval_parent_task_relationship(mock_task, valid_eval_config_data):
+    eval = Eval(
+        name="Test Eval",
+        parent=mock_task,
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="score",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    config = EvalConfig(parent=eval, **valid_eval_config_data)
+
+    assert eval.parent_task() == mock_task
+    assert eval.parent == mock_task
+    assert config.parent == eval
+    assert config.parent_eval() == eval
+
+
+def test_eval_parent_task_none():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="score",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    assert eval.parent_task() is None
+
+
+def test_eval_parent_task_wrong_type():
+    # Create a non-Task parent
+    class DummyParent(KilnParentModel, parent_of={}):
+        pass
+
+    with pytest.raises(ValueError):
+        Eval(name="Test Eval", parent=DummyParent())
+
+
+def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_path):
+    task_path = tmp_path / "task.kiln"
+    mock_task.path = task_path
+    mock_task.save_to_file()
+
+    eval = Eval(
+        name="Test Eval",
+        parent=mock_task,
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    eval.save_to_file()
+
+    # Add config using the parent relationship
+    config = EvalConfig(parent=eval, **valid_eval_config_data)
+    config.save_to_file()
+
+    run = EvalRun(
+        parent=config,
+        dataset_id="dataset123",
+        task_run_config_id="config456",
+        input='{"key": "value"}',
+        output='{"result": "success"}',
+        scores={"accuracy": 0.95},
+    )
+    run.save_to_file()
+
+    # Test configs can be retrieved from disk
+    evals = mock_task.evals()
+    assert len(evals) == 1
+    assert evals[0].name == "Test Eval"
+    configs = evals[0].configs()
+    assert len(configs) == 1
+    assert configs[0].model_provider == "openai"
+    assert configs[0].model_name == "gpt-4"
+
+    # and back up
+    assert configs[0].parent_eval().parent_task().path == task_path
+
+    # Test runs can be retrieved from disk
+    runs = configs[0].runs()
+    assert len(runs) == 1
+    assert runs[0].dataset_id == "dataset123"
+    assert runs[0].task_run_config_id == "config456"
+    assert runs[0].input == '{"key": "value"}'
+    assert runs[0].output == '{"result": "success"}'
+    assert runs[0].scores == {"accuracy": 0.95}
+
+    # and back up
+    assert runs[0].parent_eval_config().parent_eval().parent_task().path == task_path
+
+
+def test_eval_run_valid_creation():
+    """Test creating an EvalRun with valid data"""
+    eval_run = EvalRun(
+        dataset_id="dataset123",
+        task_run_config_id="config456",
+        input='{"key": "value"}',  # JSON formatted input
+        output='{"result": "success"}',  # JSON formatted output
+        scores={"accuracy": 0.95},
+    )
+
+    assert eval_run.dataset_id == "dataset123"
+    assert eval_run.task_run_config_id == "config456"
+    assert eval_run.input == '{"key": "value"}'
+    assert eval_run.output == '{"result": "success"}'
+    assert eval_run.scores == {"accuracy": 0.95}
+
+
+def test_eval_run_plaintext():
+    """Test creating an EvalRun with plaintext input/output"""
+    eval_run = EvalRun(
+        dataset_id="dataset123",
+        task_run_config_id="config456",
+        input="What is the capital of France?",
+        output="The capital of France is Paris.",
+        scores={"accuracy": 1.0},
+    )
+
+    assert eval_run.input == "What is the capital of France?"
+    assert eval_run.output == "The capital of France is Paris."
+
+
+def test_eval_run_missing_required_fields():
+    """Test that omitting required fields raises ValidationError"""
+    with pytest.raises(ValidationError) as exc_info:
+        EvalRun(
+            dataset_id="dataset123",
+            # missing task_run_config_id
+            input="test",
+            output="test",
+            scores={"score": 1.0},
+        )
+
+    assert "task_run_config_id" in str(exc_info.value)
+
+
+def test_eval_run_invalid_scores():
+    """Test that scores must be a dict of floats"""
+    with pytest.raises(ValidationError):
+        EvalRun(
+            dataset_id="dataset123",
+            task_run_config_id="config456",
+            input="test",
+            output="test",
+            scores={"score": "not a float"},  # invalid score type
+        )
+
+
+def test_eval_missing_output_scores():
+    """Test that eval creation fails when output_scores is missing"""
+    with pytest.raises(ValidationError) as exc_info:
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+        )
+    assert "output_scores" in str(exc_info.value)
+
+
+def test_eval_empty_output_scores():
+    """Test that eval creation fails when output_scores is empty"""
+    with pytest.raises(
+        ValueError, match="output_scores are required, and must have at least one score"
+    ):
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[],
+        )
+
+
+def test_eval_duplicate_output_scores():
+    """Test that eval creation fails when output_scores has duplicate names"""
+    with pytest.raises(
+        ValueError,
+        match="must have unique names",
+    ):
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[
+                EvalOutputScore(
+                    name="score",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(name="SCORE", type=TaskOutputRatingType.pass_fail),
+            ],
+        )
+
+
+def test_eval_invalid_score_type():
+    """Test that eval creation fails with invalid rating type in output_scores"""
+    with pytest.raises(
+        ValueError,
+        match="Input should be 'five_star', 'pass_fail', 'pass_fail_critical'",
+    ):
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[
+                EvalOutputScore(
+                    name="score",
+                    type="invalid_type",
+                )
+            ],
+        )
+
+
+def test_eval_valid_output_scores():
+    """Test that eval creation succeeds with valid output_scores"""
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="critical_check",
+                type=TaskOutputRatingType.pass_fail_critical,
+            ),
+            EvalOutputScore(name="basic_check", type=TaskOutputRatingType.pass_fail),
+        ],
+    )
+    assert len(eval.output_scores) == 3
+    assert eval.output_scores[0].type == TaskOutputRatingType.five_star
+    assert eval.output_scores[0].name == "accuracy"
+    assert eval.output_scores[1].type == TaskOutputRatingType.pass_fail_critical
+    assert eval.output_scores[1].name == "critical_check"
+    assert eval.output_scores[2].type == TaskOutputRatingType.pass_fail
+    assert eval.output_scores[2].name == "basic_check"
+
+
+@pytest.fixture
+def valid_eval_run_data():
+    return {
+        "dataset_id": "dataset123",
+        "task_run_config_id": "config456",
+        "input": "test input",
+        "output": "test output",
+        "scores": {"accuracy": 4.5},
+    }
+
+
+def test_eval_run_five_star_score_validation(valid_eval_config, valid_eval_run_data):
+    # Setup eval with five_star rating
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            )
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Valid score
+    run = EvalRun(parent=valid_eval_config, **valid_eval_run_data)
+    assert run.scores["accuracy"] == 4.5
+
+    # Invalid scores
+    with pytest.raises(ValueError, match="must be a float between 1.0 and 5.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 0.5}},
+        )
+
+    with pytest.raises(ValueError, match="must be a float between 1.0 and 5.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 5.5}},
+        )
+
+
+def test_eval_run_pass_fail_score_validation(valid_eval_config, valid_eval_run_data):
+    # Setup eval with pass_fail rating
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="check",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Valid scores
+    run = EvalRun(
+        parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 1.0}}
+    )
+    assert run.scores["check"] == 1.0
+
+    run = EvalRun(
+        parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 0.0}}
+    )
+    assert run.scores["check"] == 0.0
+
+    # Invalid scores
+    with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"check": -0.1}},
+        )
+
+    with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"check": 1.1}},
+        )
+
+
+def test_eval_run_pass_fail_critical_score_validation(
+    valid_eval_config, valid_eval_run_data
+):
+    # Setup eval with pass_fail_critical rating
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="critical",
+                type=TaskOutputRatingType.pass_fail_critical,
+            )
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Valid scores
+    run = EvalRun(
+        parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"critical": 1.0}}
+    )
+    assert run.scores["critical"] == 1.0
+
+    run = EvalRun(
+        parent=valid_eval_config,
+        **{**valid_eval_run_data, "scores": {"critical": -1.0}},
+    )
+    assert run.scores["critical"] == -1.0
+
+    # Invalid scores
+    with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"critical": -1.1}},
+        )
+
+    with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"critical": 1.1}},
+        )
+
+
+def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="critical",
+                type=TaskOutputRatingType.pass_fail_critical,
+            ),
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Correct
+    run = EvalRun(
+        parent=valid_eval_config,
+        **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "critical": 1.0}},
+    )
+
+    # Correct but wrong order still okay
+    run = EvalRun(
+        parent=valid_eval_config,
+        **{**valid_eval_run_data, "scores": {"critical": 1.0, "accuracy": 4.5}},
+    )
+
+    # Missing score
+    with pytest.raises(
+        ValueError,
+        match="The scores produced by the evaluator must match the scores expected by the eval",
+    ):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 4.5}},
+        )
+
+    # Extra score
+    with pytest.raises(
+        ValueError,
+        match="The scores produced by the evaluator must match the scores expected by the eval",
+    ):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{
+                **valid_eval_run_data,
+                "scores": {"accuracy": 4.5, "critical": 1.0, "extra": 1.0},
+            },
+        )
+
+    # Missing score w matching count
+    with pytest.raises(
+        ValueError,
+        match="The scores produced by the evaluator must match the scores expected by the eval",
+    ):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "wrong": 1.0}},
+        )
+
+
+def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_data):
+    with pytest.raises(
+        ValueError, match="Custom scores are not supported in evaluators"
+    ):
+        eval = Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[
+                EvalOutputScore(
+                    name="custom",
+                    type=TaskOutputRatingType.custom,
+                )
+            ],
+        )
+
+
+def test_eval_run_eval_config_eval_validation():
+    """Test that eval_config_eval and task_run_config_id validation works correctly"""
+
+    # Case 1: Valid configuration - eval_config_eval=True and task_run_config_id=None
+    valid_run1 = EvalRun(
+        dataset_id="dataset123",
+        eval_config_eval=True,
+        task_run_config_id=None,
+        input="test input",
+        output="test output",
+        scores={"score": 1.0},
+    )
+    assert valid_run1.eval_config_eval is True
+    assert valid_run1.task_run_config_id is None
+
+    # Case 2: Valid configuration - eval_config_eval=False and task_run_config_id is set
+    valid_run2 = EvalRun(
+        dataset_id="dataset123",
+        eval_config_eval=False,
+        task_run_config_id="config456",
+        input="test input",
+        output="test output",
+        scores={"score": 1.0},
+    )
+    assert valid_run2.eval_config_eval is False
+    assert valid_run2.task_run_config_id == "config456"
+
+    # Case 3: Invalid configuration - eval_config_eval=True but task_run_config_id is set
+    with pytest.raises(
+        ValueError, match="task_run_config_id must be None if eval_config_eval is true"
+    ):
+        EvalRun(
+            dataset_id="dataset123",
+            eval_config_eval=True,
+            task_run_config_id="config456",
+            input="test input",
+            output="test output",
+            scores={"score": 1.0},
+        )
+
+    # Case 4: Invalid configuration - eval_config_eval=False but task_run_config_id is None
+    with pytest.raises(
+        ValueError, match="task_run_config_id must be set if eval_config_eval is false"
+    ):
+        EvalRun(
+            dataset_id="dataset123",
+            eval_config_eval=False,
+            task_run_config_id=None,
+            input="test input",
+            output="test output",
+            scores={"score": 1.0},
+        )