kiln-ai 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of kiln-ai has been flagged as potentially problematic.

Files changed (63)
  1. kiln_ai/adapters/adapter_registry.py +12 -13
  2. kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
  3. kiln_ai/adapters/eval/base_eval.py +164 -0
  4. kiln_ai/adapters/eval/eval_runner.py +267 -0
  5. kiln_ai/adapters/eval/g_eval.py +367 -0
  6. kiln_ai/adapters/eval/registry.py +16 -0
  7. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  8. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  9. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  10. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  11. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
  12. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
  13. kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
  14. kiln_ai/adapters/ml_model_list.py +141 -29
  15. kiln_ai/adapters/model_adapters/base_adapter.py +50 -35
  16. kiln_ai/adapters/model_adapters/langchain_adapters.py +27 -20
  17. kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -1
  18. kiln_ai/adapters/model_adapters/openai_model_adapter.py +93 -50
  19. kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
  20. kiln_ai/adapters/model_adapters/test_langchain_adapter.py +7 -14
  21. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +55 -64
  22. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
  23. kiln_ai/adapters/model_adapters/test_structured_output.py +36 -30
  24. kiln_ai/adapters/ollama_tools.py +0 -1
  25. kiln_ai/adapters/prompt_builders.py +80 -42
  26. kiln_ai/adapters/repair/repair_task.py +9 -21
  27. kiln_ai/adapters/repair/test_repair_task.py +3 -3
  28. kiln_ai/adapters/run_output.py +3 -0
  29. kiln_ai/adapters/test_adapter_registry.py +10 -10
  30. kiln_ai/adapters/test_generate_docs.py +6 -6
  31. kiln_ai/adapters/test_ollama_tools.py +0 -1
  32. kiln_ai/adapters/test_prompt_adaptors.py +17 -14
  33. kiln_ai/adapters/test_prompt_builders.py +91 -31
  34. kiln_ai/datamodel/__init__.py +50 -952
  35. kiln_ai/datamodel/datamodel_enums.py +58 -0
  36. kiln_ai/datamodel/dataset_filters.py +114 -0
  37. kiln_ai/datamodel/dataset_split.py +170 -0
  38. kiln_ai/datamodel/eval.py +298 -0
  39. kiln_ai/datamodel/finetune.py +105 -0
  40. kiln_ai/datamodel/json_schema.py +6 -0
  41. kiln_ai/datamodel/project.py +23 -0
  42. kiln_ai/datamodel/prompt.py +37 -0
  43. kiln_ai/datamodel/prompt_id.py +83 -0
  44. kiln_ai/datamodel/strict_mode.py +24 -0
  45. kiln_ai/datamodel/task.py +181 -0
  46. kiln_ai/datamodel/task_output.py +321 -0
  47. kiln_ai/datamodel/task_run.py +164 -0
  48. kiln_ai/datamodel/test_basemodel.py +10 -11
  49. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  50. kiln_ai/datamodel/test_dataset_split.py +32 -8
  51. kiln_ai/datamodel/test_datasource.py +3 -2
  52. kiln_ai/datamodel/test_eval_model.py +635 -0
  53. kiln_ai/datamodel/test_example_models.py +9 -13
  54. kiln_ai/datamodel/test_json_schema.py +23 -0
  55. kiln_ai/datamodel/test_models.py +2 -2
  56. kiln_ai/datamodel/test_prompt_id.py +129 -0
  57. kiln_ai/datamodel/test_task.py +159 -0
  58. kiln_ai/utils/config.py +6 -1
  59. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +37 -1
  60. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  61. kiln_ai-0.11.1.dist-info/RECORD +0 -76
  62. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  63. {kiln_ai-0.11.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/test_datasource.py
@@ -18,14 +18,14 @@ def test_valid_synthetic_data_source():
         properties={
             "model_name": "GPT-4",
             "model_provider": "OpenAI",
-            "prompt_builder_name": "completion",
+            "prompt_id": "simple_prompt_builder",
             "adapter_name": "langchain",
         },
     )
     assert data_source.type == DataSourceType.synthetic
     assert data_source.properties["model_name"] == "GPT-4"
     assert data_source.properties["model_provider"] == "OpenAI"
-    assert data_source.properties["prompt_builder_name"] == "completion"
+    assert data_source.properties["prompt_id"] == "simple_prompt_builder"
     assert data_source.properties["adapter_name"] == "langchain"
 
 
@@ -85,6 +85,7 @@ def test_prompt_type_optional_for_synthetic():
         },
     )
     assert "prompt_builder_name" not in data_source.properties
+    assert "prompt_id" not in data_source.properties
 
 
 def test_private_data_source_properties_not_serialized():
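The only substantive change in this file is the rename of the synthetic data source property from prompt_builder_name to prompt_id, with the value moving from a builder name ("completion") to a prompt ID ("simple_prompt_builder"). A minimal sketch of the new shape, assuming DataSource and DataSourceType remain importable from kiln_ai.datamodel as in 0.11.x (the imports are not shown in this hunk):

from kiln_ai.datamodel import DataSource, DataSourceType

# Illustrative construction mirroring the updated test above; the property values
# are placeholders taken from the test data, not a required convention.
data_source = DataSource(
    type=DataSourceType.synthetic,
    properties={
        "model_name": "GPT-4",
        "model_provider": "OpenAI",
        "prompt_id": "simple_prompt_builder",  # was "prompt_builder_name" in 0.11.x
        "adapter_name": "langchain",
    },
)
assert data_source.properties["prompt_id"] == "simple_prompt_builder"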
kiln_ai/datamodel/test_eval_model.py (new file)
@@ -0,0 +1,635 @@
+import pytest
+from pydantic import ValidationError
+
+from kiln_ai.datamodel import BasePrompt
+from kiln_ai.datamodel.basemodel import KilnParentModel
+from kiln_ai.datamodel.eval import (
+    Eval,
+    EvalConfig,
+    EvalConfigType,
+    EvalOutputScore,
+    EvalRun,
+)
+from kiln_ai.datamodel.task import Task
+from kiln_ai.datamodel.task_output import (
+    TaskOutputRatingType,
+)
+
+
+@pytest.fixture
+def mock_task():
+    return Task(name="Test Task", instruction="Test instruction")
+
+
+@pytest.fixture
+def valid_eval_config_data():
+    return {
+        "name": "Test Eval Config",
+        "config_type": EvalConfigType.g_eval,
+        "properties": {"eval_steps": ["step1", "step2"]},
+        "model_name": "gpt-4",
+        "model_provider": "openai",
+    }
+
+
+@pytest.fixture
+def valid_eval_config(valid_eval_config_data):
+    return EvalConfig(**valid_eval_config_data)
+
+
+def test_eval_config_valid(valid_eval_config):
+    assert valid_eval_config.name == "Test Eval Config"
+    assert valid_eval_config.config_type == EvalConfigType.g_eval
+    assert valid_eval_config.properties["eval_steps"] == ["step1", "step2"]
+    assert valid_eval_config.model_name == "gpt-4"
+    assert valid_eval_config.model_provider == "openai"
+
+
+def test_eval_config_missing_eval_steps(valid_eval_config):
+    with pytest.raises(
+        ValueError, match="eval_steps is required and must be a list for g_eval"
+    ):
+        valid_eval_config.properties = {}
+
+
+def test_eval_config_missing_task_description(valid_eval_config):
+    with pytest.raises(
+        ValueError,
+        match="task_description is optional, but if provided must be a string",
+    ):
+        valid_eval_config.properties = {"task_description": 123, "eval_steps": []}
+
+
+def test_eval_config_invalid_json(valid_eval_config):
+    class InvalidClass:
+        pass
+
+    with pytest.raises(ValueError, match="Properties must be JSON serializable"):
+        valid_eval_config.properties = {
+            "eval_steps": [],
+            "invalid_key": InvalidClass(),
+        }
+
+
+def test_eval_config_invalid_eval_steps_type(valid_eval_config):
+    with pytest.raises(
+        ValueError, match="eval_steps is required and must be a list for g_eval"
+    ):
+        valid_eval_config.properties = {"eval_steps": "not a list"}
+
+
+def test_eval_config_invalid_config_type(valid_eval_config):
+    # Create an invalid config type using string
+    with pytest.raises(ValueError):
+        valid_eval_config.config_type = "invalid_type"
+
+
+def test_eval_basic_properties():
+    eval = Eval(
+        name="Test Eval",
+        description="Test Description",
+        current_config_id="config123",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            )
+        ],
+    )
+
+    assert eval.name == "Test Eval"
+    assert eval.description == "Test Description"
+    assert eval.current_config_id == "config123"
+    assert eval.output_scores[0].name == "accuracy"
+    assert eval.output_scores[0].type == TaskOutputRatingType.five_star
+
+
+def test_eval_default_values():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="quality",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+
+    assert eval.description is None
+    assert eval.current_config_id is None
+
+
+def test_eval_parent_task_relationship(mock_task, valid_eval_config_data):
+    eval = Eval(
+        name="Test Eval",
+        parent=mock_task,
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="score",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    config = EvalConfig(parent=eval, **valid_eval_config_data)
+
+    assert eval.parent_task() == mock_task
+    assert eval.parent == mock_task
+    assert config.parent == eval
+    assert config.parent_eval() == eval
+
+
+def test_eval_parent_task_none():
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="score",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    assert eval.parent_task() is None
+
+
+def test_eval_parent_task_wrong_type():
+    # Create a non-Task parent
+    class DummyParent(KilnParentModel, parent_of={}):
+        pass
+
+    with pytest.raises(ValueError):
+        Eval(name="Test Eval", parent=DummyParent())
+
+
+def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_path):
+    task_path = tmp_path / "task.kiln"
+    mock_task.path = task_path
+    mock_task.save_to_file()
+
+    eval = Eval(
+        name="Test Eval",
+        parent=mock_task,
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    eval.save_to_file()
+
+    # Add config using the parent relationship
+    config = EvalConfig(parent=eval, **valid_eval_config_data)
+    config.save_to_file()
+
+    run = EvalRun(
+        parent=config,
+        dataset_id="dataset123",
+        task_run_config_id="config456",
+        input='{"key": "value"}',
+        output='{"result": "success"}',
+        scores={"accuracy": 0.95},
+    )
+    run.save_to_file()
+
+    # Test configs can be retrieved from disk
+    evals = mock_task.evals()
+    assert len(evals) == 1
+    assert evals[0].name == "Test Eval"
+    configs = evals[0].configs()
+    assert len(configs) == 1
+    assert configs[0].model_provider == "openai"
+    assert configs[0].model_name == "gpt-4"
+
+    # and back up
+    assert configs[0].parent_eval().parent_task().path == task_path
+
+    # Test runs can be retrieved from disk
+    runs = configs[0].runs()
+    assert len(runs) == 1
+    assert runs[0].dataset_id == "dataset123"
+    assert runs[0].task_run_config_id == "config456"
+    assert runs[0].input == '{"key": "value"}'
+    assert runs[0].output == '{"result": "success"}'
+    assert runs[0].scores == {"accuracy": 0.95}
+
+    # and back up
+    assert runs[0].parent_eval_config().parent_eval().parent_task().path == task_path
+
+
+def test_eval_run_valid_creation():
+    """Test creating an EvalRun with valid data"""
+    eval_run = EvalRun(
+        dataset_id="dataset123",
+        task_run_config_id="config456",
+        input='{"key": "value"}',  # JSON formatted input
+        output='{"result": "success"}',  # JSON formatted output
+        scores={"accuracy": 0.95},
+    )
+
+    assert eval_run.dataset_id == "dataset123"
+    assert eval_run.task_run_config_id == "config456"
+    assert eval_run.input == '{"key": "value"}'
+    assert eval_run.output == '{"result": "success"}'
+    assert eval_run.scores == {"accuracy": 0.95}
+
+
+def test_eval_run_plaintext():
+    """Test creating an EvalRun with plaintext input/output"""
+    eval_run = EvalRun(
+        dataset_id="dataset123",
+        task_run_config_id="config456",
+        input="What is the capital of France?",
+        output="The capital of France is Paris.",
+        scores={"accuracy": 1.0},
+    )
+
+    assert eval_run.input == "What is the capital of France?"
+    assert eval_run.output == "The capital of France is Paris."
+
+
+def test_eval_run_missing_required_fields():
+    """Test that omitting required fields raises ValidationError"""
+    with pytest.raises(ValidationError) as exc_info:
+        EvalRun(
+            dataset_id="dataset123",
+            # missing task_run_config_id
+            input="test",
+            output="test",
+            scores={"score": 1.0},
+        )
+
+    assert "task_run_config_id" in str(exc_info.value)
+
+
+def test_eval_run_invalid_scores():
+    """Test that scores must be a dict of floats"""
+    with pytest.raises(ValidationError):
+        EvalRun(
+            dataset_id="dataset123",
+            task_run_config_id="config456",
+            input="test",
+            output="test",
+            scores={"score": "not a float"},  # invalid score type
+        )
+
+
+def test_eval_missing_output_scores():
+    """Test that eval creation fails when output_scores is missing"""
+    with pytest.raises(ValidationError) as exc_info:
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+        )
+    assert "output_scores" in str(exc_info.value)
+
+
+def test_eval_empty_output_scores():
+    """Test that eval creation fails when output_scores is empty"""
+    with pytest.raises(
+        ValueError, match="output_scores are required, and must have at least one score"
+    ):
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[],
+        )
+
+
+def test_eval_duplicate_output_scores():
+    """Test that eval creation fails when output_scores has duplicate names"""
+    with pytest.raises(
+        ValueError,
+        match="must have unique names",
+    ):
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[
+                EvalOutputScore(
+                    name="score",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(name="SCORE", type=TaskOutputRatingType.pass_fail),
+            ],
+        )
+
+
+def test_eval_invalid_score_type():
+    """Test that eval creation fails with invalid rating type in output_scores"""
+    with pytest.raises(
+        ValueError,
+        match="Input should be 'five_star', 'pass_fail', 'pass_fail_critical'",
+    ):
+        Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[
+                EvalOutputScore(
+                    name="score",
+                    type="invalid_type",
+                )
+            ],
+        )
+
+
+def test_eval_valid_output_scores():
+    """Test that eval creation succeeds with valid output_scores"""
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="critical_check",
+                type=TaskOutputRatingType.pass_fail_critical,
+            ),
+            EvalOutputScore(name="basic_check", type=TaskOutputRatingType.pass_fail),
+        ],
+    )
+    assert len(eval.output_scores) == 3
+    assert eval.output_scores[0].type == TaskOutputRatingType.five_star
+    assert eval.output_scores[0].name == "accuracy"
+    assert eval.output_scores[1].type == TaskOutputRatingType.pass_fail_critical
+    assert eval.output_scores[1].name == "critical_check"
+    assert eval.output_scores[2].type == TaskOutputRatingType.pass_fail
+    assert eval.output_scores[2].name == "basic_check"
+
+
+@pytest.fixture
+def valid_eval_run_data():
+    return {
+        "dataset_id": "dataset123",
+        "task_run_config_id": "config456",
+        "input": "test input",
+        "output": "test output",
+        "scores": {"accuracy": 4.5},
+    }
+
+
+def test_eval_run_five_star_score_validation(valid_eval_config, valid_eval_run_data):
+    # Setup eval with five_star rating
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            )
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Valid score
+    run = EvalRun(parent=valid_eval_config, **valid_eval_run_data)
+    assert run.scores["accuracy"] == 4.5
+
+    # Invalid scores
+    with pytest.raises(ValueError, match="must be a float between 1.0 and 5.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 0.5}},
+        )
+
+    with pytest.raises(ValueError, match="must be a float between 1.0 and 5.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 5.5}},
+        )
+
+
+def test_eval_run_pass_fail_score_validation(valid_eval_config, valid_eval_run_data):
+    # Setup eval with pass_fail rating
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="check",
+                type=TaskOutputRatingType.pass_fail,
+            )
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Valid scores
+    run = EvalRun(
+        parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 1.0}}
+    )
+    assert run.scores["check"] == 1.0
+
+    run = EvalRun(
+        parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 0.0}}
+    )
+    assert run.scores["check"] == 0.0
+
+    # Invalid scores
+    with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"check": -0.1}},
+        )
+
+    with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"check": 1.1}},
+        )
+
+
+def test_eval_run_pass_fail_critical_score_validation(
+    valid_eval_config, valid_eval_run_data
+):
+    # Setup eval with pass_fail_critical rating
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="critical",
+                type=TaskOutputRatingType.pass_fail_critical,
+            )
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Valid scores
+    run = EvalRun(
+        parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"critical": 1.0}}
+    )
+    assert run.scores["critical"] == 1.0
+
+    run = EvalRun(
+        parent=valid_eval_config,
+        **{**valid_eval_run_data, "scores": {"critical": -1.0}},
+    )
+    assert run.scores["critical"] == -1.0
+
+    # Invalid scores
+    with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"critical": -1.1}},
+        )
+
+    with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"critical": 1.1}},
+        )
+
+
+def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data):
+    eval = Eval(
+        name="Test Eval",
+        eval_set_filter_id="tag::tag1",
+        eval_configs_filter_id="tag::tag2",
+        output_scores=[
+            EvalOutputScore(
+                name="accuracy",
+                type=TaskOutputRatingType.five_star,
+            ),
+            EvalOutputScore(
+                name="critical",
+                type=TaskOutputRatingType.pass_fail_critical,
+            ),
+        ],
+    )
+    valid_eval_config.parent = eval
+
+    # Correct
+    run = EvalRun(
+        parent=valid_eval_config,
+        **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "critical": 1.0}},
+    )
+
+    # Correct but wrong order still okay
+    run = EvalRun(
+        parent=valid_eval_config,
+        **{**valid_eval_run_data, "scores": {"critical": 1.0, "accuracy": 4.5}},
+    )
+
+    # Missing score
+    with pytest.raises(
+        ValueError,
+        match="The scores produced by the evaluator must match the scores expected by the eval",
+    ):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 4.5}},
+        )
+
+    # Extra score
+    with pytest.raises(
+        ValueError,
+        match="The scores produced by the evaluator must match the scores expected by the eval",
+    ):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{
+                **valid_eval_run_data,
+                "scores": {"accuracy": 4.5, "critical": 1.0, "extra": 1.0},
+            },
+        )
+
+    # Missing score w matching count
+    with pytest.raises(
+        ValueError,
+        match="The scores produced by the evaluator must match the scores expected by the eval",
+    ):
+        run = EvalRun(
+            parent=valid_eval_config,
+            **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "wrong": 1.0}},
+        )
+
+
+def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_data):
+    with pytest.raises(
+        ValueError, match="Custom scores are not supported in evaluators"
+    ):
+        eval = Eval(
+            name="Test Eval",
+            eval_set_filter_id="tag::tag1",
+            eval_configs_filter_id="tag::tag2",
+            output_scores=[
+                EvalOutputScore(
+                    name="custom",
+                    type=TaskOutputRatingType.custom,
+                )
+            ],
+        )
+
+
+def test_eval_run_eval_config_eval_validation():
+    """Test that eval_config_eval and task_run_config_id validation works correctly"""
+
+    # Case 1: Valid configuration - eval_config_eval=True and task_run_config_id=None
+    valid_run1 = EvalRun(
+        dataset_id="dataset123",
+        eval_config_eval=True,
+        task_run_config_id=None,
+        input="test input",
+        output="test output",
+        scores={"score": 1.0},
+    )
+    assert valid_run1.eval_config_eval is True
+    assert valid_run1.task_run_config_id is None
+
+    # Case 2: Valid configuration - eval_config_eval=False and task_run_config_id is set
+    valid_run2 = EvalRun(
+        dataset_id="dataset123",
+        eval_config_eval=False,
+        task_run_config_id="config456",
+        input="test input",
+        output="test output",
+        scores={"score": 1.0},
+    )
+    assert valid_run2.eval_config_eval is False
+    assert valid_run2.task_run_config_id == "config456"
+
+    # Case 3: Invalid configuration - eval_config_eval=True but task_run_config_id is set
+    with pytest.raises(
+        ValueError, match="task_run_config_id must be None if eval_config_eval is true"
+    ):
+        EvalRun(
+            dataset_id="dataset123",
+            eval_config_eval=True,
+            task_run_config_id="config456",
+            input="test input",
+            output="test output",
+            scores={"score": 1.0},
+        )
+
+    # Case 4: Invalid configuration - eval_config_eval=False but task_run_config_id is None
+    with pytest.raises(
+        ValueError, match="task_run_config_id must be set if eval_config_eval is false"
+    ):
+        EvalRun(
+            dataset_id="dataset123",
+            eval_config_eval=False,
+            task_run_config_id=None,
+            input="test input",
+            output="test output",
+            scores={"score": 1.0},
+        )
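Taken together, these tests outline the construction order for the new eval datamodel: an Eval is parented to a Task, EvalConfig objects are parented to the Eval, and EvalRun records are parented to a config. The following condensed sketch restates that flow using only fields and constraints exercised above; the IDs and model names are placeholder values copied from the test data, not a required convention:

from kiln_ai.datamodel.eval import (
    Eval,
    EvalConfig,
    EvalConfigType,
    EvalOutputScore,
    EvalRun,
)
from kiln_ai.datamodel.task import Task
from kiln_ai.datamodel.task_output import TaskOutputRatingType

task = Task(name="Test Task", instruction="Test instruction")

# An Eval requires at least one uniquely named output score plus two dataset filter IDs.
eval = Eval(
    parent=task,
    name="Test Eval",
    eval_set_filter_id="tag::tag1",
    eval_configs_filter_id="tag::tag2",
    output_scores=[
        EvalOutputScore(name="accuracy", type=TaskOutputRatingType.five_star)
    ],
)

# A g_eval config must carry a list of eval_steps in its properties.
config = EvalConfig(
    parent=eval,
    name="Test Eval Config",
    config_type=EvalConfigType.g_eval,
    properties={"eval_steps": ["step1", "step2"]},
    model_name="gpt-4",
    model_provider="openai",
)

# EvalRun scores must match the eval's output_scores by name, and each value is
# range-checked by type: 1.0-5.0 for five_star, 0.0-1.0 for pass_fail,
# -1.0-1.0 for pass_fail_critical; custom score types are rejected.
run = EvalRun(
    parent=config,
    dataset_id="dataset123",  # placeholder ID from the tests above
    task_run_config_id="config456",  # placeholder ID from the tests above
    input="test input",
    output="test output",
    scores={"accuracy": 4.5},
)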