kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.

Potentially problematic release: this version of kiln-ai might be problematic.

Files changed (88)
  1. kiln_ai/adapters/__init__.py +7 -7
  2. kiln_ai/adapters/adapter_registry.py +81 -10
  3. kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +267 -0
  7. kiln_ai/adapters/eval/g_eval.py +367 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +324 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +640 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +497 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
  15. kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
  16. kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
  17. kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
  18. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
  19. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
  20. kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
  21. kiln_ai/adapters/ml_model_list.py +434 -93
  22. kiln_ai/adapters/model_adapters/__init__.py +18 -0
  23. kiln_ai/adapters/model_adapters/base_adapter.py +250 -0 (see the import-path note after this list)
  24. kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
  25. kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
  26. kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
  27. kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
  28. kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
  29. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
  30. kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
  31. kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
  32. kiln_ai/adapters/ollama_tools.py +0 -1
  33. kiln_ai/adapters/parsers/__init__.py +10 -0
  34. kiln_ai/adapters/parsers/base_parser.py +12 -0
  35. kiln_ai/adapters/parsers/json_parser.py +37 -0
  36. kiln_ai/adapters/parsers/parser_registry.py +19 -0
  37. kiln_ai/adapters/parsers/r1_parser.py +69 -0
  38. kiln_ai/adapters/parsers/test_json_parser.py +81 -0
  39. kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
  40. kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
  41. kiln_ai/adapters/prompt_builders.py +193 -49
  42. kiln_ai/adapters/provider_tools.py +91 -36
  43. kiln_ai/adapters/repair/repair_task.py +18 -19
  44. kiln_ai/adapters/repair/test_repair_task.py +7 -7
  45. kiln_ai/adapters/run_output.py +11 -0
  46. kiln_ai/adapters/test_adapter_registry.py +177 -0
  47. kiln_ai/adapters/test_generate_docs.py +69 -0
  48. kiln_ai/adapters/test_ollama_tools.py +0 -1
  49. kiln_ai/adapters/test_prompt_adaptors.py +25 -18
  50. kiln_ai/adapters/test_prompt_builders.py +265 -44
  51. kiln_ai/adapters/test_provider_tools.py +268 -46
  52. kiln_ai/datamodel/__init__.py +51 -772
  53. kiln_ai/datamodel/basemodel.py +31 -11
  54. kiln_ai/datamodel/datamodel_enums.py +58 -0
  55. kiln_ai/datamodel/dataset_filters.py +114 -0
  56. kiln_ai/datamodel/dataset_split.py +170 -0
  57. kiln_ai/datamodel/eval.py +298 -0
  58. kiln_ai/datamodel/finetune.py +105 -0
  59. kiln_ai/datamodel/json_schema.py +14 -3
  60. kiln_ai/datamodel/model_cache.py +8 -3
  61. kiln_ai/datamodel/project.py +23 -0
  62. kiln_ai/datamodel/prompt.py +37 -0
  63. kiln_ai/datamodel/prompt_id.py +83 -0
  64. kiln_ai/datamodel/strict_mode.py +24 -0
  65. kiln_ai/datamodel/task.py +181 -0
  66. kiln_ai/datamodel/task_output.py +321 -0
  67. kiln_ai/datamodel/task_run.py +164 -0
  68. kiln_ai/datamodel/test_basemodel.py +80 -2
  69. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  70. kiln_ai/datamodel/test_dataset_split.py +127 -6
  71. kiln_ai/datamodel/test_datasource.py +3 -2
  72. kiln_ai/datamodel/test_eval_model.py +635 -0
  73. kiln_ai/datamodel/test_example_models.py +34 -17
  74. kiln_ai/datamodel/test_json_schema.py +23 -0
  75. kiln_ai/datamodel/test_model_cache.py +24 -0
  76. kiln_ai/datamodel/test_model_perf.py +125 -0
  77. kiln_ai/datamodel/test_models.py +131 -2
  78. kiln_ai/datamodel/test_prompt_id.py +129 -0
  79. kiln_ai/datamodel/test_task.py +159 -0
  80. kiln_ai/utils/config.py +6 -1
  81. kiln_ai/utils/exhaustive_error.py +6 -0
  82. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
  83. kiln_ai-0.12.0.dist-info/RECORD +100 -0
  84. kiln_ai/adapters/base_adapter.py +0 -191
  85. kiln_ai/adapters/langchain_adapters.py +0 -256
  86. kiln_ai-0.8.1.dist-info/RECORD +0 -58
  87. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
  88. {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
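
Note on the adapter reorganization: the list above removes kiln_ai/adapters/base_adapter.py and kiln_ai/adapters/langchain_adapters.py (items 84-85) and adds replacements under kiln_ai/adapters/model_adapters/ (items 23-24), so downstream imports from the old module paths will likely need updating when moving to 0.12.0. A minimal sketch of the path change, assuming the moved modules keep exporting an adapter class (the BaseAdapter name is an assumption, not confirmed by this diff):

# 0.8.1 import path (module removed in 0.12.0, per item 84 above)
# from kiln_ai.adapters.base_adapter import BaseAdapter  # class name assumed
# 0.12.0 import path (new module, per item 23 above)
from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter  # class name assumed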
kiln_ai/adapters/eval/test_eval_runner.py
@@ -0,0 +1,640 @@
+ from typing import Dict
+ from unittest.mock import AsyncMock, patch
+
+ import pytest
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.eval.eval_runner import EvalJob, EvalRunner
+ from kiln_ai.datamodel import (
+     DataSource,
+     DataSourceType,
+     Task,
+     TaskOutput,
+     TaskOutputRatingType,
+     TaskRun,
+ )
+ from kiln_ai.datamodel.eval import (
+     Eval,
+     EvalConfig,
+     EvalOutputScore,
+     EvalRun,
+     EvalScores,
+ )
+ from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
+
+
+ @pytest.fixture
+ def mock_task(tmp_path):
+     task = Task(
+         name="test",
+         description="test",
+         instruction="do the thing",
+         path=tmp_path / "task.kiln",
+     )
+     task.save_to_file()
+     return task
+
+
+ @pytest.fixture
+ def mock_eval(mock_task):
+     eval = Eval(
+         id="test",
+         name="test",
+         description="test",
+         eval_set_filter_id="all",
+         eval_configs_filter_id="all",
+         output_scores=[
+             EvalOutputScore(
+                 name="Accuracy",
+                 instruction="Check if the output is accurate",
+                 type=TaskOutputRatingType.pass_fail,
+             ),
+         ],
+         parent=mock_task,
+     )
+     eval.save_to_file()
+     return eval
+
+
+ @pytest.fixture
+ def data_source():
+     return DataSource(
+         type=DataSourceType.synthetic,
+         properties={
+             "model_name": "gpt-4",
+             "model_provider": "openai",
+             "adapter_name": "test_adapter",
+         },
+     )
+
+
+ @pytest.fixture
+ def mock_eval_config(mock_eval):
+     eval_config = EvalConfig(
+         name="test",
+         model_name="gpt-4",
+         model_provider="openai",
+         parent=mock_eval,
+         properties={
+             "eval_steps": ["step1", "step2", "step3"],
+         },
+     )
+     eval_config.save_to_file()
+     return eval_config
+
+
+ @pytest.fixture
+ def mock_run_config(
+     mock_task,
+ ):
+     rc = TaskRunConfig(
+         name="test",
+         description="test",
+         run_config_properties=RunConfigProperties(
+             model_name="gpt-4",
+             model_provider_name="openai",
+             prompt_id="simple_prompt_builder",
+         ),
+         parent=mock_task,
+     )
+     rc.save_to_file()
+     return rc
+
+
+ @pytest.fixture
+ def mock_eval_runner(mock_eval, mock_task, mock_eval_config, mock_run_config):
+     return EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=[mock_run_config],
+         eval_run_type="task_run_eval",
+     )
+
+
+ # Test with and without concurrency
+ @pytest.mark.parametrize("concurrency", [1, 25])
+ @pytest.mark.asyncio
+ async def test_async_eval_runner_status_updates(mock_eval_runner, concurrency):
+     # Real async testing!
+
+     job_count = 50
+     # Job objects are not the right type, but since we're mocking run_job, it doesn't matter
+     jobs = [{} for _ in range(job_count)]
+
+     # Mock collect_tasks to return our fake jobs
+     mock_eval_runner.collect_tasks = lambda: jobs
+
+     # Mock run_job to return True immediately
+     mock_eval_runner.run_job = AsyncMock(return_value=True)
+
+     # Expect the status updates in order, one for each job
+     expected_completed_count = 0
+     async for progress in mock_eval_runner.run(concurrency=concurrency):
+         assert progress.complete == expected_completed_count
+         expected_completed_count += 1
+         assert progress.errors == 0
+         assert progress.total == job_count
+
+     # Verify the last status update reported completion
+     assert expected_completed_count == job_count + 1
+
+     # Verify run_job was called for each job
+     assert mock_eval_runner.run_job.call_count == job_count
+
+
+ def test_collect_tasks_filtering(
+     mock_eval,
+     mock_eval_runner,
+     mock_task,
+     mock_eval_config,
+     data_source,
+     mock_run_config,
+ ):
+     """Test that tasks are properly filtered based on eval filters"""
+     tags = ["tag1", "tag2", "tag3"]
+     task_runs = []
+     for tag in tags:
+         # Create some task runs with different tags
+         task_run = TaskRun(
+             parent=mock_task,
+             input="test1",
+             input_source=data_source,
+             output=TaskOutput(
+                 output="test1",
+             ),
+             tags=[tag],
+         )
+         task_run.save_to_file()
+         task_runs.append(task_run)
+
+     mock_eval.eval_set_filter_id = "tag::tag1"
+     mock_eval.eval_configs_filter_id = "tag::tag2"
+
+     # Create a new runner of type task run eval
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=[mock_run_config],
+         eval_run_type="task_run_eval",
+     )
+     jobs = runner.collect_tasks()
+
+     # Should only get one job: the task run with tag1
+     assert len(jobs) == 1
+     job = jobs[0]
+     # The job should be the tag1 item, set up as a task run eval for mock_run_config
+     assert job.item.tags == ["tag1"]
+     assert job.task_run_config.id == mock_run_config.id
+     assert job.eval_config.id == mock_eval_config.id
+
+     # Change to an eval config set filter
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=None,
+         eval_run_type="eval_config_eval",
+     )
+     jobs = runner.collect_tasks()
+
+     # Should only get one eval config eval job, the one with tag2
+     assert len(jobs) == 1
+     job = jobs[0]
+     # The job should be the tag2 item, set up as an eval config eval for mock_eval_config
+     assert job.item.tags == ["tag2"]
+     assert job.eval_config.id == mock_eval_config.id
+     assert job.task_run_config is None
+
+     # Add a second task run config, and call a new runner with multiple run configs
+     rc = TaskRunConfig(
+         name="test2",
+         description="test2",
+         run_config_properties=RunConfigProperties(
+             model_name="gpt-4",
+             model_provider_name="openai",
+             prompt_id="simple_prompt_builder",
+         ),
+         parent=mock_task,
+     )
+     rc.save_to_file()
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=[mock_run_config, rc],
+         eval_run_type="task_run_eval",
+     )
+     jobs = runner.collect_tasks()
+     assert len(jobs) == 2
+     for job in jobs:
+         assert job.item.tags == ["tag1"]
+         assert job.task_run_config.id in [mock_run_config.id, rc.id]
+         assert job.eval_config.id == mock_eval_config.id
+     assert jobs[0].task_run_config.id != jobs[1].task_run_config.id
+
+     # Add a second eval config, and call a new runner with multiple eval configs
+     eval_config = EvalConfig(
+         name="test2",
+         model_name="gpt-4",
+         model_provider="openai",
+         parent=mock_eval,
+         properties={
+             "eval_steps": ["step1", "step2", "step3"],
+         },
+     )
+     eval_config.save_to_file()
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config, eval_config],
+         run_configs=None,
+         eval_run_type="eval_config_eval",
+     )
+     jobs = runner.collect_tasks()
+     # Check we get 2 jobs, one for each eval config
+     assert len(jobs) == 2
+     for job in jobs:
+         assert job.item.tags == ["tag2"]
+         assert job.eval_config.id in [mock_eval_config.id, eval_config.id]
+         assert job.task_run_config is None
+     assert jobs[0].eval_config.id != jobs[1].eval_config.id
+
+
+ def test_validate_same_task(
+     mock_eval_runner,
+     mock_task,
+     data_source,
+     tmp_path,
+     mock_eval_config,
+     mock_run_config,
+ ):
+     # second eval config has a different task
+     eval_config = EvalConfig(
+         name="test2",
+         model_name="gpt-4",
+         model_provider="openai",
+         properties={
+             "eval_steps": ["step1", "step2", "step3"],
+         },
+         parent=Eval(
+             name="test",
+             description="test",
+             eval_set_filter_id="all",
+             eval_configs_filter_id="all",
+             output_scores=[
+                 EvalOutputScore(
+                     name="Accuracy",
+                     instruction="Check if the output is accurate",
+                     type=TaskOutputRatingType.pass_fail,
+                 ),
+             ],
+             parent=Task(
+                 name="test",
+                 description="test",
+                 instruction="do the thing",
+             ),
+         ),
+     )
+
+     with pytest.raises(
+         ValueError, match="All eval configs must have the same parent eval"
+     ):
+         EvalRunner(
+             eval_configs=[mock_eval_config, eval_config],
+             run_configs=[mock_run_config],
+             eval_run_type="eval_config_eval",
+         )
+
+
+ def test_collect_tasks_excludes_already_run_task_run_eval(
+     mock_eval_runner, mock_task, data_source, mock_eval_config, mock_run_config
+ ):
+     """Test that already run tasks are excluded"""
+     # Create a task run
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["tag1"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     # Prior to any eval runs, we should get the task run
+     jobs = mock_eval_runner.collect_tasks()
+     assert len(jobs) == 1
+     assert jobs[0].item.id == task_run.id
+     assert jobs[0].task_run_config.id == mock_run_config.id
+     assert jobs[0].eval_config.id == mock_eval_config.id
+
+     # Create an eval run for this task
+     EvalRun(
+         parent=mock_eval_config,
+         dataset_id=task_run.id,
+         task_run_config_id=mock_run_config.id,
+         input="test",
+         output="test",
+         scores={"accuracy": 1.0},
+     ).save_to_file()
+
+     # Set filter to match the task
+     mock_eval_runner.eval.eval_set_filter_id = "tag::tag1"
+     mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent"
+
+     jobs = mock_eval_runner.collect_tasks()
+
+     # Should get no jobs since the task was already run
+     assert len(jobs) == 0
+
+
+ def test_collect_tasks_excludes_already_run_eval_config_eval(
+     mock_task, data_source, mock_eval_config, mock_eval, mock_run_config
+ ):
+     """Test that already run tasks are excluded"""
+     # Create a task run
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["tag1"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     mock_eval.eval_set_filter_id = "tag::nonexistent"
+     mock_eval.eval_configs_filter_id = "tag::tag1"
+     mock_eval.save_to_file()
+
+     # Prior to any eval runs, we should get 1 job for the eval config
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=None,
+         eval_run_type="eval_config_eval",
+     )
+     jobs = runner.collect_tasks()
+     assert len(jobs) == 1
+     assert jobs[0].item.id == task_run.id
+     assert jobs[0].eval_config.id == mock_eval_config.id
+     assert jobs[0].task_run_config is None
+
+     # Create an eval run for this eval config task run pair, so now we should get no jobs (already run)
+     EvalRun(
+         parent=mock_eval_config,
+         dataset_id=task_run.id,
+         task_run_config_id=None,
+         eval_config_eval=True,
+         input="test",
+         output="test",
+         scores={
+             "accuracy": 1.0,
+         },
+     ).save_to_file()
+
+     jobs = runner.collect_tasks()
+
+     # Should get no jobs since the task was already run
+     assert len(jobs) == 0
+
+
+ def test_collect_tasks_multiple_run_configs(
+     mock_eval_runner, mock_task, data_source, mock_run_config
+ ):
+     """Test handling multiple run configs"""
+     # Create a task run
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["tag1"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     # Add another run config
+     second_config = TaskRunConfig(
+         name="test2",
+         description="test2",
+         run_config_properties=RunConfigProperties(
+             model_name="gpt-3.5",
+             model_provider_name="openai",
+             prompt_id="simple_prompt_builder",
+         ),
+         parent=mock_task,
+     )
+     second_config.save_to_file()
+     mock_eval_runner.run_configs.append(second_config)
+
+     # Set filter to match the task
+     mock_eval_runner.eval.eval_set_filter_id = "tag::tag1"
+
+     jobs = mock_eval_runner.collect_tasks()
+
+     # Should get 2 jobs, one for each config
+     assert len(jobs) == 2
+     assert {job.task_run_config.id for job in jobs} == {
+         second_config.id,
+         mock_run_config.id,
+     }
+
+
+ def test_collect_tasks_empty_cases(mock_eval_runner, mock_task, data_source):
+     """Test empty cases - no matching tasks or no tasks at all"""
+     # Set filter that won't match anything
+     mock_eval_runner.eval.eval_set_filter_id = "tag::nonexistent"
+     mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent"
+
+     jobs = mock_eval_runner.collect_tasks()
+     assert len(jobs) == 0
+
+     # Create task run with non-matching tag
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["other_tag"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     jobs = mock_eval_runner.collect_tasks()
+     assert len(jobs) == 0
+
+
463
+ async def test_run_job_success_task_run_eval(
464
+ mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
465
+ ):
466
+ # Create a task run to evaluate
467
+ task_run = TaskRun(
468
+ parent=mock_task,
469
+ input="test input",
470
+ input_source=data_source,
471
+ output=TaskOutput(output="test output"),
472
+ )
473
+ task_run.save_to_file()
474
+
475
+ # Create eval job
476
+ job = EvalJob(
477
+ item=task_run,
478
+ task_run_config=mock_run_config,
479
+ type="task_run_eval",
480
+ eval_config=mock_eval_config,
481
+ )
482
+
483
+ # Mock the evaluator
484
+ mock_result_run = TaskRun(
485
+ input="test input",
486
+ input_source=data_source,
487
+ output=TaskOutput(output="evaluated output"),
488
+ intermediate_outputs={"intermediate_output": "intermediate output"},
489
+ )
490
+ mock_scores = {"accuracy": 0.95}
491
+
492
+ class MockEvaluator(BaseEval):
493
+ async def run_task_and_eval(self, input_text):
494
+ return (
495
+ mock_result_run,
496
+ mock_scores,
497
+ {"intermediate_output": "intermediate output"},
498
+ )
499
+
500
+ with patch(
501
+ "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
502
+ return_value=lambda *args: MockEvaluator(*args),
503
+ ):
504
+ success = await mock_eval_runner.run_job(job)
505
+
506
+ assert success is True
507
+
508
+ # Verify eval run was saved
509
+ eval_runs = mock_eval_config.runs()
510
+ assert len(eval_runs) == 1
511
+ saved_run = eval_runs[0]
512
+ assert saved_run.dataset_id == task_run.id
513
+ assert saved_run.task_run_config_id == mock_run_config.id
514
+ assert saved_run.scores == mock_scores
515
+ assert saved_run.input == "test input"
516
+ assert saved_run.output == "evaluated output"
517
+ assert saved_run.intermediate_outputs == {
518
+ "intermediate_output": "intermediate output"
519
+ }
520
+ assert saved_run.parent_eval_config().id == mock_eval_config.id
521
+ assert saved_run.eval_config_eval is False
522
+
523
+
+ @pytest.mark.asyncio
+ async def test_run_job_success_eval_config_eval(
+     mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
+ ):
+     # Create a task run to evaluate
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="test output"),
+     )
+     task_run.save_to_file()
+
+     # Create eval job
+     job = EvalJob(
+         item=task_run,
+         type="eval_config_eval",
+         eval_config=mock_eval_config,
+     )
+
+     # Mock the evaluator
+     mock_result_run = TaskRun(
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="evaluated output"),
+     )
+     mock_scores: EvalScores = {"accuracy": 0.95}
+
+     class MockEvaluator(BaseEval):
+         async def run_task_and_eval(self, input_text):
+             raise ValueError("Attempted to run task and eval for a config eval")
+
+         async def run_eval(
+             self, task_run: TaskRun
+         ) -> tuple[EvalScores, Dict[str, str] | None]:
+             return mock_scores, {"intermediate_output": "intermediate output"}
+
+     with patch(
+         "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
+         return_value=lambda *args: MockEvaluator(*args),
+     ):
+         success = await mock_eval_runner.run_job(job)
+
+     assert success is True
+
+     # Verify eval run was saved
+     eval_runs = mock_eval_config.runs()
+     assert len(eval_runs) == 1
+     saved_run = eval_runs[0]
+     assert saved_run.dataset_id == task_run.id
+     assert saved_run.task_run_config_id is None
+     assert saved_run.scores == mock_scores
+     assert saved_run.input == "test input"
+     assert saved_run.output == "test output"
+     assert saved_run.parent_eval_config().id == mock_eval_config.id
+     assert saved_run.eval_config_eval is True
+
+
+ @pytest.mark.asyncio
+ async def test_run_job_invalid_evaluator(
+     mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
+ ):
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="test output"),
+     )
+     task_run.save_to_file()
+     job = EvalJob(
+         item=task_run,
+         task_run_config=mock_run_config,
+         type="task_run_eval",
+         eval_config=mock_eval_config,
+     )
+
+     # Return an invalid evaluator type
+     with patch(
+         "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
+         return_value=lambda *args: object(),
+     ):
+         success = await mock_eval_runner.run_job(job)
+
+     assert success is False
+     assert len(mock_eval_config.runs()) == 0
+
+
+ @pytest.mark.asyncio
+ async def test_run_job_evaluator_error(
+     mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
+ ):
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="test output"),
+     )
+     task_run.save_to_file()
+     job = EvalJob(
+         item=task_run,
+         task_run_config=mock_run_config,
+         type="task_run_eval",
+         eval_config=mock_eval_config,
+     )
+
+     class ErrorEvaluator(BaseEval):
+         async def run_task_and_eval(self, input_text):
+             raise ValueError("Evaluation failed")
+
+     with patch(
+         "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
+         return_value=lambda *args: ErrorEvaluator(*args),
+     ):
+         success = await mock_eval_runner.run_job(job)
+
+     assert success is False
+     assert len(mock_eval_config.runs()) == 0
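
Taken together, the tests above exercise the public surface of the new EvalRunner: it is constructed from a list of eval configs plus, for task-run evals, a list of task run configs; collect_tasks() expands the eval's dataset filters into pending EvalJob objects (skipping items that already have a saved EvalRun); and run(concurrency=...) is an async generator that yields progress updates. A minimal usage sketch based only on the calls shown in this test file (the eval_config and run_config objects are assumed to already exist and be saved, as in the fixtures above):

from kiln_ai.adapters.eval.eval_runner import EvalRunner


async def run_task_run_eval(eval_config, run_config):
    # Mirror the mock_eval_runner fixture: one eval config, one task run config.
    runner = EvalRunner(
        eval_configs=[eval_config],
        run_configs=[run_config],
        eval_run_type="task_run_eval",
    )
    # run() yields progress objects with complete/errors/total counts; completed
    # results are saved as EvalRun children of the eval config (see the run_job tests).
    async for progress in runner.run(concurrency=25):
        print(f"{progress.complete}/{progress.total} done, {progress.errors} errors")

# Drive it with asyncio.run(run_task_run_eval(my_eval_config, my_run_config)),
# where my_eval_config and my_run_config are hypothetical saved Kiln objects.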