kiln-ai 0.11.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kiln-ai might be problematic.

Files changed (80)
  1. kiln_ai/adapters/__init__.py +4 -0
  2. kiln_ai/adapters/adapter_registry.py +163 -39
  3. kiln_ai/adapters/data_gen/data_gen_task.py +18 -0
  4. kiln_ai/adapters/eval/__init__.py +28 -0
  5. kiln_ai/adapters/eval/base_eval.py +164 -0
  6. kiln_ai/adapters/eval/eval_runner.py +270 -0
  7. kiln_ai/adapters/eval/g_eval.py +368 -0
  8. kiln_ai/adapters/eval/registry.py +16 -0
  9. kiln_ai/adapters/eval/test_base_eval.py +325 -0
  10. kiln_ai/adapters/eval/test_eval_runner.py +641 -0
  11. kiln_ai/adapters/eval/test_g_eval.py +498 -0
  12. kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +16 -2
  14. kiln_ai/adapters/fine_tune/finetune_registry.py +2 -0
  15. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +4 -1
  16. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +1 -1
  17. kiln_ai/adapters/fine_tune/test_openai_finetune.py +1 -1
  18. kiln_ai/adapters/fine_tune/test_together_finetune.py +531 -0
  19. kiln_ai/adapters/fine_tune/together_finetune.py +325 -0
  20. kiln_ai/adapters/ml_model_list.py +758 -163
  21. kiln_ai/adapters/model_adapters/__init__.py +2 -4
  22. kiln_ai/adapters/model_adapters/base_adapter.py +61 -43
  23. kiln_ai/adapters/model_adapters/litellm_adapter.py +391 -0
  24. kiln_ai/adapters/model_adapters/litellm_config.py +13 -0
  25. kiln_ai/adapters/model_adapters/test_base_adapter.py +22 -13
  26. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +407 -0
  27. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -19
  28. kiln_ai/adapters/model_adapters/test_structured_output.py +59 -35
  29. kiln_ai/adapters/ollama_tools.py +3 -3
  30. kiln_ai/adapters/parsers/r1_parser.py +19 -14
  31. kiln_ai/adapters/parsers/test_r1_parser.py +17 -5
  32. kiln_ai/adapters/prompt_builders.py +80 -42
  33. kiln_ai/adapters/provider_tools.py +50 -58
  34. kiln_ai/adapters/repair/repair_task.py +9 -21
  35. kiln_ai/adapters/repair/test_repair_task.py +6 -6
  36. kiln_ai/adapters/run_output.py +3 -0
  37. kiln_ai/adapters/test_adapter_registry.py +26 -29
  38. kiln_ai/adapters/test_generate_docs.py +4 -4
  39. kiln_ai/adapters/test_ollama_tools.py +0 -1
  40. kiln_ai/adapters/test_prompt_adaptors.py +47 -33
  41. kiln_ai/adapters/test_prompt_builders.py +91 -31
  42. kiln_ai/adapters/test_provider_tools.py +26 -81
  43. kiln_ai/datamodel/__init__.py +50 -952
  44. kiln_ai/datamodel/basemodel.py +2 -0
  45. kiln_ai/datamodel/datamodel_enums.py +60 -0
  46. kiln_ai/datamodel/dataset_filters.py +114 -0
  47. kiln_ai/datamodel/dataset_split.py +170 -0
  48. kiln_ai/datamodel/eval.py +298 -0
  49. kiln_ai/datamodel/finetune.py +105 -0
  50. kiln_ai/datamodel/json_schema.py +7 -1
  51. kiln_ai/datamodel/project.py +23 -0
  52. kiln_ai/datamodel/prompt.py +37 -0
  53. kiln_ai/datamodel/prompt_id.py +83 -0
  54. kiln_ai/datamodel/strict_mode.py +24 -0
  55. kiln_ai/datamodel/task.py +181 -0
  56. kiln_ai/datamodel/task_output.py +328 -0
  57. kiln_ai/datamodel/task_run.py +164 -0
  58. kiln_ai/datamodel/test_basemodel.py +19 -11
  59. kiln_ai/datamodel/test_dataset_filters.py +71 -0
  60. kiln_ai/datamodel/test_dataset_split.py +32 -8
  61. kiln_ai/datamodel/test_datasource.py +22 -2
  62. kiln_ai/datamodel/test_eval_model.py +635 -0
  63. kiln_ai/datamodel/test_example_models.py +9 -13
  64. kiln_ai/datamodel/test_json_schema.py +23 -0
  65. kiln_ai/datamodel/test_models.py +2 -2
  66. kiln_ai/datamodel/test_prompt_id.py +129 -0
  67. kiln_ai/datamodel/test_task.py +159 -0
  68. kiln_ai/utils/config.py +43 -1
  69. kiln_ai/utils/dataset_import.py +232 -0
  70. kiln_ai/utils/test_dataset_import.py +596 -0
  71. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/METADATA +86 -6
  72. kiln_ai-0.13.0.dist-info/RECORD +103 -0
  73. kiln_ai/adapters/model_adapters/langchain_adapters.py +0 -302
  74. kiln_ai/adapters/model_adapters/openai_compatible_config.py +0 -11
  75. kiln_ai/adapters/model_adapters/openai_model_adapter.py +0 -246
  76. kiln_ai/adapters/model_adapters/test_langchain_adapter.py +0 -350
  77. kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +0 -225
  78. kiln_ai-0.11.1.dist-info/RECORD +0 -76
  79. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/WHEEL +0 -0
  80. {kiln_ai-0.11.1.dist-info → kiln_ai-0.13.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/test_eval_runner.py
@@ -0,0 +1,641 @@
+ from typing import Dict
+ from unittest.mock import AsyncMock, patch
+
+ import pytest
+
+ from kiln_ai.adapters.eval.base_eval import BaseEval
+ from kiln_ai.adapters.eval.eval_runner import EvalJob, EvalRunner
+ from kiln_ai.datamodel import (
+     DataSource,
+     DataSourceType,
+     Task,
+     TaskOutput,
+     TaskOutputRatingType,
+     TaskRun,
+ )
+ from kiln_ai.datamodel.eval import (
+     Eval,
+     EvalConfig,
+     EvalOutputScore,
+     EvalRun,
+     EvalScores,
+ )
+ from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
+
+
+ @pytest.fixture
+ def mock_task(tmp_path):
+     task = Task(
+         name="test",
+         description="test",
+         instruction="do the thing",
+         path=tmp_path / "task.kiln",
+     )
+     task.save_to_file()
+     return task
+
+
+ @pytest.fixture
+ def mock_eval(mock_task):
+     eval = Eval(
+         id="test",
+         name="test",
+         description="test",
+         eval_set_filter_id="all",
+         eval_configs_filter_id="all",
+         output_scores=[
+             EvalOutputScore(
+                 name="Accuracy",
+                 instruction="Check if the output is accurate",
+                 type=TaskOutputRatingType.pass_fail,
+             ),
+         ],
+         parent=mock_task,
+     )
+     eval.save_to_file()
+     return eval
+
+
+ @pytest.fixture
+ def data_source():
+     return DataSource(
+         type=DataSourceType.synthetic,
+         properties={
+             "model_name": "gpt-4",
+             "model_provider": "openai",
+             "adapter_name": "test_adapter",
+         },
+     )
+
+
+ @pytest.fixture
+ def mock_eval_config(mock_eval):
+     eval_config = EvalConfig(
+         name="test",
+         model_name="gpt-4",
+         model_provider="openai",
+         parent=mock_eval,
+         properties={
+             "eval_steps": ["step1", "step2", "step3"],
+         },
+     )
+     eval_config.save_to_file()
+     return eval_config
+
+
+ @pytest.fixture
+ def mock_run_config(
+     mock_task,
+ ):
+     rc = TaskRunConfig(
+         name="test",
+         description="test",
+         run_config_properties=RunConfigProperties(
+             model_name="gpt-4",
+             model_provider_name="openai",
+             prompt_id="simple_prompt_builder",
+         ),
+         parent=mock_task,
+     )
+     rc.save_to_file()
+     return rc
+
+
+ @pytest.fixture
+ def mock_eval_runner(mock_eval, mock_task, mock_eval_config, mock_run_config):
+     return EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=[mock_run_config],
+         eval_run_type="task_run_eval",
+     )
+
+
+ # Test with and without concurrency
+ @pytest.mark.parametrize("concurrency", [1, 25])
+ @pytest.mark.asyncio
+ async def test_async_eval_runner_status_updates(mock_eval_runner, concurrency):
+     # Real async testing!
+
+     job_count = 50
+     # Job objects are not the right type, but since we're mocking run_job, it doesn't matter
+     jobs = [{} for _ in range(job_count)]
+
+     # Mock collect_tasks to return our fake jobs
+     mock_eval_runner.collect_tasks = lambda: jobs
+
+     # Mock run_job to return True immediately
+     mock_eval_runner.run_job = AsyncMock(return_value=True)
+
+     # Expect the status updates in order, and 1 for each job
+     expected_completed_count = 0
+     async for progress in mock_eval_runner.run(concurrency=concurrency):
+         assert progress.complete == expected_completed_count
+         expected_completed_count += 1
+         assert progress.errors == 0
+         assert progress.total == job_count
+
+     # Verify last status update was complete
+     assert expected_completed_count == job_count + 1
+
+     # Verify run_job was called for each job
+     assert mock_eval_runner.run_job.call_count == job_count
+
+
+ def test_collect_tasks_filtering(
+     mock_eval,
+     mock_eval_runner,
+     mock_task,
+     mock_eval_config,
+     data_source,
+     mock_run_config,
+ ):
+     """Test that tasks are properly filtered based on eval filters"""
+     tags = ["tag1", "tag2", "tag3"]
+     task_runs = []
+     for tag in tags:
+         # Create some task runs with different tags
+         task_run = TaskRun(
+             parent=mock_task,
+             input="test1",
+             input_source=data_source,
+             output=TaskOutput(
+                 output="test1",
+             ),
+             tags=[tag],
+         )
+         task_run.save_to_file()
+         task_runs.append(task_run)
+
+     mock_eval.eval_set_filter_id = "tag::tag1"
+     mock_eval.eval_configs_filter_id = "tag::tag2"
+
+     # Create a new runner of type task run eval
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=[mock_run_config],
+         eval_run_type="task_run_eval",
+     )
+     jobs = runner.collect_tasks()
+
+     # Should only get task_run1 jobs, the one with tag1
+     assert len(jobs) == 1
+     job = jobs[0]
+     # job should be the tag1 item, and set up as a task run eval for mock_run_config
+     assert job.item.tags == ["tag1"]
+     assert job.task_run_config.id == mock_run_config.id
+     assert job.eval_config.id == mock_eval_config.id
+
+     # Change to an eval config set filter
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=None,
+         eval_run_type="eval_config_eval",
+     )
+     jobs = runner.collect_tasks()
+
+     # Should only get eval_config1 jobs
+     assert len(jobs) == 1
+     job = jobs[0]
+     # job should be the tag2 item, and set up as an eval config eval for mock_eval_config
+     assert job.item.tags == ["tag2"]
+     assert job.eval_config.id == mock_eval_config.id
+     assert job.task_run_config is None
+
+     # Add a second task run config, and call a new runner with multiple run configs
+     rc = TaskRunConfig(
+         name="test2",
+         description="test2",
+         run_config_properties=RunConfigProperties(
+             model_name="gpt-4",
+             model_provider_name="openai",
+             prompt_id="simple_prompt_builder",
+         ),
+         parent=mock_task,
+     )
+     rc.save_to_file()
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=[mock_run_config, rc],
+         eval_run_type="task_run_eval",
+     )
+     jobs = runner.collect_tasks()
+     assert len(jobs) == 2
+     for job in jobs:
+         assert job.item.tags == ["tag1"]
+         assert job.task_run_config.id in [mock_run_config.id, rc.id]
+         assert job.eval_config.id == mock_eval_config.id
+     assert jobs[0].task_run_config.id != jobs[1].task_run_config.id
+
+     # Add a second eval config, and call a new runner with multiple eval configs
+     eval_config = EvalConfig(
+         name="test2",
+         model_name="gpt-4",
+         model_provider="openai",
+         parent=mock_eval,
+         properties={
+             "eval_steps": ["step1", "step2", "step3"],
+         },
+     )
+     eval_config.save_to_file()
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config, eval_config],
+         run_configs=None,
+         eval_run_type="eval_config_eval",
+     )
+     jobs = runner.collect_tasks()
+     # Check we get 2 jobs, one for each eval config
+     assert len(jobs) == 2
+     for job in jobs:
+         assert job.item.tags == ["tag2"]
+         assert job.eval_config.id in [mock_eval_config.id, eval_config.id]
+         assert job.task_run_config is None
+     assert jobs[0].eval_config.id != jobs[1].eval_config.id
+
+
+ def test_validate_same_task(
+     mock_eval_runner,
+     mock_task,
+     data_source,
+     tmp_path,
+     mock_eval_config,
+     mock_run_config,
+ ):
+     # second eval config has a different task
+     eval_config = EvalConfig(
+         name="test2",
+         model_name="gpt-4",
+         model_provider="openai",
+         properties={
+             "eval_steps": ["step1", "step2", "step3"],
+         },
+         parent=Eval(
+             name="test",
+             description="test",
+             eval_set_filter_id="all",
+             eval_configs_filter_id="all",
+             output_scores=[
+                 EvalOutputScore(
+                     name="Accuracy",
+                     instruction="Check if the output is accurate",
+                     type=TaskOutputRatingType.pass_fail,
+                 ),
+             ],
+             parent=Task(
+                 name="test",
+                 description="test",
+                 instruction="do the thing",
+             ),
+         ),
+     )
+
+     with pytest.raises(
+         ValueError, match="All eval configs must have the same parent eval"
+     ):
+         EvalRunner(
+             eval_configs=[mock_eval_config, eval_config],
+             run_configs=[mock_run_config],
+             eval_run_type="eval_config_eval",
+         )
+
+
+ def test_collect_tasks_excludes_already_run_task_run_eval(
+     mock_eval_runner, mock_task, data_source, mock_eval_config, mock_run_config
+ ):
+     """Test that already run tasks are excluded"""
+     # Create a task run
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["tag1"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     # Prior to any eval runs, we should get the task run
+     jobs = mock_eval_runner.collect_tasks()
+     assert len(jobs) == 1
+     assert jobs[0].item.id == task_run.id
+     assert jobs[0].task_run_config.id == mock_run_config.id
+     assert jobs[0].eval_config.id == mock_eval_config.id
+
+     # Create an eval run for this task
+     EvalRun(
+         parent=mock_eval_config,
+         dataset_id=task_run.id,
+         task_run_config_id=mock_run_config.id,
+         input="test",
+         output="test",
+         scores={"accuracy": 1.0},
+     ).save_to_file()
+
+     # Set filter to match the task
+     mock_eval_runner.eval.eval_set_filter_id = "tag::tag1"
+     mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent"
+
+     jobs = mock_eval_runner.collect_tasks()
+
+     # Should get no jobs since the task was already run
+     assert len(jobs) == 0
+
+
+ def test_collect_tasks_excludes_already_run_eval_config_eval(
+     mock_task, data_source, mock_eval_config, mock_eval, mock_run_config
+ ):
+     """Test that already run tasks are excluded"""
+     # Create a task run
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["tag1"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     mock_eval.eval_set_filter_id = "tag::nonexistent"
+     mock_eval.eval_configs_filter_id = "tag::tag1"
+     mock_eval.save_to_file()
+
+     # Prior to any eval runs, we should get 1 job for the eval config
+     runner = EvalRunner(
+         eval_configs=[mock_eval_config],
+         run_configs=None,
+         eval_run_type="eval_config_eval",
+     )
+     jobs = runner.collect_tasks()
+     assert len(jobs) == 1
+     assert jobs[0].item.id == task_run.id
+     assert jobs[0].eval_config.id == mock_eval_config.id
+     assert jobs[0].task_run_config is None
+
+     # Create an eval run for this eval config task run pair, so now we should get no jobs (already run)
+     EvalRun(
+         parent=mock_eval_config,
+         dataset_id=task_run.id,
+         task_run_config_id=None,
+         eval_config_eval=True,
+         input="test",
+         output="test",
+         scores={
+             "accuracy": 1.0,
+         },
+     ).save_to_file()
+
+     jobs = runner.collect_tasks()
+
+     # Should get no jobs since the task was already run
+     assert len(jobs) == 0
+
+
+ def test_collect_tasks_multiple_run_configs(
+     mock_eval_runner, mock_task, data_source, mock_run_config
+ ):
+     """Test handling multiple run configs"""
+     # Create a task run
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["tag1"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     # Add another run config
+     second_config = TaskRunConfig(
+         name="test2",
+         description="test2",
+         run_config_properties=RunConfigProperties(
+             model_name="gpt-3.5",
+             model_provider_name="openai",
+             prompt_id="simple_prompt_builder",
+         ),
+         parent=mock_task,
+     )
+     second_config.save_to_file()
+     mock_eval_runner.run_configs.append(second_config)
+
+     # Set filter to match the task
+     mock_eval_runner.eval.eval_set_filter_id = "tag::tag1"
+
+     jobs = mock_eval_runner.collect_tasks()
+
+     # Should get 2 jobs, one for each config
+     assert len(jobs) == 2
+     assert {job.task_run_config.id for job in jobs} == {
+         second_config.id,
+         mock_run_config.id,
+     }
+
+
+ def test_collect_tasks_empty_cases(mock_eval_runner, mock_task, data_source):
+     """Test empty cases - no matching tasks or no tasks at all"""
+     # Set filter that won't match anything
+     mock_eval_runner.eval.eval_set_filter_id = "tag::nonexistent"
+     mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent"
+
+     jobs = mock_eval_runner.collect_tasks()
+     assert len(jobs) == 0
+
+     # Create task run with non-matching tag
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test",
+         input_source=data_source,
+         tags=["other_tag"],
+         output=TaskOutput(
+             output="test",
+         ),
+     )
+     task_run.save_to_file()
+
+     jobs = mock_eval_runner.collect_tasks()
+     assert len(jobs) == 0
+
+
+ @pytest.mark.asyncio
+ async def test_run_job_success_task_run_eval(
+     mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
+ ):
+     # Create a task run to evaluate
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="test output"),
+     )
+     task_run.save_to_file()
+
+     # Create eval job
+     job = EvalJob(
+         item=task_run,
+         task_run_config=mock_run_config,
+         type="task_run_eval",
+         eval_config=mock_eval_config,
+     )
+
+     # Mock the evaluator
+     mock_result_run = TaskRun(
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="evaluated output"),
+         intermediate_outputs={"intermediate_output": "intermediate output"},
+     )
+     mock_scores = {"accuracy": 0.95}
+
+     class MockEvaluator(BaseEval):
+         async def run_task_and_eval(self, input_text):
+             return (
+                 mock_result_run,
+                 mock_scores,
+                 {"intermediate_output": "intermediate output"},
+             )
+
+     with patch(
+         "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
+         return_value=lambda *args: MockEvaluator(*args),
+     ):
+         success = await mock_eval_runner.run_job(job)
+
+     assert success is True
+
+     # Verify eval run was saved
+     eval_runs = mock_eval_config.runs()
+     assert len(eval_runs) == 1
+     saved_run = eval_runs[0]
+     assert saved_run.dataset_id == task_run.id
+     assert saved_run.task_run_config_id == mock_run_config.id
+     assert saved_run.scores == mock_scores
+     assert saved_run.input == "test input"
+     assert saved_run.output == "evaluated output"
+     assert saved_run.intermediate_outputs == {
+         "intermediate_output": "intermediate output"
+     }
+     assert saved_run.parent_eval_config().id == mock_eval_config.id
+     assert saved_run.eval_config_eval is False
+
+
+ @pytest.mark.asyncio
+ async def test_run_job_success_eval_config_eval(
+     mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
+ ):
+     # Create a task run to evaluate
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="test output"),
+     )
+     task_run.save_to_file()
+
+     # Create eval job
+     job = EvalJob(
+         item=task_run,
+         type="eval_config_eval",
+         eval_config=mock_eval_config,
+     )
+
+     # Mock the evaluator
+     mock_result_run = TaskRun(
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="evaluated output"),
+     )
+     mock_scores: EvalScores = {"accuracy": 0.95}
+
+     class MockEvaluator(BaseEval):
+         async def run_task_and_eval(self, input_text):
+             raise ValueError("Attempted to run task and eval for a config eval")
+
+         async def run_eval(
+             self, task_run: TaskRun
+         ) -> tuple[EvalScores, Dict[str, str] | None]:
+             return mock_scores, {"intermediate_output": "intermediate output"}
+
+     with patch(
+         "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
+         return_value=lambda *args: MockEvaluator(*args),
+     ):
+         success = await mock_eval_runner.run_job(job)
+
+     assert success is True
+
+     # Verify eval run was saved
+     eval_runs = mock_eval_config.runs()
+     assert len(eval_runs) == 1
+     saved_run = eval_runs[0]
+     assert saved_run.dataset_id == task_run.id
+     assert saved_run.task_run_config_id is None
+     assert saved_run.scores == mock_scores
+     assert saved_run.input == "test input"
+     assert saved_run.output == "test output"
+     assert saved_run.parent_eval_config().id == mock_eval_config.id
+     assert saved_run.eval_config_eval is True
+
+
+ @pytest.mark.asyncio
+ async def test_run_job_invalid_evaluator(
+     mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
+ ):
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="test output"),
+     )
+     task_run.save_to_file()
+     job = EvalJob(
+         item=task_run,
+         task_run_config=mock_run_config,
+         type="task_run_eval",
+         eval_config=mock_eval_config,
+     )
+
+     # Return an invalid evaluator type
+     with patch(
+         "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
+         return_value=lambda *args: object(),
+     ):
+         success = await mock_eval_runner.run_job(job)
+
+     assert success is False
+     assert len(mock_eval_config.runs()) == 0
+
+
+ @pytest.mark.asyncio
+ async def test_run_job_evaluator_error(
+     mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
+ ):
+     task_run = TaskRun(
+         parent=mock_task,
+         input="test input",
+         input_source=data_source,
+         output=TaskOutput(output="test output"),
+     )
+     task_run.save_to_file()
+     job = EvalJob(
+         item=task_run,
+         task_run_config=mock_run_config,
+         type="task_run_eval",
+         eval_config=mock_eval_config,
+     )
+
+     class ErrorEvaluator(BaseEval):
+         async def run_task_and_eval(self, input_text):
+             raise ValueError("Evaluation failed")
+
+     with patch(
+         "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
+         return_value=lambda *args: ErrorEvaluator(*args),
+     ):
+         success = await mock_eval_runner.run_job(job)
+
+     assert success is False
+     assert len(mock_eval_config.runs()) == 0