kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/__init__.py +7 -7
- kiln_ai/adapters/adapter_registry.py +81 -10
- kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
- kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
- kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
- kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
- kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
- kiln_ai/adapters/ml_model_list.py +434 -93
- kiln_ai/adapters/model_adapters/__init__.py +18 -0
- kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
- kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
- kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
- kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/parsers/__init__.py +10 -0
- kiln_ai/adapters/parsers/base_parser.py +12 -0
- kiln_ai/adapters/parsers/json_parser.py +37 -0
- kiln_ai/adapters/parsers/parser_registry.py +19 -0
- kiln_ai/adapters/parsers/r1_parser.py +69 -0
- kiln_ai/adapters/parsers/test_json_parser.py +81 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
- kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
- kiln_ai/adapters/prompt_builders.py +193 -49
- kiln_ai/adapters/provider_tools.py +91 -36
- kiln_ai/adapters/repair/repair_task.py +18 -19
- kiln_ai/adapters/repair/test_repair_task.py +7 -7
- kiln_ai/adapters/run_output.py +11 -0
- kiln_ai/adapters/test_adapter_registry.py +177 -0
- kiln_ai/adapters/test_generate_docs.py +69 -0
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +25 -18
- kiln_ai/adapters/test_prompt_builders.py +265 -44
- kiln_ai/adapters/test_provider_tools.py +268 -46
- kiln_ai/datamodel/__init__.py +51 -772
- kiln_ai/datamodel/basemodel.py +31 -11
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +14 -3
- kiln_ai/datamodel/model_cache.py +8 -3
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +80 -2
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +127 -6
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +34 -17
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_model_cache.py +24 -0
- kiln_ai/datamodel/test_model_perf.py +125 -0
- kiln_ai/datamodel/test_models.py +131 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- kiln_ai/utils/exhaustive_error.py +6 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai/adapters/base_adapter.py +0 -191
- kiln_ai/adapters/langchain_adapters.py +0 -256
- kiln_ai-0.8.1.dist-info/RECORD +0 -58
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
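
The largest addition in this release is the new eval subsystem (kiln_ai/adapters/eval/ plus kiln_ai/datamodel/eval.py). As a rough sketch of how the new runner is driven — inferred only from the test file reproduced below, not from separate documentation — an EvalRunner is built from eval configs plus optional run configs and streams progress updates as it works; the surrounding setup (a saved Eval, EvalConfig, and TaskRunConfig) is assumed here.

# Minimal usage sketch inferred from test_eval_runner.py below; the wrapper
# function name and the pre-existing eval_config/run_config objects are assumptions.
from kiln_ai.adapters.eval.eval_runner import EvalRunner

async def evaluate(eval_config, run_config):
    runner = EvalRunner(
        eval_configs=[eval_config],   # EvalConfig objects sharing one parent Eval
        run_configs=[run_config],     # TaskRunConfig objects; None for "eval_config_eval"
        eval_run_type="task_run_eval",
    )
    # run() yields progress objects exposing complete/errors/total counts
    async for progress in runner.run(concurrency=25):
        print(f"{progress.complete}/{progress.total} done, {progress.errors} errors")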
kiln_ai/adapters/eval/test_eval_runner.py (new file)
@@ -0,0 +1,640 @@
from typing import Dict
from unittest.mock import AsyncMock, patch

import pytest
from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.adapters.eval.eval_runner import EvalJob, EvalRunner
from kiln_ai.datamodel import (
    DataSource,
    DataSourceType,
    Task,
    TaskOutput,
    TaskOutputRatingType,
    TaskRun,
)
from kiln_ai.datamodel.eval import (
    Eval,
    EvalConfig,
    EvalOutputScore,
    EvalRun,
    EvalScores,
)
from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig


@pytest.fixture
def mock_task(tmp_path):
    task = Task(
        name="test",
        description="test",
        instruction="do the thing",
        path=tmp_path / "task.kiln",
    )
    task.save_to_file()
    return task


@pytest.fixture
def mock_eval(mock_task):
    eval = Eval(
        id="test",
        name="test",
        description="test",
        eval_set_filter_id="all",
        eval_configs_filter_id="all",
        output_scores=[
            EvalOutputScore(
                name="Accuracy",
                instruction="Check if the output is accurate",
                type=TaskOutputRatingType.pass_fail,
            ),
        ],
        parent=mock_task,
    )
    eval.save_to_file()
    return eval


@pytest.fixture
def data_source():
    return DataSource(
        type=DataSourceType.synthetic,
        properties={
            "model_name": "gpt-4",
            "model_provider": "openai",
            "adapter_name": "test_adapter",
        },
    )


@pytest.fixture
def mock_eval_config(mock_eval):
    eval_config = EvalConfig(
        name="test",
        model_name="gpt-4",
        model_provider="openai",
        parent=mock_eval,
        properties={
            "eval_steps": ["step1", "step2", "step3"],
        },
    )
    eval_config.save_to_file()
    return eval_config


@pytest.fixture
def mock_run_config(
    mock_task,
):
    rc = TaskRunConfig(
        name="test",
        description="test",
        run_config_properties=RunConfigProperties(
            model_name="gpt-4",
            model_provider_name="openai",
            prompt_id="simple_prompt_builder",
        ),
        parent=mock_task,
    )
    rc.save_to_file()
    return rc


@pytest.fixture
def mock_eval_runner(mock_eval, mock_task, mock_eval_config, mock_run_config):
    return EvalRunner(
        eval_configs=[mock_eval_config],
        run_configs=[mock_run_config],
        eval_run_type="task_run_eval",
    )


# Test with and without concurrency
@pytest.mark.parametrize("concurrency", [1, 25])
@pytest.mark.asyncio
async def test_async_eval_runner_status_updates(mock_eval_runner, concurrency):
    # Real async testing!

    job_count = 50
    # Job objects are not the right type, but since we're mocking run_job, it doesn't matter
    jobs = [{} for _ in range(job_count)]

    # Mock collect_tasks to return our fake jobs
    mock_eval_runner.collect_tasks = lambda: jobs

    # Mock run_job to return True immediately
    mock_eval_runner.run_job = AsyncMock(return_value=True)

    # Expect the status updates in order, and 1 for each job
    expected_compelted_count = 0
    async for progress in mock_eval_runner.run(concurrency=concurrency):
        assert progress.complete == expected_compelted_count
        expected_compelted_count += 1
        assert progress.errors == 0
        assert progress.total == job_count

    # Verify last status update was complete
    assert expected_compelted_count == job_count + 1

    # Verify run_job was called for each job
    assert mock_eval_runner.run_job.call_count == job_count


def test_collect_tasks_filtering(
    mock_eval,
    mock_eval_runner,
    mock_task,
    mock_eval_config,
    data_source,
    mock_run_config,
):
    """Test that tasks are properly filtered based on eval filters"""
    tags = ["tag1", "tag2", "tag3"]
    task_runs = []
    for tag in tags:
        # Create some task runs with different tags
        task_run = TaskRun(
            parent=mock_task,
            input="test1",
            input_source=data_source,
            output=TaskOutput(
                output="test1",
            ),
            tags=[tag],
        )
        task_run.save_to_file()
        task_runs.append(task_run)

    mock_eval.eval_set_filter_id = "tag::tag1"
    mock_eval.eval_configs_filter_id = "tag::tag2"

    # Create a new runner of type task run eval
    runner = EvalRunner(
        eval_configs=[mock_eval_config],
        run_configs=[mock_run_config],
        eval_run_type="task_run_eval",
    )
    jobs = runner.collect_tasks()

    # Should only get task_run1 jobs, the one with tag1
    assert len(jobs) == 1
    job = jobs[0]
    # job should be the tag1 item, and setup as a task run eval for mock_run_config
    assert job.item.tags == ["tag1"]
    assert job.task_run_config.id == mock_run_config.id
    assert job.eval_config.id == mock_eval_config.id

    # Change to an eval config set filter
    runner = EvalRunner(
        eval_configs=[mock_eval_config],
        run_configs=None,
        eval_run_type="eval_config_eval",
    )
    jobs = runner.collect_tasks()

    # Should only get eval_config1 jobs
    assert len(jobs) == 1
    job = jobs[0]
    # job should be the tag2 item, and setup as a eval config eval for mock_eval_config
    assert job.item.tags == ["tag2"]
    assert job.eval_config.id == mock_eval_config.id
    assert job.task_run_config is None

    # Add a second task run config, and call a new runner with multiple run configs
    rc = TaskRunConfig(
        name="test2",
        description="test2",
        run_config_properties=RunConfigProperties(
            model_name="gpt-4",
            model_provider_name="openai",
            prompt_id="simple_prompt_builder",
        ),
        parent=mock_task,
    )
    rc.save_to_file()
    runner = EvalRunner(
        eval_configs=[mock_eval_config],
        run_configs=[mock_run_config, rc],
        eval_run_type="task_run_eval",
    )
    jobs = runner.collect_tasks()
    assert len(jobs) == 2
    for job in jobs:
        assert job.item.tags == ["tag1"]
        assert job.task_run_config.id in [mock_run_config.id, rc.id]
        assert job.eval_config.id == mock_eval_config.id
    assert jobs[0].task_run_config.id != jobs[1].task_run_config.id

    # add a second eval config, and call a new runner with multiple eval configs
    eval_config = EvalConfig(
        name="test2",
        model_name="gpt-4",
        model_provider="openai",
        parent=mock_eval,
        properties={
            "eval_steps": ["step1", "step2", "step3"],
        },
    )
    eval_config.save_to_file()
    runner = EvalRunner(
        eval_configs=[mock_eval_config, eval_config],
        run_configs=None,
        eval_run_type="eval_config_eval",
    )
    jobs = runner.collect_tasks()
    # Check we get 2 jobs, one for each eval config
    assert len(jobs) == 2
    for job in jobs:
        assert job.item.tags == ["tag2"]
        assert job.eval_config.id in [mock_eval_config.id, eval_config.id]
        assert job.task_run_config is None
    assert jobs[0].eval_config.id != jobs[1].eval_config.id


def test_validate_same_task(
    mock_eval_runner,
    mock_task,
    data_source,
    tmp_path,
    mock_eval_config,
    mock_run_config,
):
    # second eval config has a different task
    eval_config = EvalConfig(
        name="test2",
        model_name="gpt-4",
        model_provider="openai",
        properties={
            "eval_steps": ["step1", "step2", "step3"],
        },
        parent=Eval(
            name="test",
            description="test",
            eval_set_filter_id="all",
            eval_configs_filter_id="all",
            output_scores=[
                EvalOutputScore(
                    name="Accuracy",
                    instruction="Check if the output is accurate",
                    type=TaskOutputRatingType.pass_fail,
                ),
            ],
            parent=Task(
                name="test",
                description="test",
                instruction="do the thing",
            ),
        ),
    )

    with pytest.raises(
        ValueError, match="All eval configs must have the same parent eval"
    ):
        EvalRunner(
            eval_configs=[mock_eval_config, eval_config],
            run_configs=[mock_run_config],
            eval_run_type="eval_config_eval",
        )


def test_collect_tasks_excludes_already_run_task_run_eval(
    mock_eval_runner, mock_task, data_source, mock_eval_config, mock_run_config
):
    """Test that already run tasks are excluded"""
    # Create a task run
    task_run = TaskRun(
        parent=mock_task,
        input="test",
        input_source=data_source,
        tags=["tag1"],
        output=TaskOutput(
            output="test",
        ),
    )
    task_run.save_to_file()

    # Prior to any eval runs, we should get the task run
    jobs = mock_eval_runner.collect_tasks()
    assert len(jobs) == 1
    assert jobs[0].item.id == task_run.id
    assert jobs[0].task_run_config.id == mock_run_config.id
    assert jobs[0].eval_config.id == mock_eval_config.id

    # Create an eval run for this task
    EvalRun(
        parent=mock_eval_config,
        dataset_id=task_run.id,
        task_run_config_id=mock_run_config.id,
        input="test",
        output="test",
        scores={"accuracy": 1.0},
    ).save_to_file()

    # Set filter to match the task
    mock_eval_runner.eval.eval_set_filter_id = "tag::tag1"
    mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent"

    jobs = mock_eval_runner.collect_tasks()

    # Should get no jobs since the task was already run
    assert len(jobs) == 0


def test_collect_tasks_excludes_already_run_eval_config_eval(
    mock_task, data_source, mock_eval_config, mock_eval, mock_run_config
):
    """Test that already run tasks are excluded"""
    # Create a task run
    task_run = TaskRun(
        parent=mock_task,
        input="test",
        input_source=data_source,
        tags=["tag1"],
        output=TaskOutput(
            output="test",
        ),
    )
    task_run.save_to_file()

    mock_eval.eval_set_filter_id = "tag::nonexistent"
    mock_eval.eval_configs_filter_id = "tag::tag1"
    mock_eval.save_to_file()

    # Prior to any eval runs, we should get 1 job for the eval config
    runner = EvalRunner(
        eval_configs=[mock_eval_config],
        run_configs=None,
        eval_run_type="eval_config_eval",
    )
    jobs = runner.collect_tasks()
    assert len(jobs) == 1
    assert jobs[0].item.id == task_run.id
    assert jobs[0].eval_config.id == mock_eval_config.id
    assert jobs[0].task_run_config is None

    # Create an eval run for this eval config task run pair, so now we should get no jobs (already run)
    EvalRun(
        parent=mock_eval_config,
        dataset_id=task_run.id,
        task_run_config_id=None,
        eval_config_eval=True,
        input="test",
        output="test",
        scores={
            "accuracy": 1.0,
        },
    ).save_to_file()

    jobs = runner.collect_tasks()

    # Should get no jobs since the task was already run
    assert len(jobs) == 0


def test_collect_tasks_multiple_run_configs(
    mock_eval_runner, mock_task, data_source, mock_run_config
):
    """Test handling multiple run configs"""
    # Create a task run
    task_run = TaskRun(
        parent=mock_task,
        input="test",
        input_source=data_source,
        tags=["tag1"],
        output=TaskOutput(
            output="test",
        ),
    )
    task_run.save_to_file()

    # Add another run config
    second_config = TaskRunConfig(
        name="test2",
        description="test2",
        run_config_properties=RunConfigProperties(
            model_name="gpt-3.5",
            model_provider_name="openai",
            prompt_id="simple_prompt_builder",
        ),
        parent=mock_task,
    )
    second_config.save_to_file()
    mock_eval_runner.run_configs.append(second_config)

    # Set filter to match the task
    mock_eval_runner.eval.eval_set_filter_id = "tag::tag1"

    jobs = mock_eval_runner.collect_tasks()

    # Should get 2 jobs, one for each config
    assert len(jobs) == 2
    assert {job.task_run_config.id for job in jobs} == {
        second_config.id,
        mock_run_config.id,
    }


def test_collect_tasks_empty_cases(mock_eval_runner, mock_task, data_source):
    """Test empty cases - no matching tasks or no tasks at all"""
    # Set filter that won't match anything
    mock_eval_runner.eval.eval_set_filter_id = "tag::nonexistent"
    mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent"

    jobs = mock_eval_runner.collect_tasks()
    assert len(jobs) == 0

    # Create task run with non-matching tag
    task_run = TaskRun(
        parent=mock_task,
        input="test",
        input_source=data_source,
        tags=["other_tag"],
        output=TaskOutput(
            output="test",
        ),
    )
    task_run.save_to_file()

    jobs = mock_eval_runner.collect_tasks()
    assert len(jobs) == 0


@pytest.mark.asyncio
async def test_run_job_success_task_run_eval(
    mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
):
    # Create a task run to evaluate
    task_run = TaskRun(
        parent=mock_task,
        input="test input",
        input_source=data_source,
        output=TaskOutput(output="test output"),
    )
    task_run.save_to_file()

    # Create eval job
    job = EvalJob(
        item=task_run,
        task_run_config=mock_run_config,
        type="task_run_eval",
        eval_config=mock_eval_config,
    )

    # Mock the evaluator
    mock_result_run = TaskRun(
        input="test input",
        input_source=data_source,
        output=TaskOutput(output="evaluated output"),
        intermediate_outputs={"intermediate_output": "intermediate output"},
    )
    mock_scores = {"accuracy": 0.95}

    class MockEvaluator(BaseEval):
        async def run_task_and_eval(self, input_text):
            return (
                mock_result_run,
                mock_scores,
                {"intermediate_output": "intermediate output"},
            )

    with patch(
        "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
        return_value=lambda *args: MockEvaluator(*args),
    ):
        success = await mock_eval_runner.run_job(job)

    assert success is True

    # Verify eval run was saved
    eval_runs = mock_eval_config.runs()
    assert len(eval_runs) == 1
    saved_run = eval_runs[0]
    assert saved_run.dataset_id == task_run.id
    assert saved_run.task_run_config_id == mock_run_config.id
    assert saved_run.scores == mock_scores
    assert saved_run.input == "test input"
    assert saved_run.output == "evaluated output"
    assert saved_run.intermediate_outputs == {
        "intermediate_output": "intermediate output"
    }
    assert saved_run.parent_eval_config().id == mock_eval_config.id
    assert saved_run.eval_config_eval is False


@pytest.mark.asyncio
async def test_run_job_success_eval_config_eval(
    mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
):
    # Create a task run to evaluate
    task_run = TaskRun(
        parent=mock_task,
        input="test input",
        input_source=data_source,
        output=TaskOutput(output="test output"),
    )
    task_run.save_to_file()

    # Create eval job
    job = EvalJob(
        item=task_run,
        type="eval_config_eval",
        eval_config=mock_eval_config,
    )

    # Mock the evaluator
    mock_result_run = TaskRun(
        input="test input",
        input_source=data_source,
        output=TaskOutput(output="evaluated output"),
    )
    mock_scores: EvalScores = {"accuracy": 0.95}

    class MockEvaluator(BaseEval):
        async def run_task_and_eval(self, input_text):
            raise ValueError("Attempted to run task and eval for a config eval")

        async def run_eval(
            self, task_run: TaskRun
        ) -> tuple[EvalScores, Dict[str, str] | None]:
            return mock_scores, {"intermediate_output": "intermediate output"}

    with patch(
        "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
        return_value=lambda *args: MockEvaluator(*args),
    ):
        success = await mock_eval_runner.run_job(job)

    assert success is True

    # Verify eval run was saved
    eval_runs = mock_eval_config.runs()
    assert len(eval_runs) == 1
    saved_run = eval_runs[0]
    assert saved_run.dataset_id == task_run.id
    assert saved_run.task_run_config_id is None
    assert saved_run.scores == mock_scores
    assert saved_run.input == "test input"
    assert saved_run.output == "test output"
    assert saved_run.parent_eval_config().id == mock_eval_config.id
    assert saved_run.eval_config_eval is True


@pytest.mark.asyncio
async def test_run_job_invalid_evaluator(
    mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
):
    task_run = TaskRun(
        parent=mock_task,
        input="test input",
        input_source=data_source,
        output=TaskOutput(output="test output"),
    )
    task_run.save_to_file()
    job = EvalJob(
        item=task_run,
        task_run_config=mock_run_config,
        type="task_run_eval",
        eval_config=mock_eval_config,
    )

    # Return an invalid evaluator type
    with patch(
        "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
        return_value=lambda *args: object(),
    ):
        success = await mock_eval_runner.run_job(job)

    assert success is False
    assert len(mock_eval_config.runs()) == 0


@pytest.mark.asyncio
async def test_run_job_evaluator_error(
    mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config
):
    task_run = TaskRun(
        parent=mock_task,
        input="test input",
        input_source=data_source,
        output=TaskOutput(output="test output"),
    )
    task_run.save_to_file()
    job = EvalJob(
        item=task_run,
        task_run_config=mock_run_config,
        type="task_run_eval",
        eval_config=mock_eval_config,
    )

    class ErrorEvaluator(BaseEval):
        async def run_task_and_eval(self, input_text):
            raise ValueError("Evaluation failed")

    with patch(
        "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
        return_value=lambda *args: ErrorEvaluator(*args),
    ):
        success = await mock_eval_runner.run_job(job)

    assert success is False
    assert len(mock_eval_config.runs()) == 0