kiln-ai 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiln_ai/adapters/__init__.py +2 -0
- kiln_ai/adapters/adapter_registry.py +22 -44
- kiln_ai/adapters/chat/__init__.py +8 -0
- kiln_ai/adapters/chat/chat_formatter.py +234 -0
- kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +19 -6
- kiln_ai/adapters/eval/base_eval.py +8 -6
- kiln_ai/adapters/eval/eval_runner.py +9 -65
- kiln_ai/adapters/eval/g_eval.py +26 -8
- kiln_ai/adapters/eval/test_base_eval.py +166 -15
- kiln_ai/adapters/eval/test_eval_runner.py +3 -0
- kiln_ai/adapters/eval/test_g_eval.py +1 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +2 -2
- kiln_ai/adapters/fine_tune/dataset_formatter.py +153 -197
- kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +402 -211
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +4 -4
- kiln_ai/adapters/fine_tune/together_finetune.py +12 -1
- kiln_ai/adapters/ml_model_list.py +556 -45
- kiln_ai/adapters/model_adapters/base_adapter.py +100 -35
- kiln_ai/adapters/model_adapters/litellm_adapter.py +116 -100
- kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
- kiln_ai/adapters/model_adapters/test_base_adapter.py +299 -52
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +121 -22
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +44 -2
- kiln_ai/adapters/model_adapters/test_structured_output.py +48 -18
- kiln_ai/adapters/parsers/base_parser.py +0 -3
- kiln_ai/adapters/parsers/parser_registry.py +5 -3
- kiln_ai/adapters/parsers/r1_parser.py +17 -2
- kiln_ai/adapters/parsers/request_formatters.py +40 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +2 -2
- kiln_ai/adapters/parsers/test_r1_parser.py +44 -1
- kiln_ai/adapters/parsers/test_request_formatters.py +76 -0
- kiln_ai/adapters/prompt_builders.py +14 -17
- kiln_ai/adapters/provider_tools.py +39 -4
- kiln_ai/adapters/repair/test_repair_task.py +27 -5
- kiln_ai/adapters/test_adapter_registry.py +88 -28
- kiln_ai/adapters/test_ml_model_list.py +158 -0
- kiln_ai/adapters/test_prompt_adaptors.py +17 -3
- kiln_ai/adapters/test_prompt_builders.py +27 -19
- kiln_ai/adapters/test_provider_tools.py +130 -12
- kiln_ai/datamodel/__init__.py +2 -2
- kiln_ai/datamodel/datamodel_enums.py +43 -4
- kiln_ai/datamodel/dataset_filters.py +69 -1
- kiln_ai/datamodel/dataset_split.py +4 -0
- kiln_ai/datamodel/eval.py +8 -0
- kiln_ai/datamodel/finetune.py +13 -7
- kiln_ai/datamodel/prompt_id.py +1 -0
- kiln_ai/datamodel/task.py +68 -7
- kiln_ai/datamodel/task_output.py +1 -1
- kiln_ai/datamodel/task_run.py +39 -7
- kiln_ai/datamodel/test_basemodel.py +5 -8
- kiln_ai/datamodel/test_dataset_filters.py +82 -0
- kiln_ai/datamodel/test_dataset_split.py +2 -8
- kiln_ai/datamodel/test_example_models.py +54 -0
- kiln_ai/datamodel/test_models.py +80 -9
- kiln_ai/datamodel/test_task.py +168 -2
- kiln_ai/utils/async_job_runner.py +106 -0
- kiln_ai/utils/config.py +3 -2
- kiln_ai/utils/dataset_import.py +81 -19
- kiln_ai/utils/logging.py +165 -0
- kiln_ai/utils/test_async_job_runner.py +199 -0
- kiln_ai/utils/test_config.py +23 -0
- kiln_ai/utils/test_dataset_import.py +272 -10
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/METADATA +1 -1
- kiln_ai-0.17.0.dist-info/RECORD +113 -0
- kiln_ai-0.15.0.dist-info/RECORD +0 -104
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/eval_runner.py CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 import logging
 from dataclasses import dataclass
 from typing import AsyncGenerator, Dict, List, Literal, Set
@@ -10,6 +9,7 @@ from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id
 from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores
 from kiln_ai.datamodel.task import TaskRunConfig
 from kiln_ai.datamodel.task_run import TaskRun
+from kiln_ai.utils.async_job_runner import AsyncJobRunner, Progress
 
 logger = logging.getLogger(__name__)
 
@@ -23,13 +23,6 @@ class EvalJob:
     task_run_config: TaskRunConfig | None = None
 
 
-@dataclass
-class EvalProgress:
-    complete: int | None = None
-    total: int | None = None
-    errors: int | None = None
-
-
 class EvalRunner:
     """
     Runs an eval. Async execution is supported to make it faster when using remote/fast model providers.
@@ -161,67 +154,15 @@ class EvalRunner:
             if task_run.id not in already_run[eval_config.id][run_config.id]
         ]
 
-    async def run(self, concurrency: int = 25) -> AsyncGenerator[
+    async def run(self, concurrency: int = 25) -> AsyncGenerator[Progress, None]:
         """
         Runs the configured eval run with parallel workers and yields progress updates.
         """
         jobs = self.collect_tasks()
 
-
-
-
-
-        # Send initial status
-        yield EvalProgress(complete=complete, total=total, errors=errors)
-
-        worker_queue: asyncio.Queue[EvalJob] = asyncio.Queue()
-        for job in jobs:
-            worker_queue.put_nowait(job)
-
-        # simple status queue to return progress. True=success, False=error
-        status_queue: asyncio.Queue[bool] = asyncio.Queue()
-
-        workers = []
-        for i in range(concurrency):
-            task = asyncio.create_task(self.run_worker(worker_queue, status_queue))
-            workers.append(task)
-
-        # Send status updates until workers are done, and they are all sent
-        while not status_queue.empty() or not all(worker.done() for worker in workers):
-            try:
-                # Use timeout to prevent hanging if all workers complete
-                # between our while condition check and get()
-                success = await asyncio.wait_for(status_queue.get(), timeout=0.1)
-                if success:
-                    complete += 1
-                else:
-                    errors += 1
-
-                yield EvalProgress(complete=complete, total=total, errors=errors)
-            except asyncio.TimeoutError:
-                # Timeout is expected, just continue to recheck worker status
-                # Don't love this but beats sentinels for reliability
-                continue
-
-        # These are redundant, but keeping them will catch async errors
-        await asyncio.gather(*workers)
-        await worker_queue.join()
-
-    async def run_worker(
-        self, worker_queue: asyncio.Queue[EvalJob], status_queue: asyncio.Queue[bool]
-    ):
-        while True:
-            try:
-                job = worker_queue.get_nowait()
-            except asyncio.QueueEmpty:
-                # worker can end when the queue is empty
-                break
-            try:
-                success = await self.run_job(job)
-                await status_queue.put(success)
-            finally:
-                # Always mark the dequeued task as done, even on exceptions
-                worker_queue.task_done()
+        runner = AsyncJobRunner(concurrency=concurrency)
+        async for progress in runner.run(jobs, self.run_job):
+            yield progress
 
     async def run_job(self, job: EvalJob) -> bool:
         try:
@@ -266,5 +207,8 @@ class EvalRunner:
 
             return True
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Error running eval job for dataset item {job.item.id}: {e}",
+                exc_info=True,
+            )
             return False
kiln_ai/adapters/eval/g_eval.py CHANGED
@@ -5,11 +5,14 @@ from litellm.types.utils import ChatCompletionTokenLogprob
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.eval.base_eval import BaseEval
+from kiln_ai.adapters.ml_model_list import (
+    default_structured_output_mode_for_model_provider,
+)
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
 from kiln_ai.adapters.prompt_builders import PromptGenerators
 from kiln_ai.datamodel import Project, Task, TaskRun
 from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
-from kiln_ai.datamodel.task import RunConfig
+from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, StructuredOutputMode
 
 # all the tokens we score for, and their float scores.
 TOKEN_TO_SCORE_MAP: Dict[str, float] = {
@@ -43,9 +46,9 @@ class GEvalTask(Task, parent_of={}):
 
         # Build the COT eval instructions
         cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n"
-        steps = eval_config.properties.get("eval_steps",
-        if not
-            raise ValueError("eval_steps must be a list")
+        steps = eval_config.properties.get("eval_steps", [])
+        if not isinstance(steps, list):
+            raise ValueError("eval_steps must be a list.")
         for i, step in enumerate(steps):
            cot_instructions += f"{i + 1}) {step}\n"
 
@@ -114,12 +117,27 @@ class GEval(BaseEval):
             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
         )
 
-
-
+        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
+        structured_output_mode = default_structured_output_mode_for_model_provider(
             model_name,
             provider,
-
-
+            default=StructuredOutputMode.json_schema,
+            # G-eval expects JSON, so don't allow function calling modes
+            disallowed_modes=[
+                StructuredOutputMode.function_calling,
+                StructuredOutputMode.function_calling_weak,
+            ],
+        )
+
+        adapter = adapter_for_task(
+            self.geval_task,
+            run_config_properties=RunConfigProperties(
+                model_name=model_name,
+                model_provider_name=provider,
+                # We always use Simple COT for G-Eval and LLM as Judge
+                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+                structured_output_mode=structured_output_mode,
+            ),
             base_adapter_config=AdapterConfig(
                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                 allow_saving=False,
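
The g_eval.py hunk above now asks ml_model_list for a recommended structured output mode, passing a default and a list of disallowed modes (function calling is excluded because G-Eval expects JSON). The diff shows only the call, not the helper's body; the sketch below illustrates the selection behavior the parameters imply, using a hypothetical lookup table rather than kiln_ai's real model data.

from enum import Enum
from typing import Dict, Optional, Sequence, Tuple


class StructuredOutputMode(str, Enum):
    # Only the modes named in the diff; the real enum lives in kiln_ai.datamodel.task.
    json_schema = "json_schema"
    function_calling = "function_calling"
    function_calling_weak = "function_calling_weak"


# Hypothetical lookup table; the real per-model data lives in ml_model_list.py.
RECOMMENDED_MODE: Dict[Tuple[str, str], StructuredOutputMode] = {
    ("gpt_4o", "openai"): StructuredOutputMode.function_calling,
}


def pick_structured_output_mode(
    model_name: str,
    provider: str,
    default: StructuredOutputMode,
    disallowed_modes: Sequence[StructuredOutputMode] = (),
) -> StructuredOutputMode:
    """Return the model/provider's recommended mode unless it is missing or disallowed."""
    mode: Optional[StructuredOutputMode] = RECOMMENDED_MODE.get((model_name, provider))
    if mode is None or mode in disallowed_modes:
        return default
    return mode


# The (hypothetical) recommended mode is function calling, which G-Eval disallows,
# so the json_schema default wins.
print(
    pick_structured_output_mode(
        "gpt_4o",
        "openai",
        default=StructuredOutputMode.json_schema,
        disallowed_modes=[
            StructuredOutputMode.function_calling,
            StructuredOutputMode.function_calling_weak,
        ],
    )
)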
kiln_ai/adapters/eval/test_base_eval.py CHANGED
@@ -1,9 +1,9 @@
 import json
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
 from kiln_ai.adapters.eval.base_eval import BaseEval
-from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore
 from kiln_ai.datamodel.task import (
     RunConfigProperties,
@@ -245,7 +245,7 @@ class EvalTester(BaseEval):
     """Test implementation of BaseEval"""
 
     async def run_eval(self, task_run):
-        return {"overall_rating": 5, "quality": 4}
+        return {"overall_rating": 5, "quality": 4}, None
 
 
 @pytest.mark.paid
@@ -265,14 +265,8 @@ async def test_run_method():
 
     eval_config = EvalConfig(
         name="Test Eval Config",
-
-
-            properties={
-                "model_name": "gpt-4o",
-                "model_provider": "openai",
-                "adapter_name": "test",
-            },
-        ),
+        model_name="gpt-4o",
+        model_provider="openai",
         parent=Eval(
             name="Test Eval",
             parent=task,
@@ -291,10 +285,6 @@ async def test_run_method():
                 ),
             ],
         ),
-        prompt=BasePrompt(
-            name="Test Prompt",
-            prompt="Test prompt",
-        ),
         properties={"eval_steps": ["test_step"]},
     )
 
@@ -311,7 +301,9 @@ async def test_run_method():
     evaluator = EvalTester(eval_config, run_config.run_config())
 
     # Run the evaluation
-    task_run, eval_scores = await evaluator.
+    task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval(
+        "test input"
+    )
 
     # Verify task run was created
     assert task_run.input == "test input"
@@ -323,3 +315,162 @@ async def test_run_method():
 
     # Verify schema validation worked (these keys should exist per schema)
     assert set(eval_scores.keys()) == {"overall_rating", "quality"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval():
+    """Test run_task_and_eval method with mocked dependencies"""
+    # Create test data
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(
+                    name="Overall Rating",
+                    instruction="The overall rating for the task output",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    run_config = TaskRunConfig(
+        name="Test Run Config",
+        run_config_properties=RunConfigProperties(
+            model_name="llama_3_1_8b",
+            model_provider_name="groq",
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        ),
+        parent=task,
+    )
+
+    # Create evaluator instance
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"overall_rating": 5, "quality": 4}, {"thinking": "test thinking"}
+
+    evaluator = MockEval(eval_config, run_config.run_config())
+
+    # Mock dependencies
+    mock_adapter = AsyncMock()
+    mock_task_run = MagicMock()
+    mock_task_run.input = "test input"
+    mock_task_run.output.output = "test output"
+    mock_adapter.invoke.return_value = mock_task_run
+
+    with (
+        patch(
+            "kiln_ai.adapters.eval.base_eval.adapter_for_task"
+        ) as mock_adapter_for_task,
+        patch(
+            "kiln_ai.adapters.eval.base_eval.validate_schema_with_value_error"
+        ) as mock_validate,
+    ):
+        mock_adapter_for_task.return_value = mock_adapter
+
+        # Test with string input
+        result = await evaluator.run_task_and_eval("test input")
+
+        # Verify adapter_for_task was called with correct parameters
+        mock_adapter_for_task.assert_called_once()
+        assert mock_adapter_for_task.call_args[0][0] == evaluator.target_task
+        props = mock_adapter_for_task.call_args[0][1]
+        assert props.model_name == "llama_3_1_8b"
+        assert props.model_provider_name == "groq"
+        assert props.prompt_id == "simple_prompt_builder"
+        bac = mock_adapter_for_task.call_args[1]
+        assert bac["base_adapter_config"].allow_saving is False
+
+        # Verify the base_adapter_config has allow_saving=False
+        adapter_config = mock_adapter_for_task.call_args[1]["base_adapter_config"]
+        assert adapter_config.allow_saving is False
+
+        # Verify adapter.invoke was called with correct input
+        mock_adapter.invoke.assert_called_once_with("test input")
+
+        # Verify validate_schema_with_value_error was called
+        mock_validate.assert_called_once_with(
+            {"overall_rating": 5, "quality": 4},
+            evaluator.score_schema,
+            "Eval output does not match score schema.",
+        )
+
+        # Verify return values
+        task_run, eval_scores, intermediate_outputs = result
+        assert task_run == mock_task_run
+        assert eval_scores == {"overall_rating": 5, "quality": 4}
+        assert intermediate_outputs == {"thinking": "test thinking"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval_no_run_config():
+    """Test run_task_and_eval raises error when run_config is None"""
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    # Create evaluator instance with no run_config
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"quality": 4}, None
+
+    evaluator = MockEval(eval_config, None)
+
+    # Test that it raises ValueError
+    with pytest.raises(
+        ValueError, match="Run config is required for run_task_and_eval"
+    ):
+        await evaluator.run_task_and_eval("test input")
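
The updated test_base_eval.py tests imply a contract change in BaseEval for 0.17.0: run_eval now returns a (scores, intermediate_outputs) pair instead of a bare scores dict, and run_task_and_eval returns a three-tuple of (task_run, eval_scores, intermediate_outputs). A minimal sketch of a conforming subclass follows; the type aliases are assumed for illustration and are not taken from the package.

from typing import Dict, Optional, Tuple

EvalScores = Dict[str, float]
IntermediateOutputs = Optional[Dict[str, str]]


class MyEval:  # stand-in for a BaseEval subclass
    async def run_eval(self, task_run) -> Tuple[EvalScores, IntermediateOutputs]:
        # 0.15.0-era subclasses returned only the scores dict; the 0.17.0 tests
        # unpack a second element carrying intermediate outputs such as the
        # judge's chain of thought.
        return {"overall_rating": 5, "quality": 4}, {"thinking": "step-by-step notes"}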
kiln_ai/adapters/eval/test_eval_runner.py CHANGED
@@ -94,6 +94,7 @@ def mock_run_config(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -209,6 +210,7 @@ def test_collect_tasks_filtering(
             model_name="gpt-4",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
@@ -416,6 +418,7 @@ def test_collect_tasks_multiple_run_configs(
             model_name="gpt-3.5",
             model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
         parent=mock_task,
     )
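
All three test_eval_runner.py hunks add structured_output_mode when building RunConfigProperties, which suggests the field must now be set explicitly on run configs (whether it is strictly required is an inference from the tests, not stated in the diff). A minimal construction sketch, using only field names shown above:

from kiln_ai.datamodel.task import RunConfigProperties

props = RunConfigProperties(
    model_name="gpt-4",
    model_provider_name="openai",
    prompt_id="simple_prompt_builder",
    structured_output_mode="json_schema",
)
print(props.structured_output_mode)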
kiln_ai/adapters/fine_tune/base_finetune.py CHANGED
@@ -6,11 +6,11 @@ from pydantic import BaseModel
 from kiln_ai.adapters.ml_model_list import built_in_models
 from kiln_ai.datamodel import (
     DatasetSplit,
-    FinetuneDataStrategy,
     FineTuneStatusType,
     Task,
 )
 from kiln_ai.datamodel import Finetune as FinetuneModel
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.utils.name_generator import generate_memorable_name
 
 
@@ -62,7 +62,7 @@ class BaseFinetuneAdapter(ABC):
         train_split_name: str,
         system_message: str,
         thinking_instructions: str | None,
-        data_strategy: FinetuneDataStrategy,
+        data_strategy: ChatStrategy,
         parameters: dict[str, str | int | float | bool] = {},
         name: str | None = None,
         description: str | None = None,