kiln-ai 0.16.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- kiln_ai/adapters/__init__.py +2 -0
- kiln_ai/adapters/adapter_registry.py +22 -44
- kiln_ai/adapters/chat/__init__.py +8 -0
- kiln_ai/adapters/chat/chat_formatter.py +233 -0
- kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
- kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
- kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
- kiln_ai/adapters/data_gen/test_data_gen_task.py +330 -40
- kiln_ai/adapters/eval/base_eval.py +7 -6
- kiln_ai/adapters/eval/eval_runner.py +9 -2
- kiln_ai/adapters/eval/g_eval.py +40 -17
- kiln_ai/adapters/eval/test_base_eval.py +174 -17
- kiln_ai/adapters/eval/test_eval_runner.py +3 -0
- kiln_ai/adapters/eval/test_g_eval.py +116 -5
- kiln_ai/adapters/fine_tune/base_finetune.py +3 -8
- kiln_ai/adapters/fine_tune/dataset_formatter.py +135 -273
- kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +287 -353
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +6 -11
- kiln_ai/adapters/fine_tune/together_finetune.py +13 -2
- kiln_ai/adapters/ml_model_list.py +370 -84
- kiln_ai/adapters/model_adapters/base_adapter.py +73 -26
- kiln_ai/adapters/model_adapters/litellm_adapter.py +88 -97
- kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
- kiln_ai/adapters/model_adapters/test_base_adapter.py +235 -61
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +104 -21
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +41 -0
- kiln_ai/adapters/model_adapters/test_structured_output.py +44 -12
- kiln_ai/adapters/parsers/parser_registry.py +0 -2
- kiln_ai/adapters/parsers/r1_parser.py +0 -1
- kiln_ai/adapters/prompt_builders.py +0 -16
- kiln_ai/adapters/provider_tools.py +27 -9
- kiln_ai/adapters/remote_config.py +66 -0
- kiln_ai/adapters/repair/repair_task.py +1 -6
- kiln_ai/adapters/repair/test_repair_task.py +24 -3
- kiln_ai/adapters/test_adapter_registry.py +88 -28
- kiln_ai/adapters/test_ml_model_list.py +176 -0
- kiln_ai/adapters/test_prompt_adaptors.py +17 -7
- kiln_ai/adapters/test_prompt_builders.py +3 -16
- kiln_ai/adapters/test_provider_tools.py +69 -20
- kiln_ai/adapters/test_remote_config.py +100 -0
- kiln_ai/datamodel/__init__.py +0 -2
- kiln_ai/datamodel/datamodel_enums.py +38 -13
- kiln_ai/datamodel/eval.py +32 -0
- kiln_ai/datamodel/finetune.py +12 -8
- kiln_ai/datamodel/task.py +68 -7
- kiln_ai/datamodel/task_output.py +0 -2
- kiln_ai/datamodel/task_run.py +0 -2
- kiln_ai/datamodel/test_basemodel.py +2 -1
- kiln_ai/datamodel/test_dataset_split.py +0 -8
- kiln_ai/datamodel/test_eval_model.py +146 -4
- kiln_ai/datamodel/test_models.py +33 -10
- kiln_ai/datamodel/test_task.py +168 -2
- kiln_ai/utils/config.py +3 -2
- kiln_ai/utils/dataset_import.py +1 -1
- kiln_ai/utils/logging.py +166 -0
- kiln_ai/utils/test_config.py +23 -0
- kiln_ai/utils/test_dataset_import.py +30 -0
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
- kiln_ai-0.18.0.dist-info/RECORD +115 -0
- kiln_ai-0.16.0.dist-info/RECORD +0 -108
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.16.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/g_eval.py
CHANGED
@@ -5,11 +5,14 @@ from litellm.types.utils import ChatCompletionTokenLogprob
 
 from kiln_ai.adapters.adapter_registry import adapter_for_task
 from kiln_ai.adapters.eval.base_eval import BaseEval
+from kiln_ai.adapters.ml_model_list import (
+    default_structured_output_mode_for_model_provider,
+)
 from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput
 from kiln_ai.adapters.prompt_builders import PromptGenerators
 from kiln_ai.datamodel import Project, Task, TaskRun
 from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores
-from kiln_ai.datamodel.task import RunConfig
+from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, StructuredOutputMode
 
 # all the tokens we score for, and their float scores.
 TOKEN_TO_SCORE_MAP: Dict[str, float] = {
@@ -99,6 +102,18 @@ class GEval(BaseEval):
 
         self.geval_task = GEvalTask(eval_config)
 
+    def generate_run_description(self, eval_input: str, eval_output: str) -> str:
+        return f"""The model was given the following input for the task:
+<eval_data>
+{eval_input}
+</eval_data>
+
+The model produced the following output for the task:
+<eval_data>
+{eval_output}
+</eval_data>
+"""
+
     async def run_eval(
         self, task_run: TaskRun
     ) -> tuple[EvalScores, Dict[str, str] | None]:
@@ -114,12 +129,27 @@
             10 if self.eval_config.config_type == EvalConfigType.g_eval else None
         )
 
-
-
+        # We don't expose setting this manually in the UI, so pull a recommended mode from ml_model_list
+        structured_output_mode = default_structured_output_mode_for_model_provider(
             model_name,
             provider,
-
-
+            default=StructuredOutputMode.json_schema,
+            # G-eval expects JSON, so don't allow function calling modes
+            disallowed_modes=[
+                StructuredOutputMode.function_calling,
+                StructuredOutputMode.function_calling_weak,
+            ],
+        )
+
+        adapter = adapter_for_task(
+            self.geval_task,
+            run_config_properties=RunConfigProperties(
+                model_name=model_name,
+                model_provider_name=provider,
+                # We always use Simple COT for G-Eval and LLM as Judge
+                prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+                structured_output_mode=structured_output_mode,
+            ),
             base_adapter_config=AdapterConfig(
                 # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs
                 allow_saving=False,
@@ -127,19 +157,12 @@
             ),
         )
 
-
-
-
-</eval_data>
-
-The model produced the following output for the task:
-<eval_data>
-{task_run.output}
-</eval_data>
-"""
+        run_description = self.generate_run_description(
+            task_run.input, task_run.output.output
+        )
 
         # We don't need the run, but invoke_returning_run_output() runs validations for us over _run()
-        _, run_output = await adapter.invoke_returning_run_output(
+        _, run_output = await adapter.invoke_returning_run_output(run_description)
 
         if self.eval_config.config_type == EvalConfigType.llm_as_judge:
             return self.build_llm_as_judge_score(
@@ -292,7 +315,7 @@ The model produced the following output for the task:
         """
         primary_token_score = self.score_from_token_string(token_logprob.token)
         # check this is a real rating token, it could just be the ": ", "," or whitespace
-        if
+        if primary_token_score is None:
            return None
 
        total_score = 0.0
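
Taken together, the run_eval changes above first pick a JSON-capable structured output mode and then build the judge adapter from RunConfigProperties. A minimal standalone sketch of that selection call, based only on the hunks above (the model and provider strings are illustrative placeholders, not values from this diff):

```python
# Sketch of the helper call added to run_eval; placeholder model/provider strings.
from kiln_ai.adapters.ml_model_list import (
    default_structured_output_mode_for_model_provider,
)
from kiln_ai.datamodel.task import StructuredOutputMode

structured_output_mode = default_structured_output_mode_for_model_provider(
    "gpt_4o",  # placeholder model name
    "openai",  # placeholder provider name
    default=StructuredOutputMode.json_schema,
    # G-Eval parses a JSON judge response, so function-calling modes are excluded.
    disallowed_modes=[
        StructuredOutputMode.function_calling,
        StructuredOutputMode.function_calling_weak,
    ],
)
```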

kiln_ai/adapters/eval/test_base_eval.py
CHANGED
@@ -1,9 +1,9 @@
 import json
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
 from kiln_ai.adapters.eval.base_eval import BaseEval
-from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType
 from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore
 from kiln_ai.datamodel.task import (
     RunConfigProperties,
@@ -43,7 +43,9 @@ def test_score_schema_five_star():
 
     # Check score property, and that it's an enum of 1-5
     score_prop = schema["properties"]["quality_score"]
-    assert score_prop["
+    assert score_prop["type"] == "integer"
+    assert score_prop["minimum"] == 1
+    assert score_prop["maximum"] == 5
     assert "Quality Score" in score_prop["title"]
     assert "Rate the quality" in score_prop["description"]
     assert "between 1 and 5" in score_prop["description"]
@@ -51,7 +53,9 @@ def test_score_schema_five_star():
     # Check overall rating property, and that it's an enum of 1-5
     assert "overall_rating" in schema["properties"]
     overall = schema["properties"]["overall_rating"]
-    assert overall["
+    assert overall["type"] == "integer"
+    assert overall["minimum"] == 1
+    assert overall["maximum"] == 5
     assert "Overall Rating" in overall["title"]
     assert "The overall rating for the task output" in overall["description"]
     assert "between 1 and 5" in overall["description"]
@@ -127,6 +131,7 @@ def test_score_schema_pass_fail():
     schema = json.loads(schema_str)
 
     score_prop = schema["properties"]["pass_fail_test"]
+    assert score_prop["type"] == "string"
     assert score_prop["enum"] == ["pass", "fail"]
     assert "Pass Fail Test" in score_prop["title"]
     assert "Check if it passes" in score_prop["description"]
@@ -173,6 +178,7 @@ def test_score_schema_pass_fail_critical():
     score_prop = schema["properties"]["critical_test"]
     assert "enum" in score_prop
     assert score_prop["enum"] == ["pass", "fail", "critical"]
+    assert score_prop["type"] == "string"
     assert "'pass', 'fail', or 'critical'" in score_prop["description"]
 
     assert schema["properties"]["overall_rating"] is not None
@@ -245,7 +251,7 @@ class EvalTester(BaseEval):
     """Test implementation of BaseEval"""
 
     async def run_eval(self, task_run):
-        return {"overall_rating": 5, "quality": 4}
+        return {"overall_rating": 5, "quality": 4}, None
 
 
 @pytest.mark.paid
@@ -265,14 +271,8 @@ async def test_run_method():
 
     eval_config = EvalConfig(
         name="Test Eval Config",
-
-
-            properties={
-                "model_name": "gpt-4o",
-                "model_provider": "openai",
-                "adapter_name": "test",
-            },
-        ),
+        model_name="gpt-4o",
+        model_provider="openai",
         parent=Eval(
             name="Test Eval",
             parent=task,
@@ -291,10 +291,6 @@ async def test_run_method():
                 ),
             ],
         ),
-        prompt=BasePrompt(
-            name="Test Prompt",
-            prompt="Test prompt",
-        ),
         properties={"eval_steps": ["test_step"]},
     )
 
@@ -311,7 +307,9 @@ async def test_run_method():
     evaluator = EvalTester(eval_config, run_config.run_config())
 
     # Run the evaluation
-    task_run, eval_scores = await evaluator.
+    task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval(
+        "test input"
+    )
 
     # Verify task run was created
     assert task_run.input == "test input"
@@ -323,3 +321,162 @@ async def test_run_method():
 
     # Verify schema validation worked (these keys should exist per schema)
     assert set(eval_scores.keys()) == {"overall_rating", "quality"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval():
+    """Test run_task_and_eval method with mocked dependencies"""
+    # Create test data
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+                EvalOutputScore(
+                    name="Overall Rating",
+                    instruction="The overall rating for the task output",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    run_config = TaskRunConfig(
+        name="Test Run Config",
+        run_config_properties=RunConfigProperties(
+            model_name="llama_3_1_8b",
+            model_provider_name="groq",
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        ),
+        parent=task,
+    )
+
+    # Create evaluator instance
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"overall_rating": 5, "quality": 4}, {"thinking": "test thinking"}
+
+    evaluator = MockEval(eval_config, run_config.run_config())
+
+    # Mock dependencies
+    mock_adapter = AsyncMock()
+    mock_task_run = MagicMock()
+    mock_task_run.input = "test input"
+    mock_task_run.output.output = "test output"
+    mock_adapter.invoke.return_value = mock_task_run
+
+    with (
+        patch(
+            "kiln_ai.adapters.eval.base_eval.adapter_for_task"
+        ) as mock_adapter_for_task,
+        patch(
+            "kiln_ai.adapters.eval.base_eval.validate_schema_with_value_error"
+        ) as mock_validate,
+    ):
+        mock_adapter_for_task.return_value = mock_adapter
+
+        # Test with string input
+        result = await evaluator.run_task_and_eval("test input")
+
+        # Verify adapter_for_task was called with correct parameters
+        mock_adapter_for_task.assert_called_once()
+        assert mock_adapter_for_task.call_args[0][0] == evaluator.target_task
+        props = mock_adapter_for_task.call_args[0][1]
+        assert props.model_name == "llama_3_1_8b"
+        assert props.model_provider_name == "groq"
+        assert props.prompt_id == "simple_prompt_builder"
+        bac = mock_adapter_for_task.call_args[1]
+        assert bac["base_adapter_config"].allow_saving is False
+
+        # Verify the base_adapter_config has allow_saving=False
+        adapter_config = mock_adapter_for_task.call_args[1]["base_adapter_config"]
+        assert adapter_config.allow_saving is False
+
+        # Verify adapter.invoke was called with correct input
+        mock_adapter.invoke.assert_called_once_with("test input")
+
+        # Verify validate_schema_with_value_error was called
+        mock_validate.assert_called_once_with(
+            {"overall_rating": 5, "quality": 4},
+            evaluator.score_schema,
+            "Eval output does not match score schema.",
+        )
+
+        # Verify return values
+        task_run, eval_scores, intermediate_outputs = result
+        assert task_run == mock_task_run
+        assert eval_scores == {"overall_rating": 5, "quality": 4}
+        assert intermediate_outputs == {"thinking": "test thinking"}
+
+
+@pytest.mark.asyncio
+async def test_run_task_and_eval_no_run_config():
+    """Test run_task_and_eval raises error when run_config is None"""
+    task = Task(
+        name="Test Task",
+        instruction="Test instruction",
+        requirements=[
+            TaskRequirement(
+                name="Quality",
+                instruction="Rate quality",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+    )
+
+    eval_config = EvalConfig(
+        name="Test Eval Config",
+        model_name="gpt-4o",
+        model_provider="openai",
+        parent=Eval(
+            name="Test Eval",
+            parent=task,
+            eval_set_filter_id="all",
+            eval_configs_filter_id="all",
+            output_scores=[
+                EvalOutputScore(
+                    name="Quality",
+                    instruction="Rate quality",
+                    type=TaskOutputRatingType.five_star,
+                ),
+            ],
+        ),
+        properties={"eval_steps": ["test_step"]},
+    )
+
+    # Create evaluator instance with no run_config
+    class MockEval(BaseEval):
+        async def run_eval(self, task_run):
+            return {"quality": 4}, None
+
+    evaluator = MockEval(eval_config, None)
+
+    # Test that it raises ValueError
+    with pytest.raises(
+        ValueError, match="Run config is required for run_task_and_eval"
+    ):
+        await evaluator.run_task_and_eval("test input")
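
The tests above pin down the updated BaseEval contract: run_eval now returns a (scores, intermediate_outputs) pair, and run_task_and_eval returns a (task_run, scores, intermediate_outputs) triple. A minimal subclass sketch inferred from those tests (the class name is hypothetical):

```python
from kiln_ai.adapters.eval.base_eval import BaseEval


class SketchEval(BaseEval):  # hypothetical name, for illustration only
    async def run_eval(self, task_run):
        # Scores dict plus optional intermediate outputs (e.g. judge reasoning).
        return {"overall_rating": 5, "quality": 4}, {"thinking": "..."}


# Used as in test_run_method above, inside an async test:
#   task_run, eval_scores, intermediate_outputs = await evaluator.run_task_and_eval("test input")
```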

kiln_ai/adapters/eval/test_eval_runner.py
CHANGED
@@ -94,6 +94,7 @@ def mock_run_config(
            model_name="gpt-4",
            model_provider_name="openai",
            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
        ),
        parent=mock_task,
    )
@@ -209,6 +210,7 @@ def test_collect_tasks_filtering(
            model_name="gpt-4",
            model_provider_name="openai",
            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
        ),
        parent=mock_task,
    )
@@ -416,6 +418,7 @@ def test_collect_tasks_multiple_run_configs(
            model_name="gpt-3.5",
            model_provider_name="openai",
            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
        ),
        parent=mock_task,
    )
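
All three one-line changes above add the same field: these test fixtures now set structured_output_mode explicitly when building RunConfigProperties. A minimal construction sketch reusing the fixture values:

```python
from kiln_ai.datamodel.task import RunConfigProperties

props = RunConfigProperties(
    model_name="gpt-4",
    model_provider_name="openai",
    prompt_id="simple_prompt_builder",
    structured_output_mode="json_schema",  # the field these fixtures now set explicitly
)
```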

kiln_ai/adapters/eval/test_g_eval.py
CHANGED
@@ -99,6 +99,7 @@ def test_run_config(test_task):
        model_provider_name="groq",
        task=test_task,
        prompt_id="simple_prompt_builder",
+        structured_output_mode="json_schema",
    )
 
 
@@ -273,6 +274,36 @@ def test_token_case():
    assert token.lower() == token
 
 
+def test_generate_run_description(test_eval_config, test_run_config, test_task_run):
+    """Test that generate_run_description correctly uses task_run.output.output (the string) rather than task_run.output (the object)."""
+    # Create G-Eval instance
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    # Call generate_run_description
+    description = g_eval.generate_run_description(
+        test_task_run.input, test_task_run.output.output
+    )
+
+    # Verify that the actual string output is in the description
+    expected_output = "Why did the chicken cross the road? To get to the other side!"
+    assert expected_output in description
+
+    # Verify that the input is also in the description
+    assert "Tell me a chicken joke" in description
+
+    # Verify the description has the expected structure
+    assert "<eval_data>" in description
+    assert description.count("<eval_data>") == 2  # 2 opening tags
+    assert description.count("</eval_data>") == 2  # 2 closing tags
+    assert "The model was given the following input for the task:" in description
+    assert "The model produced the following output for the task:" in description
+
+    # Verify that we're getting the actual string value, not a Python object representation
+    # The string should not contain 'TaskOutput' or other object indicators
+    assert "TaskOutput" not in description
+    assert "output=" not in description  # Would appear if object __repr__ was used
+
+
 def test_metric_offsets_and_search_ranges(
     test_eval_config, test_run_config, test_task_run
 ):
@@ -400,7 +431,7 @@ def test_rating_token_to_score(test_eval_config, test_run_config):
 
    # Test single token case
    token_logprob = MockTokenLogprob("5", [("5", 0.0)], logprob=1e-8)  # log(1) = 0
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
    assert score == 5.0
 
    # Test weighted average case
@@ -412,20 +443,62 @@ def test_rating_token_to_score(test_eval_config, test_run_config):
        ],
        logprob=math.log(0.6),
    )
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
    assert pytest.approx(score) == 4.4  # (4 * 0.6 + 5 * 0.4)
 
    # Test invalid token
    token_logprob = MockTokenLogprob(":", [(":", 0.0)], logprob=1e-8)
-    assert g_eval.rating_token_to_score(token_logprob) is None
+    assert g_eval.rating_token_to_score(token_logprob) is None  # type: ignore
 
    # Test missing from top logprobs
    token_logprob = MockTokenLogprob("5", [], logprob=1e-8)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore
 
    # Test missing from top logprobs, with special case logprob
    token_logprob = MockTokenLogprob("5", [], logprob=-9999)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore
+
+
+def test_rating_token_to_score_zero_score_bug_fix(test_eval_config, test_run_config):
+    """Test that rating_token_to_score correctly handles 0.0 scores (like 'fail') and doesn't return None.
+
+    This test verifies the fix for the bug where 'if not primary_token_score:' would incorrectly
+    treat 0.0 as falsy and return None, when it should only return None for actual None values.
+    """
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    class MockTopLogprob:
+        def __init__(self, token, logprob):
+            self.token = token
+            self.logprob = logprob
+
+    class MockTokenLogprob:
+        def __init__(self, token, top_logprobs, logprob):
+            self.token = token
+            self.top_logprobs = [MockTopLogprob(t, lp) for t, lp in top_logprobs]
+            self.logprob = logprob
+
+    # Test that "fail" token (which maps to 0.0) is handled correctly
+    token_logprob = MockTokenLogprob("fail", [("fail", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score == 0.0, f"Expected 0.0 for 'fail' token, got {score}"
+
+    # Test that "0" token (which maps to None) still returns None
+    token_logprob = MockTokenLogprob("0", [("0", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score is None, f"Expected None for '0' token, got {score}"
+
+    # Test weighted average case with fail token
+    token_logprob = MockTokenLogprob(
+        "fail",
+        [
+            ("fail", math.log(0.7)),  # 70% probability for fail (0.0)
+            ("pass", math.log(0.3)),  # 30% probability for pass (1.0)
+        ],
+        logprob=math.log(0.7),
+    )
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert pytest.approx(score) == 0.3  # (0.0 * 0.7 + 1.0 * 0.3)
 
 
 def test_g_eval_system_instruction():
@@ -501,3 +574,41 @@ async def test_all_built_in_models_logprobs_geval(
        model_name,
        provider_name.value,
    )
+
+
+def check_supports_llm_as_judge(model_name: str, provider_name: str):
+    for model in built_in_models:
+        if model.name != model_name:
+            continue
+        for provider in model.providers:
+            if provider.name != provider_name:
+                continue
+            if not provider.supports_structured_output:
+                pytest.skip(
+                    f"Skipping {model.name} {provider.name} because it does not support llm_as_judge (structured_output_mode)"
+                )
+            return
+    raise RuntimeError(f"No model {model_name} {provider_name} found")
+
+
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_all_built_in_models_llm_as_judge(
+    model_name,
+    provider_name,
+    test_task,
+    test_eval_config,
+    test_task_run,
+    test_run_config,
+):
+    check_supports_llm_as_judge(model_name, provider_name)
+    await run_g_eval_test(
+        test_task,
+        test_eval_config,
+        test_task_run,
+        EvalConfigType.llm_as_judge,
+        test_run_config,
+        model_name,
+        provider_name.value,
+    )
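
The zero-score test added above guards against a classic Python pitfall: 0.0 is falsy, so a truthiness check silently discards a legitimate "fail" score. A standalone illustration, independent of kiln_ai:

```python
def buggy_guard(score):
    # Bug: 0.0 is falsy, so a valid "fail" score is treated as missing.
    if not score:
        return None
    return score


def fixed_guard(score):
    # Fix: only treat an actual None as missing; 0.0 passes through unchanged.
    if score is None:
        return None
    return score


assert buggy_guard(0.0) is None  # valid score dropped
assert fixed_guard(0.0) == 0.0   # valid score preserved
```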

kiln_ai/adapters/fine_tune/base_finetune.py
CHANGED
@@ -3,14 +3,9 @@ from typing import Literal
 
 from pydantic import BaseModel
 
-from kiln_ai.
-from kiln_ai.datamodel import (
-    DatasetSplit,
-    FinetuneDataStrategy,
-    FineTuneStatusType,
-    Task,
-)
+from kiln_ai.datamodel import DatasetSplit, FineTuneStatusType, Task
 from kiln_ai.datamodel import Finetune as FinetuneModel
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.utils.name_generator import generate_memorable_name
 
 
@@ -62,7 +57,7 @@ class BaseFinetuneAdapter(ABC):
        train_split_name: str,
        system_message: str,
        thinking_instructions: str | None,
-        data_strategy:
+        data_strategy: ChatStrategy,
        parameters: dict[str, str | int | float | bool] = {},
        name: str | None = None,
        description: str | None = None,