kiln-ai 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff compares two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- kiln_ai/adapters/chat/chat_formatter.py +0 -1
- kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
- kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
- kiln_ai/adapters/data_gen/test_data_gen_task.py +311 -34
- kiln_ai/adapters/eval/base_eval.py +6 -7
- kiln_ai/adapters/eval/eval_runner.py +5 -1
- kiln_ai/adapters/eval/g_eval.py +17 -12
- kiln_ai/adapters/eval/test_base_eval.py +8 -2
- kiln_ai/adapters/eval/test_g_eval.py +115 -5
- kiln_ai/adapters/fine_tune/base_finetune.py +1 -6
- kiln_ai/adapters/fine_tune/dataset_formatter.py +1 -5
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +1 -1
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +2 -7
- kiln_ai/adapters/fine_tune/together_finetune.py +1 -1
- kiln_ai/adapters/ml_model_list.py +293 -44
- kiln_ai/adapters/model_adapters/litellm_adapter.py +9 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +0 -1
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +48 -0
- kiln_ai/adapters/model_adapters/test_structured_output.py +3 -3
- kiln_ai/adapters/parsers/parser_registry.py +0 -2
- kiln_ai/adapters/parsers/r1_parser.py +0 -1
- kiln_ai/adapters/remote_config.py +66 -0
- kiln_ai/adapters/repair/repair_task.py +1 -6
- kiln_ai/adapters/test_ml_model_list.py +18 -0
- kiln_ai/adapters/test_prompt_adaptors.py +0 -4
- kiln_ai/adapters/test_remote_config.py +100 -0
- kiln_ai/datamodel/eval.py +32 -0
- kiln_ai/datamodel/finetune.py +0 -1
- kiln_ai/datamodel/task_output.py +0 -2
- kiln_ai/datamodel/task_run.py +0 -2
- kiln_ai/datamodel/test_eval_model.py +146 -4
- kiln_ai/utils/logging.py +4 -3
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/RECORD +36 -34
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/test_g_eval.py

@@ -274,6 +274,36 @@ def test_token_case():
         assert token.lower() == token


+def test_generate_run_description(test_eval_config, test_run_config, test_task_run):
+    """Test that generate_run_description correctly uses task_run.output.output (the string) rather than task_run.output (the object)."""
+    # Create G-Eval instance
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    # Call generate_run_description
+    description = g_eval.generate_run_description(
+        test_task_run.input, test_task_run.output.output
+    )
+
+    # Verify that the actual string output is in the description
+    expected_output = "Why did the chicken cross the road? To get to the other side!"
+    assert expected_output in description
+
+    # Verify that the input is also in the description
+    assert "Tell me a chicken joke" in description
+
+    # Verify the description has the expected structure
+    assert "<eval_data>" in description
+    assert description.count("<eval_data>") == 2  # 2 opening tags
+    assert description.count("</eval_data>") == 2  # 2 closing tags
+    assert "The model was given the following input for the task:" in description
+    assert "The model produced the following output for the task:" in description
+
+    # Verify that we're getting the actual string value, not a Python object representation
+    # The string should not contain 'TaskOutput' or other object indicators
+    assert "TaskOutput" not in description
+    assert "output=" not in description  # Would appear if object __repr__ was used
+
+
 def test_metric_offsets_and_search_ranges(
     test_eval_config, test_run_config, test_task_run
 ):
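For context on what those assertions pin down: the run description is expected to wrap the raw input and output strings in `<eval_data>` tags, each introduced by a fixed sentence. Below is a minimal sketch that would satisfy the test; it is inferred purely from the assertions above, not taken from kiln_ai's actual `GEval.generate_run_description` implementation, and `build_run_description` is a hypothetical name.

```python
# Hypothetical sketch inferred from the assertions in test_generate_run_description;
# not kiln_ai's actual implementation.
def build_run_description(task_input: str, task_output: str) -> str:
    # Wrap the raw input and output strings (not their object reprs) in
    # <eval_data> tags, each introduced by the sentence the test checks for.
    return (
        "The model was given the following input for the task:\n"
        f"<eval_data>\n{task_input}\n</eval_data>\n\n"
        "The model produced the following output for the task:\n"
        f"<eval_data>\n{task_output}\n</eval_data>\n"
    )
```

Passing `task_run.output.output` (the plain string) rather than the `TaskOutput` object is exactly what the test guards against: an object would leak a repr such as `output='...'` into the evaluation prompt.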
@@ -401,7 +431,7 @@ def test_rating_token_to_score(test_eval_config, test_run_config):

     # Test single token case
     token_logprob = MockTokenLogprob("5", [("5", 0.0)], logprob=1e-8)  # log(1) = 0
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
     assert score == 5.0

     # Test weighted average case
@@ -413,20 +443,62 @@ def test_rating_token_to_score(test_eval_config, test_run_config):
         ],
         logprob=math.log(0.6),
     )
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
     assert pytest.approx(score) == 4.4  # (4 * 0.6 + 5 * 0.4)

     # Test invalid token
     token_logprob = MockTokenLogprob(":", [(":", 0.0)], logprob=1e-8)
-    assert g_eval.rating_token_to_score(token_logprob) is None
+    assert g_eval.rating_token_to_score(token_logprob) is None  # type: ignore

     # Test missing from top logprobs
     token_logprob = MockTokenLogprob("5", [], logprob=1e-8)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore

     # Test missing from top logprobs, with special case logprob
     token_logprob = MockTokenLogprob("5", [], logprob=-9999)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore
+
+
+def test_rating_token_to_score_zero_score_bug_fix(test_eval_config, test_run_config):
+    """Test that rating_token_to_score correctly handles 0.0 scores (like 'fail') and doesn't return None.
+
+    This test verifies the fix for the bug where 'if not primary_token_score:' would incorrectly
+    treat 0.0 as falsy and return None, when it should only return None for actual None values.
+    """
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    class MockTopLogprob:
+        def __init__(self, token, logprob):
+            self.token = token
+            self.logprob = logprob
+
+    class MockTokenLogprob:
+        def __init__(self, token, top_logprobs, logprob):
+            self.token = token
+            self.top_logprobs = [MockTopLogprob(t, lp) for t, lp in top_logprobs]
+            self.logprob = logprob
+
+    # Test that "fail" token (which maps to 0.0) is handled correctly
+    token_logprob = MockTokenLogprob("fail", [("fail", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score == 0.0, f"Expected 0.0 for 'fail' token, got {score}"
+
+    # Test that "0" token (which maps to None) still returns None
+    token_logprob = MockTokenLogprob("0", [("0", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score is None, f"Expected None for '0' token, got {score}"
+
+    # Test weighted average case with fail token
+    token_logprob = MockTokenLogprob(
+        "fail",
+        [
+            ("fail", math.log(0.7)),  # 70% probability for fail (0.0)
+            ("pass", math.log(0.3)),  # 30% probability for pass (1.0)
+        ],
+        logprob=math.log(0.7),
+    )
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert pytest.approx(score) == 0.3  # (0.0 * 0.7 + 1.0 * 0.3)


 def test_g_eval_system_instruction():
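The bug documented by the new test is a falsy-zero check: a guard written as `if not primary_token_score:` discards a legitimate 0.0 score (the "fail" rating) along with `None`. Below is a minimal sketch of the pitfall and of the probability-weighted scoring the surrounding tests exercise; it assumes a simple hypothetical token-to-score map and is an illustration, not the actual `rating_token_to_score` source.

```python
import math

# Hypothetical token-to-score map for illustration; the real mapping lives in
# kiln_ai/adapters/eval/g_eval.py ("fail" -> 0.0, "pass" -> 1.0, "5" -> 5.0, ...).
TOKEN_SCORES = {"fail": 0.0, "pass": 1.0, "1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0}

def rating_from_logprobs(token: str, top_logprobs: list[tuple[str, float]]) -> float | None:
    primary = TOKEN_SCORES.get(token)
    # Buggy guard: `if not primary:` would also discard a valid 0.0 ("fail").
    # The fix is an explicit None check:
    if primary is None:
        return None
    if not top_logprobs:
        return primary
    # Expected score over the candidate rating tokens, weighting each mapped
    # score by its probability (exp of its logprob),
    # e.g. 0.0 * 0.7 + 1.0 * 0.3 == 0.3.
    total = 0.0
    weight = 0.0
    for tok, logprob in top_logprobs:
        score = TOKEN_SCORES.get(tok)
        if score is None:
            continue
        p = math.exp(logprob)
        total += score * p
        weight += p
    return total / weight if weight else primary
```

The weighted branch is why the tests compare with `pytest.approx` (4 * 0.6 + 5 * 0.4 == 4.4, and 0.0 * 0.7 + 1.0 * 0.3 == 0.3) rather than exact equality.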
@@ -502,3 +574,41 @@ async def test_all_built_in_models_logprobs_geval(
         model_name,
         provider_name.value,
     )
+
+
+def check_supports_llm_as_judge(model_name: str, provider_name: str):
+    for model in built_in_models:
+        if model.name != model_name:
+            continue
+        for provider in model.providers:
+            if provider.name != provider_name:
+                continue
+            if not provider.supports_structured_output:
+                pytest.skip(
+                    f"Skipping {model.name} {provider.name} because it does not support llm_as_judge (structured_output_mode)"
+                )
+            return
+    raise RuntimeError(f"No model {model_name} {provider_name} found")
+
+
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_all_built_in_models_llm_as_judge(
+    model_name,
+    provider_name,
+    test_task,
+    test_eval_config,
+    test_task_run,
+    test_run_config,
+):
+    check_supports_llm_as_judge(model_name, provider_name)
+    await run_g_eval_test(
+        test_task,
+        test_eval_config,
+        test_task_run,
+        EvalConfigType.llm_as_judge,
+        test_run_config,
+        model_name,
+        provider_name.value,
+    )
kiln_ai/adapters/fine_tune/base_finetune.py

@@ -3,12 +3,7 @@ from typing import Literal

 from pydantic import BaseModel

-from kiln_ai.
-from kiln_ai.datamodel import (
-    DatasetSplit,
-    FineTuneStatusType,
-    Task,
-)
+from kiln_ai.datamodel import DatasetSplit, FineTuneStatusType, Task
 from kiln_ai.datamodel import Finetune as FinetuneModel
 from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.utils.name_generator import generate_memorable_name
kiln_ai/adapters/fine_tune/dataset_formatter.py

@@ -1,15 +1,11 @@
 import json
 import tempfile
-from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, Protocol
 from uuid import uuid4

-from kiln_ai.adapters.chat.chat_formatter import (
-    ChatMessage,
-    get_chat_formatter,
-)
+from kiln_ai.adapters.chat.chat_formatter import ChatMessage, get_chat_formatter
 from kiln_ai.datamodel import DatasetSplit, TaskRun
 from kiln_ai.datamodel.datamodel_enums import THINKING_DATA_STRATEGIES, ChatStrategy
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
kiln_ai/adapters/fine_tune/test_dataset_formatter.py

@@ -857,7 +857,7 @@ def test_serialize_r1_style_message_missing_thinking(thinking, final_output):

 def test_vertex_gemini_role_map_coverage():
     """Test that VERTEX_GEMINI_ROLE_MAP covers all possible ChatMessage.role values"""
-    from typing import
+    from typing import get_type_hints

     # Get the Literal type from ChatMessage.role
     role_type = get_type_hints(ChatMessage)["role"]
kiln_ai/adapters/fine_tune/test_vertex_finetune.py

@@ -1,6 +1,5 @@
-import time
 from pathlib import Path
-from unittest.mock import
+from unittest.mock import MagicMock, patch

 import pytest
 from google.cloud import storage

@@ -10,11 +9,7 @@ from vertexai.tuning import sft
 from kiln_ai.adapters.fine_tune.base_finetune import FineTuneStatusType
 from kiln_ai.adapters.fine_tune.dataset_formatter import DatasetFormat, DatasetFormatter
 from kiln_ai.adapters.fine_tune.vertex_finetune import VertexFinetune
-from kiln_ai.datamodel import (
-    DatasetSplit,
-    StructuredOutputMode,
-    Task,
-)
+from kiln_ai.datamodel import DatasetSplit, StructuredOutputMode, Task
 from kiln_ai.datamodel import Finetune as FinetuneModel
 from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.datamodel.dataset_split import Train80Test20SplitDefinition