kiln-ai 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. kiln_ai/adapters/chat/chat_formatter.py +0 -1
  2. kiln_ai/adapters/data_gen/data_gen_prompts.py +121 -36
  3. kiln_ai/adapters/data_gen/data_gen_task.py +49 -36
  4. kiln_ai/adapters/data_gen/test_data_gen_task.py +311 -34
  5. kiln_ai/adapters/eval/base_eval.py +6 -7
  6. kiln_ai/adapters/eval/eval_runner.py +5 -1
  7. kiln_ai/adapters/eval/g_eval.py +17 -12
  8. kiln_ai/adapters/eval/test_base_eval.py +8 -2
  9. kiln_ai/adapters/eval/test_g_eval.py +115 -5
  10. kiln_ai/adapters/fine_tune/base_finetune.py +1 -6
  11. kiln_ai/adapters/fine_tune/dataset_formatter.py +1 -5
  12. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +1 -1
  13. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +2 -7
  14. kiln_ai/adapters/fine_tune/together_finetune.py +1 -1
  15. kiln_ai/adapters/ml_model_list.py +293 -44
  16. kiln_ai/adapters/model_adapters/litellm_adapter.py +9 -0
  17. kiln_ai/adapters/model_adapters/test_base_adapter.py +0 -1
  18. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +48 -0
  19. kiln_ai/adapters/model_adapters/test_structured_output.py +3 -3
  20. kiln_ai/adapters/parsers/parser_registry.py +0 -2
  21. kiln_ai/adapters/parsers/r1_parser.py +0 -1
  22. kiln_ai/adapters/remote_config.py +66 -0
  23. kiln_ai/adapters/repair/repair_task.py +1 -6
  24. kiln_ai/adapters/test_ml_model_list.py +18 -0
  25. kiln_ai/adapters/test_prompt_adaptors.py +0 -4
  26. kiln_ai/adapters/test_remote_config.py +100 -0
  27. kiln_ai/datamodel/eval.py +32 -0
  28. kiln_ai/datamodel/finetune.py +0 -1
  29. kiln_ai/datamodel/task_output.py +0 -2
  30. kiln_ai/datamodel/task_run.py +0 -2
  31. kiln_ai/datamodel/test_eval_model.py +146 -4
  32. kiln_ai/utils/logging.py +4 -3
  33. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/METADATA +2 -2
  34. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/RECORD +36 -34
  35. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/WHEEL +0 -0
  36. {kiln_ai-0.17.0.dist-info → kiln_ai-0.18.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/eval/test_g_eval.py

@@ -274,6 +274,36 @@ def test_token_case():
     assert token.lower() == token


+def test_generate_run_description(test_eval_config, test_run_config, test_task_run):
+    """Test that generate_run_description correctly uses task_run.output.output (the string) rather than task_run.output (the object)."""
+    # Create G-Eval instance
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    # Call generate_run_description
+    description = g_eval.generate_run_description(
+        test_task_run.input, test_task_run.output.output
+    )
+
+    # Verify that the actual string output is in the description
+    expected_output = "Why did the chicken cross the road? To get to the other side!"
+    assert expected_output in description
+
+    # Verify that the input is also in the description
+    assert "Tell me a chicken joke" in description
+
+    # Verify the description has the expected structure
+    assert "<eval_data>" in description
+    assert description.count("<eval_data>") == 2  # 2 opening tags
+    assert description.count("</eval_data>") == 2  # 2 closing tags
+    assert "The model was given the following input for the task:" in description
+    assert "The model produced the following output for the task:" in description
+
+    # Verify that we're getting the actual string value, not a Python object representation
+    # The string should not contain 'TaskOutput' or other object indicators
+    assert "TaskOutput" not in description
+    assert "output=" not in description  # Would appear if object __repr__ was used
+
+
 def test_metric_offsets_and_search_ranges(
     test_eval_config, test_run_config, test_task_run
 ):
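For context, the assertions above pin down the shape of the run description: the task input and the plain output string are each wrapped in <eval_data> tags, and passing task_run.output.output (a str) keeps object reprs like "TaskOutput(output=...)" out of the prompt. A minimal sketch consistent with those assertions (not the actual g_eval.py implementation; the function name, wording, and wrapper layout are taken from the test, everything else is illustrative):

# Hypothetical sketch of a run description that would satisfy the test above.
def generate_run_description_sketch(task_input: str, task_output: str) -> str:
    return (
        "The model was given the following input for the task:\n"
        f"<eval_data>\n{task_input}\n</eval_data>\n\n"
        "The model produced the following output for the task:\n"
        f"<eval_data>\n{task_output}\n</eval_data>\n"
    )

print(
    generate_run_description_sketch(
        "Tell me a chicken joke",
        "Why did the chicken cross the road? To get to the other side!",
    )
)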
@@ -401,7 +431,7 @@ def test_rating_token_to_score(test_eval_config, test_run_config):

     # Test single token case
     token_logprob = MockTokenLogprob("5", [("5", 0.0)], logprob=1e-8)  # log(1) = 0
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
     assert score == 5.0

     # Test weighted average case
@@ -413,20 +443,62 @@ def test_rating_token_to_score(test_eval_config, test_run_config):
         ],
         logprob=math.log(0.6),
     )
-    score = g_eval.rating_token_to_score(token_logprob)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
     assert pytest.approx(score) == 4.4  # (4 * 0.6 + 5 * 0.4)

     # Test invalid token
     token_logprob = MockTokenLogprob(":", [(":", 0.0)], logprob=1e-8)
-    assert g_eval.rating_token_to_score(token_logprob) is None
+    assert g_eval.rating_token_to_score(token_logprob) is None  # type: ignore

     # Test missing from top logprobs
     token_logprob = MockTokenLogprob("5", [], logprob=1e-8)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore

     # Test missing from top logprobs, with special case logprob
     token_logprob = MockTokenLogprob("5", [], logprob=-9999)
-    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0
+    assert pytest.approx(g_eval.rating_token_to_score(token_logprob)) == 5.0  # type: ignore
+
+
+def test_rating_token_to_score_zero_score_bug_fix(test_eval_config, test_run_config):
+    """Test that rating_token_to_score correctly handles 0.0 scores (like 'fail') and doesn't return None.
+
+    This test verifies the fix for the bug where 'if not primary_token_score:' would incorrectly
+    treat 0.0 as falsy and return None, when it should only return None for actual None values.
+    """
+    g_eval = GEval(test_eval_config, test_run_config)
+
+    class MockTopLogprob:
+        def __init__(self, token, logprob):
+            self.token = token
+            self.logprob = logprob
+
+    class MockTokenLogprob:
+        def __init__(self, token, top_logprobs, logprob):
+            self.token = token
+            self.top_logprobs = [MockTopLogprob(t, lp) for t, lp in top_logprobs]
+            self.logprob = logprob
+
+    # Test that "fail" token (which maps to 0.0) is handled correctly
+    token_logprob = MockTokenLogprob("fail", [("fail", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score == 0.0, f"Expected 0.0 for 'fail' token, got {score}"
+
+    # Test that "0" token (which maps to None) still returns None
+    token_logprob = MockTokenLogprob("0", [("0", 0.0)], logprob=1e-8)
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert score is None, f"Expected None for '0' token, got {score}"
+
+    # Test weighted average case with fail token
+    token_logprob = MockTokenLogprob(
+        "fail",
+        [
+            ("fail", math.log(0.7)),  # 70% probability for fail (0.0)
+            ("pass", math.log(0.3)),  # 30% probability for pass (1.0)
+        ],
+        logprob=math.log(0.7),
+    )
+    score = g_eval.rating_token_to_score(token_logprob)  # type: ignore
+    assert pytest.approx(score) == 0.3  # (0.0 * 0.7 + 1.0 * 0.3)


 def test_g_eval_system_instruction():
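The new zero-score test guards against a classic Python pitfall: 0.0 is falsy, so a check like "if not primary_token_score:" silently discards a legitimate "fail" score. A standalone sketch of that pattern, plus the expected-value arithmetic asserted above (the token-to-score mapping here is assumed for illustration and is not taken from g_eval.py):

import math

# Assumed mapping for illustration: judge tokens to numeric scores.
TOKEN_TO_SCORE = {"pass": 1.0, "fail": 0.0}

def to_score_buggy(token):
    score = TOKEN_TO_SCORE.get(token)
    if not score:  # bug: 0.0 is falsy, so a valid "fail" score is dropped
        return None
    return score

def to_score_fixed(token):
    score = TOKEN_TO_SCORE.get(token)
    if score is None:  # fix: only reject tokens with no mapping at all
        return None
    return score

assert to_score_buggy("fail") is None  # the old behaviour the test catches
assert to_score_fixed("fail") == 0.0   # the fixed behaviour

# Weighted-average scoring as asserted in the test:
# P(fail) = 0.7 maps to 0.0, P(pass) = 0.3 maps to 1.0.
top_logprobs = {"fail": math.log(0.7), "pass": math.log(0.3)}
expected = sum(TOKEN_TO_SCORE[t] * math.exp(lp) for t, lp in top_logprobs.items())
assert math.isclose(expected, 0.3)  # 0.0 * 0.7 + 1.0 * 0.3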
@@ -502,3 +574,41 @@ async def test_all_built_in_models_logprobs_geval(
         model_name,
         provider_name.value,
     )
+
+
+def check_supports_llm_as_judge(model_name: str, provider_name: str):
+    for model in built_in_models:
+        if model.name != model_name:
+            continue
+        for provider in model.providers:
+            if provider.name != provider_name:
+                continue
+            if not provider.supports_structured_output:
+                pytest.skip(
+                    f"Skipping {model.name} {provider.name} because it does not support llm_as_judge (structured_output_mode)"
+                )
+            return
+    raise RuntimeError(f"No model {model_name} {provider_name} found")
+
+
+@pytest.mark.paid
+@pytest.mark.ollama
+@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers())
+async def test_all_built_in_models_llm_as_judge(
+    model_name,
+    provider_name,
+    test_task,
+    test_eval_config,
+    test_task_run,
+    test_run_config,
+):
+    check_supports_llm_as_judge(model_name, provider_name)
+    await run_g_eval_test(
+        test_task,
+        test_eval_config,
+        test_task_run,
+        EvalConfigType.llm_as_judge,
+        test_run_config,
+        model_name,
+        provider_name.value,
+    )
kiln_ai/adapters/fine_tune/base_finetune.py

@@ -3,12 +3,7 @@ from typing import Literal

 from pydantic import BaseModel

-from kiln_ai.adapters.ml_model_list import built_in_models
-from kiln_ai.datamodel import (
-    DatasetSplit,
-    FineTuneStatusType,
-    Task,
-)
+from kiln_ai.datamodel import DatasetSplit, FineTuneStatusType, Task
 from kiln_ai.datamodel import Finetune as FinetuneModel
 from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.utils.name_generator import generate_memorable_name
kiln_ai/adapters/fine_tune/dataset_formatter.py

@@ -1,15 +1,11 @@
 import json
 import tempfile
-from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, Protocol
 from uuid import uuid4

-from kiln_ai.adapters.chat.chat_formatter import (
-    ChatMessage,
-    get_chat_formatter,
-)
+from kiln_ai.adapters.chat.chat_formatter import ChatMessage, get_chat_formatter
 from kiln_ai.datamodel import DatasetSplit, TaskRun
 from kiln_ai.datamodel.datamodel_enums import THINKING_DATA_STRATEGIES, ChatStrategy
 from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
kiln_ai/adapters/fine_tune/test_dataset_formatter.py

@@ -857,7 +857,7 @@ def test_serialize_r1_style_message_missing_thinking(thinking, final_output):

 def test_vertex_gemini_role_map_coverage():
     """Test that VERTEX_GEMINI_ROLE_MAP covers all possible ChatMessage.role values"""
-    from typing import Literal, get_type_hints
+    from typing import get_type_hints

     # Get the Literal type from ChatMessage.role
     role_type = get_type_hints(ChatMessage)["role"]
kiln_ai/adapters/fine_tune/test_vertex_finetune.py

@@ -1,6 +1,5 @@
-import time
 from pathlib import Path
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock, patch

 import pytest
 from google.cloud import storage
@@ -10,11 +9,7 @@ from vertexai.tuning import sft
 from kiln_ai.adapters.fine_tune.base_finetune import FineTuneStatusType
 from kiln_ai.adapters.fine_tune.dataset_formatter import DatasetFormat, DatasetFormatter
 from kiln_ai.adapters.fine_tune.vertex_finetune import VertexFinetune
-from kiln_ai.datamodel import (
-    DatasetSplit,
-    StructuredOutputMode,
-    Task,
-)
+from kiln_ai.datamodel import DatasetSplit, StructuredOutputMode, Task
 from kiln_ai.datamodel import Finetune as FinetuneModel
 from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.datamodel.dataset_split import Train80Test20SplitDefinition
kiln_ai/adapters/fine_tune/together_finetune.py

@@ -1,4 +1,4 @@
-from typing import Literal, Tuple
+from typing import Tuple

 from together import Together
 from together.types.files import FilePurpose