kiln-ai 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Potentially problematic release: this version of kiln-ai might be problematic.
- kiln_ai/adapters/__init__.py +2 -0
- kiln_ai/adapters/adapter_registry.py +22 -44
- kiln_ai/adapters/chat/__init__.py +8 -0
- kiln_ai/adapters/chat/chat_formatter.py +234 -0
- kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +19 -6
- kiln_ai/adapters/eval/base_eval.py +8 -6
- kiln_ai/adapters/eval/eval_runner.py +9 -65
- kiln_ai/adapters/eval/g_eval.py +26 -8
- kiln_ai/adapters/eval/test_base_eval.py +166 -15
- kiln_ai/adapters/eval/test_eval_runner.py +3 -0
- kiln_ai/adapters/eval/test_g_eval.py +1 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +2 -2
- kiln_ai/adapters/fine_tune/dataset_formatter.py +153 -197
- kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +402 -211
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +4 -4
- kiln_ai/adapters/fine_tune/together_finetune.py +12 -1
- kiln_ai/adapters/ml_model_list.py +556 -45
- kiln_ai/adapters/model_adapters/base_adapter.py +100 -35
- kiln_ai/adapters/model_adapters/litellm_adapter.py +116 -100
- kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
- kiln_ai/adapters/model_adapters/test_base_adapter.py +299 -52
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +121 -22
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +44 -2
- kiln_ai/adapters/model_adapters/test_structured_output.py +48 -18
- kiln_ai/adapters/parsers/base_parser.py +0 -3
- kiln_ai/adapters/parsers/parser_registry.py +5 -3
- kiln_ai/adapters/parsers/r1_parser.py +17 -2
- kiln_ai/adapters/parsers/request_formatters.py +40 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +2 -2
- kiln_ai/adapters/parsers/test_r1_parser.py +44 -1
- kiln_ai/adapters/parsers/test_request_formatters.py +76 -0
- kiln_ai/adapters/prompt_builders.py +14 -17
- kiln_ai/adapters/provider_tools.py +39 -4
- kiln_ai/adapters/repair/test_repair_task.py +27 -5
- kiln_ai/adapters/test_adapter_registry.py +88 -28
- kiln_ai/adapters/test_ml_model_list.py +158 -0
- kiln_ai/adapters/test_prompt_adaptors.py +17 -3
- kiln_ai/adapters/test_prompt_builders.py +27 -19
- kiln_ai/adapters/test_provider_tools.py +130 -12
- kiln_ai/datamodel/__init__.py +2 -2
- kiln_ai/datamodel/datamodel_enums.py +43 -4
- kiln_ai/datamodel/dataset_filters.py +69 -1
- kiln_ai/datamodel/dataset_split.py +4 -0
- kiln_ai/datamodel/eval.py +8 -0
- kiln_ai/datamodel/finetune.py +13 -7
- kiln_ai/datamodel/prompt_id.py +1 -0
- kiln_ai/datamodel/task.py +68 -7
- kiln_ai/datamodel/task_output.py +1 -1
- kiln_ai/datamodel/task_run.py +39 -7
- kiln_ai/datamodel/test_basemodel.py +5 -8
- kiln_ai/datamodel/test_dataset_filters.py +82 -0
- kiln_ai/datamodel/test_dataset_split.py +2 -8
- kiln_ai/datamodel/test_example_models.py +54 -0
- kiln_ai/datamodel/test_models.py +80 -9
- kiln_ai/datamodel/test_task.py +168 -2
- kiln_ai/utils/async_job_runner.py +106 -0
- kiln_ai/utils/config.py +3 -2
- kiln_ai/utils/dataset_import.py +81 -19
- kiln_ai/utils/logging.py +165 -0
- kiln_ai/utils/test_async_job_runner.py +199 -0
- kiln_ai/utils/test_config.py +23 -0
- kiln_ai/utils/test_dataset_import.py +272 -10
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/METADATA +1 -1
- kiln_ai-0.17.0.dist-info/RECORD +113 -0
- kiln_ai-0.15.0.dist-info/RECORD +0 -104
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/test_provider_tools.py
CHANGED

@@ -5,6 +5,7 @@ import pytest
 from kiln_ai.adapters.ml_model_list import (
     KilnModel,
     ModelName,
+    ModelParserID,
     ModelProviderName,
 )
 from kiln_ai.adapters.ollama_tools import OllamaConnection
@@ -17,14 +18,20 @@ from kiln_ai.adapters.provider_tools import (
     finetune_provider_model,
     get_model_and_provider,
     kiln_model_provider_from,
-
+    lite_llm_config_for_openai_compatible,
     lite_llm_provider_model,
     parse_custom_model_id,
     provider_enabled,
     provider_name_from_id,
     provider_warnings,
 )
-from kiln_ai.datamodel import
+from kiln_ai.datamodel import (
+    Finetune,
+    StructuredOutputMode,
+    Task,
+)
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
+from kiln_ai.datamodel.task import RunConfigProperties


 @pytest.fixture(autouse=True)
@@ -65,6 +72,31 @@ def mock_finetune():
         finetune.provider = ModelProviderName.openai
         finetune.fine_tune_model_id = "ft:gpt-3.5-turbo:custom:model-123"
         finetune.structured_output_mode = StructuredOutputMode.json_schema
+        finetune.data_strategy = ChatStrategy.single_turn
+        mock.return_value = finetune
+        yield mock
+
+
+@pytest.fixture
+def mock_finetune_final_and_intermediate():
+    with patch("kiln_ai.datamodel.Finetune.from_id_and_parent_path") as mock:
+        finetune = Mock(spec=Finetune)
+        finetune.provider = ModelProviderName.openai
+        finetune.fine_tune_model_id = "ft:gpt-3.5-turbo:custom:model-123"
+        finetune.structured_output_mode = StructuredOutputMode.json_schema
+        finetune.data_strategy = ChatStrategy.two_message_cot
+        mock.return_value = finetune
+        yield mock
+
+
+@pytest.fixture
+def mock_finetune_r1_compatible():
+    with patch("kiln_ai.datamodel.Finetune.from_id_and_parent_path") as mock:
+        finetune = Mock(spec=Finetune)
+        finetune.provider = ModelProviderName.ollama
+        finetune.fine_tune_model_id = "ft:deepseek-r1:671b:custom:model-123"
+        finetune.structured_output_mode = StructuredOutputMode.json_schema
+        finetune.data_strategy = ChatStrategy.single_turn_r1_thinking
         mock.return_value = finetune
         yield mock

@@ -324,6 +356,7 @@ async def test_kiln_model_provider_from_custom_model_valid(mock_config):
     assert provider.supports_data_gen is False
     assert provider.untested_model is True
     assert provider.model_id == "custom_model"
+    assert provider.structured_output_mode == StructuredOutputMode.json_instructions


 @pytest.mark.asyncio
@@ -341,6 +374,7 @@ async def test_kiln_model_provider_from_custom_registry(mock_config):
     assert provider.supports_data_gen is False
     assert provider.untested_model is True
     assert provider.model_id == "gpt-4-turbo"
+    assert provider.structured_output_mode == StructuredOutputMode.json_instructions


 @pytest.mark.asyncio
@@ -426,6 +460,38 @@ def test_finetune_provider_model_success(mock_project, mock_task, mock_finetune)
     assert provider.name == ModelProviderName.openai
     assert provider.model_id == "ft:gpt-3.5-turbo:custom:model-123"
     assert provider.structured_output_mode == StructuredOutputMode.json_schema
+    assert provider.reasoning_capable is False
+    assert provider.parser == None
+
+
+def test_finetune_provider_model_success_final_and_intermediate(
+    mock_project, mock_task, mock_finetune_final_and_intermediate
+):
+    """Test successful creation of a fine-tuned model provider"""
+    model_id = "project-123::task-456::finetune-789"
+
+    provider = finetune_provider_model(model_id)
+
+    assert provider.name == ModelProviderName.openai
+    assert provider.model_id == "ft:gpt-3.5-turbo:custom:model-123"
+    assert provider.structured_output_mode == StructuredOutputMode.json_schema
+    assert provider.reasoning_capable is False
+    assert provider.parser == None
+
+
+def test_finetune_provider_model_success_r1_compatible(
+    mock_project, mock_task, mock_finetune_r1_compatible
+):
+    """Test successful creation of a fine-tuned model provider"""
+    model_id = "project-123::task-456::finetune-789"
+
+    provider = finetune_provider_model(model_id)
+
+    assert provider.name == ModelProviderName.ollama
+    assert provider.model_id == "ft:deepseek-r1:671b:custom:model-123"
+    assert provider.structured_output_mode == StructuredOutputMode.json_schema
+    assert provider.reasoning_capable is True
+    assert provider.parser == ModelParserID.r1_thinking


 def test_finetune_provider_model_invalid_id():
@@ -515,6 +581,7 @@ def test_finetune_provider_model_structured_mode(
     finetune.provider = provider_name
     finetune.fine_tune_model_id = "fireworks-model-123"
     finetune.structured_output_mode = structured_output_mode
+    finetune.data_strategy = ChatStrategy.single_turn
     mock_finetune.return_value = finetune

     provider = finetune_provider_model("project-123::task-456::finetune-789")
@@ -522,16 +589,28 @@ def test_finetune_provider_model_structured_mode(
     assert provider.name == provider_name
     assert provider.model_id == "fireworks-model-123"
     assert provider.structured_output_mode == expected_mode
+    assert provider.reasoning_capable is False
+    assert provider.parser == None


 def test_openai_compatible_provider_config(mock_shared_config):
     """Test successful creation of an OpenAI compatible provider"""
     model_id = "test_provider::gpt-4"

-    config =
+    config = lite_llm_config_for_openai_compatible(
+        RunConfigProperties(
+            model_name=model_id,
+            model_provider_name=ModelProviderName.openai_compatible,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        )
+    )

-    assert
-
+    assert (
+        config.run_config_properties.model_provider_name
+        == ModelProviderName.openai_compatible
+    )
+    assert config.run_config_properties.model_name == "gpt-4"
     assert config.additional_body_options == {"api_key": "test-key"}
     assert config.base_url == "https://api.test.com"

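For readers skimming the new API shape, here is a minimal usage sketch of lite_llm_config_for_openai_compatible, mirroring the test above. The provider name "my_provider" is hypothetical; it must match an OpenAI-compatible provider configured in Kiln's settings (name, base URL, optional API key).

from kiln_ai.adapters.provider_tools import lite_llm_config_for_openai_compatible
from kiln_ai.datamodel.datamodel_enums import ModelProviderName
from kiln_ai.datamodel.task import RunConfigProperties

config = lite_llm_config_for_openai_compatible(
    RunConfigProperties(
        model_name="my_provider::gpt-4",  # "<provider>::<model>" format
        model_provider_name=ModelProviderName.openai_compatible,
        prompt_id="simple_prompt_builder",
        structured_output_mode="json_schema",
    )
)

# The provider prefix is stripped from the model name; the provider's
# connection details come back alongside the run config properties.
print(config.run_config_properties.model_name)  # "gpt-4"
print(config.base_url)                          # the provider's base URL
print(config.additional_body_options)           # {"api_key": "..."} or {"api_key": "NA"}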
@@ -553,10 +632,20 @@ def test_lite_llm_config_no_api_key(mock_shared_config):
     """Test provider creation without API key (should work as some providers don't require it, but should pass NA to LiteLLM as it requires one)"""
     model_id = "no_key_provider::gpt-4"

-    config =
+    config = lite_llm_config_for_openai_compatible(
+        RunConfigProperties(
+            model_name=model_id,
+            model_provider_name=ModelProviderName.openai,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        )
+    )

-    assert
-
+    assert (
+        config.run_config_properties.model_provider_name
+        == ModelProviderName.openai_compatible
+    )
+    assert config.run_config_properties.model_name == "gpt-4"
     assert config.additional_body_options == {"api_key": "NA"}
     assert config.base_url == "https://api.nokey.com"

@@ -564,7 +653,14 @@ def test_lite_llm_config_no_api_key(mock_shared_config):
 def test_lite_llm_config_invalid_id():
     """Test handling of invalid model ID format"""
     with pytest.raises(ValueError) as exc_info:
-
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="invalid-id-format",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert (
         str(exc_info.value) == "Invalid openai compatible model ID: invalid-id-format"
     )
@@ -575,14 +671,28 @@ def test_lite_llm_config_no_providers(mock_shared_config):
     mock_shared_config.return_value.openai_compatible_providers = None

     with pytest.raises(ValueError) as exc_info:
-
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="test_provider::gpt-4",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert str(exc_info.value) == "OpenAI compatible provider test_provider not found"


 def test_lite_llm_config_provider_not_found(mock_shared_config):
     """Test handling of non-existent provider"""
     with pytest.raises(ValueError) as exc_info:
-
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="unknown_provider::gpt-4",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert (
         str(exc_info.value) == "OpenAI compatible provider unknown_provider not found"
     )
@@ -598,7 +708,14 @@ def test_lite_llm_config_no_base_url(mock_shared_config):
     ]

     with pytest.raises(ValueError) as exc_info:
-
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="test_provider::gpt-4",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert (
         str(exc_info.value)
         == "OpenAI compatible provider test_provider has no base URL"
@@ -799,6 +916,7 @@ def test_finetune_provider_model_vertex_ai(mock_project, mock_task, mock_finetun
     finetune.provider = ModelProviderName.vertex
     finetune.fine_tune_model_id = "projects/123/locations/us-central1/endpoints/456"
     finetune.structured_output_mode = StructuredOutputMode.json_mode
+    finetune.data_strategy = ChatStrategy.single_turn
     mock_finetune.return_value = finetune

     provider = finetune_provider_model("project-123::task-456::finetune-789")
kiln_ai/datamodel/__init__.py
CHANGED
@@ -13,7 +13,6 @@ from __future__ import annotations

 from kiln_ai.datamodel import dataset_split, eval, strict_mode
 from kiln_ai.datamodel.datamodel_enums import (
-    FinetuneDataStrategy,
     FineTuneStatusType,
     Priority,
     StructuredOutputMode,
@@ -44,6 +43,7 @@ from kiln_ai.datamodel.task_output import (
 )
 from kiln_ai.datamodel.task_run import (
     TaskRun,
+    Usage,
 )

 __all__ = [
@@ -70,8 +70,8 @@ __all__ = [
     "Prompt",
     "TaskOutputRating",
     "StructuredOutputMode",
-    "FinetuneDataStrategy",
     "PromptId",
     "PromptGenerators",
     "prompt_generator_values",
+    "Usage",
 ]
kiln_ai/datamodel/datamodel_enums.py
CHANGED

@@ -24,13 +24,14 @@ class StructuredOutputMode(str, Enum):
     """
     Enumeration of supported structured output modes.

-    - default: let the adapter decide
     - json_schema: request json using API capabilities for json_schema
     - function_calling: request json using API capabilities for function calling
     - json_mode: request json using API's JSON mode, which should return valid JSON, but isn't checking/passing the schema
     - json_instructions: append instructions to the prompt to request json matching the schema. No API capabilities are used. You should have a custom parser on these models as they will be returning strings.
     - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries).
     - json_custom_instructions: The model should output JSON, but custom instructions are already included in the system prompt. Don't append additional JSON instructions.
+    - default: let the adapter decide (legacy, do not use for new use cases)
+    - unknown: used for cases where the structured output mode is not known (on old models where it wasn't saved). Should lookup best option at runtime.
     """

     default = "default"
@@ -41,6 +42,7 @@ class StructuredOutputMode(str, Enum):
     json_instructions = "json_instructions"
     json_instruction_and_object = "json_instruction_and_object"
     json_custom_instructions = "json_custom_instructions"
+    unknown = "unknown"


 class FineTuneStatusType(str, Enum):
@@ -55,6 +57,43 @@ class FineTuneStatusType(str, Enum):
     failed = "failed"


-class
-
-
+class ChatStrategy(str, Enum):
+    """Strategy for how a chat is structured."""
+
+    # Single turn, immediately return the answer
+    single_turn = "final_only"
+    # Two turn, first turn is the thinking, second turn is the answer. Legacy format - used for old fine tunes but not new trains.
+    two_message_cot_legacy = "final_and_intermediate"
+    # Two turn, first turn is the thinking, second turn is the answer. New format - used for new trains.
+    two_message_cot = "two_message_cot"
+    # Single turn, with both the thinking and the answer in the same message, using R1-style thinking format in <think> tags
+    single_turn_r1_thinking = "final_and_intermediate_r1_compatible"
+
+
+THINKING_DATA_STRATEGIES: list[ChatStrategy] = [
+    ChatStrategy.two_message_cot_legacy,
+    ChatStrategy.single_turn_r1_thinking,
+    ChatStrategy.two_message_cot,
+]
+
+
+class ModelProviderName(str, Enum):
+    """
+    Enumeration of supported AI model providers.
+    """
+
+    openai = "openai"
+    groq = "groq"
+    amazon_bedrock = "amazon_bedrock"
+    ollama = "ollama"
+    openrouter = "openrouter"
+    fireworks_ai = "fireworks_ai"
+    kiln_fine_tune = "kiln_fine_tune"
+    kiln_custom_registry = "kiln_custom_registry"
+    openai_compatible = "openai_compatible"
+    anthropic = "anthropic"
+    gemini_api = "gemini_api"
+    azure_openai = "azure_openai"
+    huggingface = "huggingface"
+    vertex = "vertex"
+    together_ai = "together_ai"
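As a quick reference, a small sketch of how the new ChatStrategy values serialize and how THINKING_DATA_STRATEGIES marks the strategies that carry intermediate reasoning (variable names are local to this example):

from kiln_ai.datamodel.datamodel_enums import THINKING_DATA_STRATEGIES, ChatStrategy

# Serialized values keep the legacy spellings for backwards compatibility.
assert ChatStrategy.single_turn.value == "final_only"
assert ChatStrategy.two_message_cot_legacy.value == "final_and_intermediate"
assert ChatStrategy.two_message_cot.value == "two_message_cot"
assert ChatStrategy.single_turn_r1_thinking.value == "final_and_intermediate_r1_compatible"

# Which strategies include thinking/chain-of-thought content:
for strategy in ChatStrategy:
    print(strategy.value, strategy in THINKING_DATA_STRATEGIES)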
kiln_ai/datamodel/dataset_filters.py
CHANGED

@@ -1,5 +1,6 @@
+import re
 from enum import Enum
-from typing import Annotated, Protocol
+from typing import Annotated, ClassVar, List, Protocol

 from pydantic import AfterValidator

@@ -59,6 +60,65 @@ class TagFilter:
         return self.tag in task_run.tags


+class MultiDatasetFilter:
+    """
+    A filter that combines multiple filters using AND logic.
+    The filters are specified in a query string format after 'multi_filter::'
+    Example: multi_filter::high_rating&thinking_model&tag::tag_name
+
+    Ampersands in filter IDs can be escaped with a backslash.
+    """
+
+    PREFIX: ClassVar[str] = "multi_filter::"
+    ESCAPED_AMPERSAND: ClassVar[str] = r"\&"
+    UNESCAPED_AMPERSAND: ClassVar[str] = "&"
+
+    @classmethod
+    def parse_filter_string(cls, filter_string: str) -> List[str]:
+        """
+        Parse a filter string into individual filter IDs, handling escaped ampersands.
+        """
+        if not filter_string.startswith(cls.PREFIX):
+            raise ValueError(f"Filter string must start with {cls.PREFIX}")
+
+        # Remove the prefix
+        content = filter_string[len(cls.PREFIX) :]
+        if not content:
+            raise ValueError("No filters specified after prefix")
+
+        # Split on unescaped ampersands
+        # This regex matches & that are not preceded by a backslash
+        parts = re.split(r"(?<!\\)&", content)
+
+        # Unescape ampersands in each part
+        filter_ids = [
+            part.replace(cls.ESCAPED_AMPERSAND, cls.UNESCAPED_AMPERSAND)
+            for part in parts
+        ]
+
+        # Validate each filter ID using the existing validation
+        for fid in filter_ids:
+            _check_dataset_filter_id(fid)
+
+        return filter_ids
+
+    @classmethod
+    def is_valid_filter_string(cls, filter_string: str) -> bool:
+        """Check if a filter string is valid."""
+        try:
+            cls.parse_filter_string(filter_string)
+            return True
+        except ValueError:
+            return False
+
+    def __init__(self, filter_id: str):
+        filter_ids = MultiDatasetFilter.parse_filter_string(filter_id)
+        self.filters = [dataset_filter_from_id(fid) for fid in filter_ids]
+
+    def __call__(self, task_run: TaskRun) -> bool:
+        return all(f(task_run) for f in self.filters)
+
+
 class StaticDatasetFilters(str, Enum):
     """Dataset filter names."""

@@ -98,6 +158,11 @@ def _check_dataset_filter_id(id: str) -> str:
     if id.startswith("tag::") and len(id) > 5:
         return id

+    if id.startswith(MultiDatasetFilter.PREFIX):
+        if not MultiDatasetFilter.is_valid_filter_string(id):
+            raise ValueError(f"Invalid multi-filter string: {id}")
+        return id
+
     raise ValueError(f"Invalid dataset filter ID: {id}")


@@ -108,6 +173,9 @@ def dataset_filter_from_id(id: DatasetFilterId) -> DatasetFilter:
     if id.startswith("tag::") and len(id) > 5:
         return TagFilter(id[5:])

+    if id.startswith(MultiDatasetFilter.PREFIX):
+        return MultiDatasetFilter(id)
+
     if id in static_dataset_filters:
         return static_dataset_filters[id]

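A short usage sketch of the new multi-filter syntax through dataset_filter_from_id. The tag values are hypothetical; "high_rating" is one of the static filter IDs referenced in the class docstring, and task_runs stands in for any iterable of TaskRun objects.

from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id

# AND-combine an existing static filter with a tag filter.
combined = dataset_filter_from_id("multi_filter::high_rating&tag::golden_set")

# The returned filter is callable; keep only runs that pass every sub-filter.
kept = [run for run in task_runs if combined(run)]

# Ampersands inside a filter ID are escaped with a backslash.
escaped = dataset_filter_from_id(r"multi_filter::high_rating&tag::q\&a")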
kiln_ai/datamodel/dataset_split.py
CHANGED

@@ -45,6 +45,10 @@ Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
     DatasetSplitDefinition(name="train", percentage=0.8),
     DatasetSplitDefinition(name="test", percentage=0.2),
 ]
+Train80Val20SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.8),
+    DatasetSplitDefinition(name="val", percentage=0.2),
+]
 Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
     DatasetSplitDefinition(name="train", percentage=0.6),
     DatasetSplitDefinition(name="test", percentage=0.2),
kiln_ai/datamodel/eval.py
CHANGED
@@ -263,6 +263,10 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}
         default=None,
         description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
     )
+    current_run_config_id: ID_TYPE = Field(
+        default=None,
+        description="The id of the a run config which was selected as the best run config for this eval. The run config must belong to the parent Task.",
+    )
     eval_set_filter_id: DatasetFilterId = Field(
         description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
     )
@@ -272,6 +276,10 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}
     output_scores: List[EvalOutputScore] = Field(
         description="The scores this evaluator should produce."
     )
+    favourite: bool = Field(
+        default=False,
+        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
+    )

     # Workaround to return typed parent without importing Task
     def parent_task(self) -> Union["Task", None]:
kiln_ai/datamodel/finetune.py
CHANGED
@@ -5,7 +5,8 @@ from typing_extensions import Self

 from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel
 from kiln_ai.datamodel.datamodel_enums import (
-
+    THINKING_DATA_STRATEGIES,
+    ChatStrategy,
     FineTuneStatusType,
     StructuredOutputMode,
 )
@@ -13,6 +14,11 @@ from kiln_ai.datamodel.datamodel_enums import (
 if TYPE_CHECKING:
     from kiln_ai.datamodel.task import Task

+DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS = [
+    ChatStrategy.two_message_cot_legacy,
+    ChatStrategy.two_message_cot,
+]
+

 class Finetune(KilnParentedModel):
     """
@@ -75,8 +81,8 @@ class Finetune(KilnParentedModel):
         default={},
         description="Properties of the fine-tune. Different providers may use different properties.",
     )
-    data_strategy:
-        default=
+    data_strategy: ChatStrategy = Field(
+        default=ChatStrategy.single_turn,
         description="The strategy to use for training the model. 'final_only' will only train on the final response. 'final_and_intermediate' will train on the final response and intermediate outputs (chain of thought or reasoning).",
     )

@@ -90,16 +96,16 @@ class Finetune(KilnParentedModel):
     def validate_thinking_instructions(self) -> Self:
         if (
             self.thinking_instructions is not None
-            and self.data_strategy
+            and self.data_strategy not in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS
         ):
             raise ValueError(
-                "Thinking instructions can only be used when data_strategy is
+                f"Thinking instructions can only be used when data_strategy is one of the following: {DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS}"
             )
         if (
             self.thinking_instructions is None
-            and self.data_strategy
+            and self.data_strategy in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS
         ):
             raise ValueError(
-                "Thinking instructions are required when data_strategy is
+                f"Thinking instructions are required when data_strategy is one of the following: {DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS}"
             )
         return self
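To summarize the validator above: thinking_instructions must be present exactly when the data strategy is a two-message chain-of-thought strategy. A minimal sketch of that rule (the helper name and COT_STRATEGIES set are illustrative, not part of the package API):

from kiln_ai.datamodel.datamodel_enums import ChatStrategy

COT_STRATEGIES = {ChatStrategy.two_message_cot_legacy, ChatStrategy.two_message_cot}

def thinking_instructions_ok(data_strategy: ChatStrategy, thinking_instructions: str | None) -> bool:
    # Mirrors Finetune.validate_thinking_instructions: instructions are required
    # for two-message CoT strategies and rejected for every other strategy.
    return (data_strategy in COT_STRATEGIES) == (thinking_instructions is not None)

assert thinking_instructions_ok(ChatStrategy.single_turn, None)
assert not thinking_instructions_ok(ChatStrategy.two_message_cot, None)
assert thinking_instructions_ok(ChatStrategy.two_message_cot_legacy, "Think step by step.")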
kiln_ai/datamodel/prompt_id.py
CHANGED
@@ -13,6 +13,7 @@ class PromptGenerators(str, Enum):
     SIMPLE_CHAIN_OF_THOUGHT = "simple_chain_of_thought_prompt_builder"
     FEW_SHOT_CHAIN_OF_THOUGHT = "few_shot_chain_of_thought_prompt_builder"
     MULTI_SHOT_CHAIN_OF_THOUGHT = "multi_shot_chain_of_thought_prompt_builder"
+    SHORT = "short_prompt_builder"


 prompt_generator_values = [pg.value for pg in PromptGenerators]
kiln_ai/datamodel/task.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, Dict, List, Union

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationInfo, model_validator
+from typing_extensions import Self

 from kiln_ai.datamodel import Finetune
 from kiln_ai.datamodel.basemodel import (
@@ -11,7 +12,12 @@ from kiln_ai.datamodel.basemodel import (
     KilnParentedModel,
     KilnParentModel,
 )
-from kiln_ai.datamodel.datamodel_enums import
+from kiln_ai.datamodel.datamodel_enums import (
+    ModelProviderName,
+    Priority,
+    StructuredOutputMode,
+    TaskOutputRatingType,
+)
 from kiln_ai.datamodel.dataset_split import DatasetSplit
 from kiln_ai.datamodel.eval import Eval
 from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
@@ -47,12 +53,33 @@ class RunConfigProperties(BaseModel):
     """

     model_name: str = Field(description="The model to use for this run config.")
-    model_provider_name:
+    model_provider_name: ModelProviderName = Field(
         description="The provider to use for this run config."
     )
     prompt_id: PromptId = Field(
         description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
     )
+    top_p: float = Field(
+        default=1.0,
+        description="The top-p value to use for this run config. Defaults to 1.0.",
+    )
+    temperature: float = Field(
+        default=1.0,
+        description="The temperature to use for this run config. Defaults to 1.0.",
+    )
+    structured_output_mode: StructuredOutputMode = Field(
+        description="The structured output mode to use for this run config.",
+    )
+
+    @model_validator(mode="after")
+    def validate_required_fields(self) -> Self:
+        if not (0 <= self.top_p <= 1):
+            raise ValueError("top_p must be between 0 and 1")
+
+        elif self.temperature < 0 or self.temperature > 2:
+            raise ValueError("temperature must be between 0 and 2")
+
+        return self


 class RunConfig(RunConfigProperties):
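To make the new run-config fields concrete, a minimal construction sketch (the model name is an arbitrary example; the keyword arguments mirror the test code earlier in this diff):

from pydantic import ValidationError

from kiln_ai.datamodel.datamodel_enums import ModelProviderName, StructuredOutputMode
from kiln_ai.datamodel.task import RunConfigProperties

# Valid: top_p and temperature fall back to their defaults of 1.0.
props = RunConfigProperties(
    model_name="gpt-4",
    model_provider_name=ModelProviderName.openai,
    prompt_id="simple_prompt_builder",
    structured_output_mode=StructuredOutputMode.json_schema,
)

# Invalid: validate_required_fields rejects top_p outside [0, 1]
# (and temperature outside [0, 2]).
try:
    RunConfigProperties(
        model_name="gpt-4",
        model_provider_name=ModelProviderName.openai,
        prompt_id="simple_prompt_builder",
        structured_output_mode=StructuredOutputMode.json_schema,
        top_p=1.5,
    )
except ValidationError as err:
    print(err)  # "top_p must be between 0 and 1"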
@@ -101,12 +128,46 @@ class TaskRunConfig(KilnParentedModel):
         parent_task = self.parent_task()
         if parent_task is None:
             raise ValueError("Run config must be parented to a task")
-        return
+        return run_config_from_run_config_properties(
             task=parent_task,
-
-
-
+            run_config_properties=self.run_config_properties,
+        )
+
+    # Previously we didn't store structured_output_mode in the run_config_properties. Updgrade old models when loading from file.
+    @model_validator(mode="before")
+    def upgrade_old_entries(cls, data: dict, info: ValidationInfo) -> dict:
+        if not info.context or not info.context.get("loading_from_file", False):
+            # Not loading from file, so no need to upgrade
+            return data
+
+        if not isinstance(data, dict):
+            return data
+
+        structured_output_mode = data.get("run_config_properties", {}).get(
+            "structured_output_mode", None
         )
+        if structured_output_mode is None and "run_config_properties" in data:
+            # Default to unknown. Adapter will have to guess at runtime.
+            data["run_config_properties"]["structured_output_mode"] = (
+                StructuredOutputMode.unknown
+            )
+
+        return data
+
+
+def run_config_from_run_config_properties(
+    task: "Task",
+    run_config_properties: RunConfigProperties,
+) -> RunConfig:
+    return RunConfig(
+        task=task,
+        model_name=run_config_properties.model_name,
+        model_provider_name=run_config_properties.model_provider_name,
+        prompt_id=run_config_properties.prompt_id,
+        top_p=run_config_properties.top_p,
+        temperature=run_config_properties.temperature,
+        structured_output_mode=run_config_properties.structured_output_mode,
+    )


 class Task(
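For context on the before-validator above, a rough sketch of the upgrade it performs on data loaded from disk (the dict below is illustrative, not a complete serialized TaskRunConfig):

# Older Kiln versions saved run_config_properties without structured_output_mode.
old_data = {
    "run_config_properties": {
        "model_name": "gpt-4",
        "model_provider_name": "openai",
        "prompt_id": "simple_prompt_builder",
    }
}

# When loading_from_file, the validator fills in "unknown" so the adapter can
# pick the best structured output mode at runtime.
props = old_data["run_config_properties"]
props.setdefault("structured_output_mode", "unknown")

print(props["structured_output_mode"])  # "unknown"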
kiln_ai/datamodel/task_output.py
CHANGED
@@ -64,7 +64,7 @@ class TaskOutputRating(KilnBaseModel):
     )
     requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
         default={},
-        description="The ratings of the requirements of the task.",
+        description="The ratings of the requirements of the task. The ID can be either a task_requirement_id or a named rating for an eval_output_score name (in format 'named::<name>').",
     )

     # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.
|