kiln-ai 0.15.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)
  1. kiln_ai/adapters/__init__.py +2 -0
  2. kiln_ai/adapters/adapter_registry.py +22 -44
  3. kiln_ai/adapters/chat/__init__.py +8 -0
  4. kiln_ai/adapters/chat/chat_formatter.py +234 -0
  5. kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
  6. kiln_ai/adapters/data_gen/test_data_gen_task.py +19 -6
  7. kiln_ai/adapters/eval/base_eval.py +8 -6
  8. kiln_ai/adapters/eval/eval_runner.py +9 -65
  9. kiln_ai/adapters/eval/g_eval.py +26 -8
  10. kiln_ai/adapters/eval/test_base_eval.py +166 -15
  11. kiln_ai/adapters/eval/test_eval_runner.py +3 -0
  12. kiln_ai/adapters/eval/test_g_eval.py +1 -0
  13. kiln_ai/adapters/fine_tune/base_finetune.py +2 -2
  14. kiln_ai/adapters/fine_tune/dataset_formatter.py +153 -197
  15. kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
  16. kiln_ai/adapters/fine_tune/test_dataset_formatter.py +402 -211
  17. kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
  18. kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
  19. kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
  20. kiln_ai/adapters/fine_tune/test_vertex_finetune.py +4 -4
  21. kiln_ai/adapters/fine_tune/together_finetune.py +12 -1
  22. kiln_ai/adapters/ml_model_list.py +556 -45
  23. kiln_ai/adapters/model_adapters/base_adapter.py +100 -35
  24. kiln_ai/adapters/model_adapters/litellm_adapter.py +116 -100
  25. kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
  26. kiln_ai/adapters/model_adapters/test_base_adapter.py +299 -52
  27. kiln_ai/adapters/model_adapters/test_litellm_adapter.py +121 -22
  28. kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +44 -2
  29. kiln_ai/adapters/model_adapters/test_structured_output.py +48 -18
  30. kiln_ai/adapters/parsers/base_parser.py +0 -3
  31. kiln_ai/adapters/parsers/parser_registry.py +5 -3
  32. kiln_ai/adapters/parsers/r1_parser.py +17 -2
  33. kiln_ai/adapters/parsers/request_formatters.py +40 -0
  34. kiln_ai/adapters/parsers/test_parser_registry.py +2 -2
  35. kiln_ai/adapters/parsers/test_r1_parser.py +44 -1
  36. kiln_ai/adapters/parsers/test_request_formatters.py +76 -0
  37. kiln_ai/adapters/prompt_builders.py +14 -17
  38. kiln_ai/adapters/provider_tools.py +39 -4
  39. kiln_ai/adapters/repair/test_repair_task.py +27 -5
  40. kiln_ai/adapters/test_adapter_registry.py +88 -28
  41. kiln_ai/adapters/test_ml_model_list.py +158 -0
  42. kiln_ai/adapters/test_prompt_adaptors.py +17 -3
  43. kiln_ai/adapters/test_prompt_builders.py +27 -19
  44. kiln_ai/adapters/test_provider_tools.py +130 -12
  45. kiln_ai/datamodel/__init__.py +2 -2
  46. kiln_ai/datamodel/datamodel_enums.py +43 -4
  47. kiln_ai/datamodel/dataset_filters.py +69 -1
  48. kiln_ai/datamodel/dataset_split.py +4 -0
  49. kiln_ai/datamodel/eval.py +8 -0
  50. kiln_ai/datamodel/finetune.py +13 -7
  51. kiln_ai/datamodel/prompt_id.py +1 -0
  52. kiln_ai/datamodel/task.py +68 -7
  53. kiln_ai/datamodel/task_output.py +1 -1
  54. kiln_ai/datamodel/task_run.py +39 -7
  55. kiln_ai/datamodel/test_basemodel.py +5 -8
  56. kiln_ai/datamodel/test_dataset_filters.py +82 -0
  57. kiln_ai/datamodel/test_dataset_split.py +2 -8
  58. kiln_ai/datamodel/test_example_models.py +54 -0
  59. kiln_ai/datamodel/test_models.py +80 -9
  60. kiln_ai/datamodel/test_task.py +168 -2
  61. kiln_ai/utils/async_job_runner.py +106 -0
  62. kiln_ai/utils/config.py +3 -2
  63. kiln_ai/utils/dataset_import.py +81 -19
  64. kiln_ai/utils/logging.py +165 -0
  65. kiln_ai/utils/test_async_job_runner.py +199 -0
  66. kiln_ai/utils/test_config.py +23 -0
  67. kiln_ai/utils/test_dataset_import.py +272 -10
  68. {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/METADATA +1 -1
  69. kiln_ai-0.17.0.dist-info/RECORD +113 -0
  70. kiln_ai-0.15.0.dist-info/RECORD +0 -104
  71. {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/WHEEL +0 -0
  72. {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/adapters/test_provider_tools.py CHANGED
@@ -5,6 +5,7 @@ import pytest
 from kiln_ai.adapters.ml_model_list import (
     KilnModel,
     ModelName,
+    ModelParserID,
     ModelProviderName,
 )
 from kiln_ai.adapters.ollama_tools import OllamaConnection
@@ -17,14 +18,20 @@ from kiln_ai.adapters.provider_tools import (
     finetune_provider_model,
     get_model_and_provider,
     kiln_model_provider_from,
-    lite_llm_config,
+    lite_llm_config_for_openai_compatible,
     lite_llm_provider_model,
     parse_custom_model_id,
     provider_enabled,
     provider_name_from_id,
     provider_warnings,
 )
-from kiln_ai.datamodel import Finetune, StructuredOutputMode, Task
+from kiln_ai.datamodel import (
+    Finetune,
+    StructuredOutputMode,
+    Task,
+)
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
+from kiln_ai.datamodel.task import RunConfigProperties
 
 
 @pytest.fixture(autouse=True)
@@ -65,6 +72,31 @@ def mock_finetune():
         finetune.provider = ModelProviderName.openai
         finetune.fine_tune_model_id = "ft:gpt-3.5-turbo:custom:model-123"
         finetune.structured_output_mode = StructuredOutputMode.json_schema
+        finetune.data_strategy = ChatStrategy.single_turn
+        mock.return_value = finetune
+        yield mock
+
+
+@pytest.fixture
+def mock_finetune_final_and_intermediate():
+    with patch("kiln_ai.datamodel.Finetune.from_id_and_parent_path") as mock:
+        finetune = Mock(spec=Finetune)
+        finetune.provider = ModelProviderName.openai
+        finetune.fine_tune_model_id = "ft:gpt-3.5-turbo:custom:model-123"
+        finetune.structured_output_mode = StructuredOutputMode.json_schema
+        finetune.data_strategy = ChatStrategy.two_message_cot
+        mock.return_value = finetune
+        yield mock
+
+
+@pytest.fixture
+def mock_finetune_r1_compatible():
+    with patch("kiln_ai.datamodel.Finetune.from_id_and_parent_path") as mock:
+        finetune = Mock(spec=Finetune)
+        finetune.provider = ModelProviderName.ollama
+        finetune.fine_tune_model_id = "ft:deepseek-r1:671b:custom:model-123"
+        finetune.structured_output_mode = StructuredOutputMode.json_schema
+        finetune.data_strategy = ChatStrategy.single_turn_r1_thinking
         mock.return_value = finetune
         yield mock
 
@@ -324,6 +356,7 @@ async def test_kiln_model_provider_from_custom_model_valid(mock_config):
     assert provider.supports_data_gen is False
     assert provider.untested_model is True
     assert provider.model_id == "custom_model"
+    assert provider.structured_output_mode == StructuredOutputMode.json_instructions
 
 
 @pytest.mark.asyncio
@@ -341,6 +374,7 @@ async def test_kiln_model_provider_from_custom_registry(mock_config):
     assert provider.supports_data_gen is False
     assert provider.untested_model is True
     assert provider.model_id == "gpt-4-turbo"
+    assert provider.structured_output_mode == StructuredOutputMode.json_instructions
 
 
 @pytest.mark.asyncio
@@ -426,6 +460,38 @@ def test_finetune_provider_model_success(mock_project, mock_task, mock_finetune)
     assert provider.name == ModelProviderName.openai
     assert provider.model_id == "ft:gpt-3.5-turbo:custom:model-123"
     assert provider.structured_output_mode == StructuredOutputMode.json_schema
+    assert provider.reasoning_capable is False
+    assert provider.parser == None
+
+
+def test_finetune_provider_model_success_final_and_intermediate(
+    mock_project, mock_task, mock_finetune_final_and_intermediate
+):
+    """Test successful creation of a fine-tuned model provider"""
+    model_id = "project-123::task-456::finetune-789"
+
+    provider = finetune_provider_model(model_id)
+
+    assert provider.name == ModelProviderName.openai
+    assert provider.model_id == "ft:gpt-3.5-turbo:custom:model-123"
+    assert provider.structured_output_mode == StructuredOutputMode.json_schema
+    assert provider.reasoning_capable is False
+    assert provider.parser == None
+
+
+def test_finetune_provider_model_success_r1_compatible(
+    mock_project, mock_task, mock_finetune_r1_compatible
+):
+    """Test successful creation of a fine-tuned model provider"""
+    model_id = "project-123::task-456::finetune-789"
+
+    provider = finetune_provider_model(model_id)
+
+    assert provider.name == ModelProviderName.ollama
+    assert provider.model_id == "ft:deepseek-r1:671b:custom:model-123"
+    assert provider.structured_output_mode == StructuredOutputMode.json_schema
+    assert provider.reasoning_capable is True
+    assert provider.parser == ModelParserID.r1_thinking
 
 
 def test_finetune_provider_model_invalid_id():
@@ -515,6 +581,7 @@ def test_finetune_provider_model_structured_mode(
     finetune.provider = provider_name
     finetune.fine_tune_model_id = "fireworks-model-123"
     finetune.structured_output_mode = structured_output_mode
+    finetune.data_strategy = ChatStrategy.single_turn
     mock_finetune.return_value = finetune
 
     provider = finetune_provider_model("project-123::task-456::finetune-789")
@@ -522,16 +589,28 @@ def test_finetune_provider_model_structured_mode(
     assert provider.name == provider_name
     assert provider.model_id == "fireworks-model-123"
     assert provider.structured_output_mode == expected_mode
+    assert provider.reasoning_capable is False
+    assert provider.parser == None
 
 
 def test_openai_compatible_provider_config(mock_shared_config):
     """Test successful creation of an OpenAI compatible provider"""
     model_id = "test_provider::gpt-4"
 
-    config = lite_llm_config(model_id)
+    config = lite_llm_config_for_openai_compatible(
+        RunConfigProperties(
+            model_name=model_id,
+            model_provider_name=ModelProviderName.openai_compatible,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        )
+    )
 
-    assert config.provider_name == ModelProviderName.openai_compatible
-    assert config.model_name == "gpt-4"
+    assert (
+        config.run_config_properties.model_provider_name
+        == ModelProviderName.openai_compatible
+    )
+    assert config.run_config_properties.model_name == "gpt-4"
     assert config.additional_body_options == {"api_key": "test-key"}
     assert config.base_url == "https://api.test.com"
 
@@ -553,10 +632,20 @@ def test_lite_llm_config_no_api_key(mock_shared_config):
     """Test provider creation without API key (should work as some providers don't require it, but should pass NA to LiteLLM as it requires one)"""
     model_id = "no_key_provider::gpt-4"
 
-    config = lite_llm_config(model_id)
+    config = lite_llm_config_for_openai_compatible(
+        RunConfigProperties(
+            model_name=model_id,
+            model_provider_name=ModelProviderName.openai,
+            prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
+        )
+    )
 
-    assert config.provider_name == ModelProviderName.openai_compatible
-    assert config.model_name == "gpt-4"
+    assert (
+        config.run_config_properties.model_provider_name
+        == ModelProviderName.openai_compatible
+    )
+    assert config.run_config_properties.model_name == "gpt-4"
     assert config.additional_body_options == {"api_key": "NA"}
     assert config.base_url == "https://api.nokey.com"
 
@@ -564,7 +653,14 @@ def test_lite_llm_config_no_api_key(mock_shared_config):
 def test_lite_llm_config_invalid_id():
     """Test handling of invalid model ID format"""
     with pytest.raises(ValueError) as exc_info:
-        lite_llm_config("invalid-id-format")
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="invalid-id-format",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert (
         str(exc_info.value) == "Invalid openai compatible model ID: invalid-id-format"
     )
@@ -575,14 +671,28 @@ def test_lite_llm_config_no_providers(mock_shared_config):
     mock_shared_config.return_value.openai_compatible_providers = None
 
     with pytest.raises(ValueError) as exc_info:
-        lite_llm_config("test_provider::gpt-4")
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="test_provider::gpt-4",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert str(exc_info.value) == "OpenAI compatible provider test_provider not found"
 
 
 def test_lite_llm_config_provider_not_found(mock_shared_config):
     """Test handling of non-existent provider"""
     with pytest.raises(ValueError) as exc_info:
-        lite_llm_config("unknown_provider::gpt-4")
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="unknown_provider::gpt-4",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert (
         str(exc_info.value) == "OpenAI compatible provider unknown_provider not found"
     )
@@ -598,7 +708,14 @@ def test_lite_llm_config_no_base_url(mock_shared_config):
     ]
 
     with pytest.raises(ValueError) as exc_info:
-        lite_llm_config("test_provider::gpt-4")
+        lite_llm_config_for_openai_compatible(
+            RunConfigProperties(
+                model_name="test_provider::gpt-4",
+                model_provider_name=ModelProviderName.openai_compatible,
+                prompt_id="simple_prompt_builder",
+                structured_output_mode="json_schema",
+            )
+        )
     assert (
         str(exc_info.value)
         == "OpenAI compatible provider test_provider has no base URL"
@@ -799,6 +916,7 @@ def test_finetune_provider_model_vertex_ai(mock_project, mock_task, mock_finetun
     finetune.provider = ModelProviderName.vertex
     finetune.fine_tune_model_id = "projects/123/locations/us-central1/endpoints/456"
    finetune.structured_output_mode = StructuredOutputMode.json_mode
+    finetune.data_strategy = ChatStrategy.single_turn
     mock_finetune.return_value = finetune
 
     provider = finetune_provider_model("project-123::task-456::finetune-789")
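
These tests illustrate the signature change from lite_llm_config(model_id) to lite_llm_config_for_openai_compatible(RunConfigProperties(...)). Below is a minimal sketch of the new call shape outside the test harness; it assumes an OpenAI-compatible provider named "test_provider" has been configured in Kiln's settings (the tests above mock that configuration), and is not a definitive usage guide.

    from kiln_ai.adapters.provider_tools import lite_llm_config_for_openai_compatible
    from kiln_ai.datamodel.datamodel_enums import ModelProviderName, StructuredOutputMode
    from kiln_ai.datamodel.task import RunConfigProperties

    config = lite_llm_config_for_openai_compatible(
        RunConfigProperties(
            model_name="test_provider::gpt-4",  # "<provider_name>::<model_name>" id format
            model_provider_name=ModelProviderName.openai_compatible,
            prompt_id="simple_prompt_builder",
            structured_output_mode=StructuredOutputMode.json_schema,
        )
    )

    # The provider prefix is stripped from the model name; the provider's base URL and
    # API key (or "NA" when none is set) are carried on the returned config.
    print(config.run_config_properties.model_name)  # "gpt-4"
    print(config.base_url)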
kiln_ai/datamodel/__init__.py CHANGED
@@ -13,7 +13,6 @@ from __future__ import annotations
 
 from kiln_ai.datamodel import dataset_split, eval, strict_mode
 from kiln_ai.datamodel.datamodel_enums import (
-    FinetuneDataStrategy,
     FineTuneStatusType,
     Priority,
     StructuredOutputMode,
@@ -44,6 +43,7 @@ from kiln_ai.datamodel.task_output import (
 )
 from kiln_ai.datamodel.task_run import (
     TaskRun,
+    Usage,
 )
 
 __all__ = [
@@ -70,8 +70,8 @@ __all__ = [
     "Prompt",
     "TaskOutputRating",
     "StructuredOutputMode",
-    "FinetuneDataStrategy",
     "PromptId",
     "PromptGenerators",
     "prompt_generator_values",
+    "Usage",
 ]
kiln_ai/datamodel/datamodel_enums.py CHANGED
@@ -24,13 +24,14 @@ class StructuredOutputMode(str, Enum):
     """
     Enumeration of supported structured output modes.
 
-    - default: let the adapter decide
     - json_schema: request json using API capabilities for json_schema
     - function_calling: request json using API capabilities for function calling
     - json_mode: request json using API's JSON mode, which should return valid JSON, but isn't checking/passing the schema
     - json_instructions: append instructions to the prompt to request json matching the schema. No API capabilities are used. You should have a custom parser on these models as they will be returning strings.
     - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries).
     - json_custom_instructions: The model should output JSON, but custom instructions are already included in the system prompt. Don't append additional JSON instructions.
+    - default: let the adapter decide (legacy, do not use for new use cases)
+    - unknown: used for cases where the structured output mode is not known (on old models where it wasn't saved). Should lookup best option at runtime.
     """
 
     default = "default"
@@ -41,6 +42,7 @@ class StructuredOutputMode(str, Enum):
     json_instructions = "json_instructions"
     json_instruction_and_object = "json_instruction_and_object"
     json_custom_instructions = "json_custom_instructions"
+    unknown = "unknown"
 
 
 class FineTuneStatusType(str, Enum):
@@ -55,6 +57,43 @@ class FineTuneStatusType(str, Enum):
     failed = "failed"
 
 
-class FinetuneDataStrategy(str, Enum):
-    final_only = "final_only"
-    final_and_intermediate = "final_and_intermediate"
+class ChatStrategy(str, Enum):
+    """Strategy for how a chat is structured."""
+
+    # Single turn, immediately return the answer
+    single_turn = "final_only"
+    # Two turn, first turn is the thinking, second turn is the answer. Legacy format - used for old fine tunes but not new trains.
+    two_message_cot_legacy = "final_and_intermediate"
+    # Two turn, first turn is the thinking, second turn is the answer. New format - used for new trains.
+    two_message_cot = "two_message_cot"
+    # Single turn, with both the thinking and the answer in the same message, using R1-style thinking format in <think> tags
+    single_turn_r1_thinking = "final_and_intermediate_r1_compatible"
+
+
+THINKING_DATA_STRATEGIES: list[ChatStrategy] = [
+    ChatStrategy.two_message_cot_legacy,
+    ChatStrategy.single_turn_r1_thinking,
+    ChatStrategy.two_message_cot,
+]
+
+
+class ModelProviderName(str, Enum):
+    """
+    Enumeration of supported AI model providers.
+    """
+
+    openai = "openai"
+    groq = "groq"
+    amazon_bedrock = "amazon_bedrock"
+    ollama = "ollama"
+    openrouter = "openrouter"
+    fireworks_ai = "fireworks_ai"
+    kiln_fine_tune = "kiln_fine_tune"
+    kiln_custom_registry = "kiln_custom_registry"
+    openai_compatible = "openai_compatible"
+    anthropic = "anthropic"
+    gemini_api = "gemini_api"
+    azure_openai = "azure_openai"
+    huggingface = "huggingface"
+    vertex = "vertex"
+    together_ai = "together_ai"
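
The renamed strategies keep their old serialized string values, so data written by 0.15.x still deserializes. A minimal sketch of what the enum change above implies, using only the names shown in this hunk:

    from kiln_ai.datamodel.datamodel_enums import ChatStrategy, THINKING_DATA_STRATEGIES

    # Legacy serialized values map onto the renamed members, so old files still load.
    assert ChatStrategy("final_only") is ChatStrategy.single_turn
    assert ChatStrategy("final_and_intermediate") is ChatStrategy.two_message_cot_legacy
    assert ChatStrategy("final_and_intermediate_r1_compatible") is ChatStrategy.single_turn_r1_thinking

    # Only single_turn carries no separate thinking/reasoning data.
    assert ChatStrategy.single_turn not in THINKING_DATA_STRATEGIES
    assert ChatStrategy.two_message_cot in THINKING_DATA_STRATEGIES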
kiln_ai/datamodel/dataset_filters.py CHANGED
@@ -1,5 +1,6 @@
+import re
 from enum import Enum
-from typing import Annotated, Protocol
+from typing import Annotated, ClassVar, List, Protocol
 
 from pydantic import AfterValidator
 
@@ -59,6 +60,65 @@ class TagFilter:
         return self.tag in task_run.tags
 
 
+class MultiDatasetFilter:
+    """
+    A filter that combines multiple filters using AND logic.
+    The filters are specified in a query string format after 'multi_filter::'
+    Example: multi_filter::high_rating&thinking_model&tag::tag_name
+
+    Ampersands in filter IDs can be escaped with a backslash.
+    """
+
+    PREFIX: ClassVar[str] = "multi_filter::"
+    ESCAPED_AMPERSAND: ClassVar[str] = r"\&"
+    UNESCAPED_AMPERSAND: ClassVar[str] = "&"
+
+    @classmethod
+    def parse_filter_string(cls, filter_string: str) -> List[str]:
+        """
+        Parse a filter string into individual filter IDs, handling escaped ampersands.
+        """
+        if not filter_string.startswith(cls.PREFIX):
+            raise ValueError(f"Filter string must start with {cls.PREFIX}")
+
+        # Remove the prefix
+        content = filter_string[len(cls.PREFIX) :]
+        if not content:
+            raise ValueError("No filters specified after prefix")
+
+        # Split on unescaped ampersands
+        # This regex matches & that are not preceded by a backslash
+        parts = re.split(r"(?<!\\)&", content)
+
+        # Unescape ampersands in each part
+        filter_ids = [
+            part.replace(cls.ESCAPED_AMPERSAND, cls.UNESCAPED_AMPERSAND)
+            for part in parts
+        ]
+
+        # Validate each filter ID using the existing validation
+        for fid in filter_ids:
+            _check_dataset_filter_id(fid)
+
+        return filter_ids
+
+    @classmethod
+    def is_valid_filter_string(cls, filter_string: str) -> bool:
+        """Check if a filter string is valid."""
+        try:
+            cls.parse_filter_string(filter_string)
+            return True
+        except ValueError:
+            return False
+
+    def __init__(self, filter_id: str):
+        filter_ids = MultiDatasetFilter.parse_filter_string(filter_id)
+        self.filters = [dataset_filter_from_id(fid) for fid in filter_ids]
+
+    def __call__(self, task_run: TaskRun) -> bool:
+        return all(f(task_run) for f in self.filters)
+
+
 class StaticDatasetFilters(str, Enum):
     """Dataset filter names."""
 
@@ -98,6 +158,11 @@ def _check_dataset_filter_id(id: str) -> str:
     if id.startswith("tag::") and len(id) > 5:
         return id
 
+    if id.startswith(MultiDatasetFilter.PREFIX):
+        if not MultiDatasetFilter.is_valid_filter_string(id):
+            raise ValueError(f"Invalid multi-filter string: {id}")
+        return id
+
     raise ValueError(f"Invalid dataset filter ID: {id}")
 
 
@@ -108,6 +173,9 @@ def dataset_filter_from_id(id: DatasetFilterId) -> DatasetFilter:
     if id.startswith("tag::") and len(id) > 5:
         return TagFilter(id[5:])
 
+    if id.startswith(MultiDatasetFilter.PREFIX):
+        return MultiDatasetFilter(id)
+
     if id in static_dataset_filters:
         return static_dataset_filters[id]
 
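The multi-filter id is a plain string, so it can be stored anywhere a DatasetFilterId is accepted. A minimal usage sketch based on the docstring above; it assumes "high_rating" remains one of the static filter ids, as the docstring example suggests:

    from kiln_ai.datamodel.dataset_filters import (
        MultiDatasetFilter,
        dataset_filter_from_id,
    )

    # AND-combine a static filter with a tag filter. Escape a literal "&" inside a sub-filter id as "\&".
    filter_id = "multi_filter::high_rating&tag::golden"

    assert MultiDatasetFilter.is_valid_filter_string(filter_id)
    assert MultiDatasetFilter.parse_filter_string(filter_id) == ["high_rating", "tag::golden"]

    # Returns a callable that accepts a TaskRun only when every sub-filter accepts it.
    combined = dataset_filter_from_id(filter_id)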
kiln_ai/datamodel/dataset_split.py CHANGED
@@ -45,6 +45,10 @@ Train80Test20SplitDefinition: list[DatasetSplitDefinition] = [
     DatasetSplitDefinition(name="train", percentage=0.8),
     DatasetSplitDefinition(name="test", percentage=0.2),
 ]
+Train80Val20SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.8),
+    DatasetSplitDefinition(name="val", percentage=0.2),
+]
 Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
     DatasetSplitDefinition(name="train", percentage=0.6),
     DatasetSplitDefinition(name="test", percentage=0.2),
kiln_ai/datamodel/eval.py CHANGED
@@ -263,6 +263,10 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}
         default=None,
         description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
     )
+    current_run_config_id: ID_TYPE = Field(
+        default=None,
+        description="The id of the a run config which was selected as the best run config for this eval. The run config must belong to the parent Task.",
+    )
     eval_set_filter_id: DatasetFilterId = Field(
         description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
     )
@@ -272,6 +276,10 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}
     output_scores: List[EvalOutputScore] = Field(
         description="The scores this evaluator should produce."
     )
+    favourite: bool = Field(
+        default=False,
+        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
+    )
 
     # Workaround to return typed parent without importing Task
     def parent_task(self) -> Union["Task", None]:
kiln_ai/datamodel/finetune.py CHANGED
@@ -5,7 +5,8 @@ from typing_extensions import Self
 
 from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel
 from kiln_ai.datamodel.datamodel_enums import (
-    FinetuneDataStrategy,
+    THINKING_DATA_STRATEGIES,
+    ChatStrategy,
     FineTuneStatusType,
     StructuredOutputMode,
 )
@@ -13,6 +14,11 @@ from kiln_ai.datamodel.datamodel_enums import (
 if TYPE_CHECKING:
     from kiln_ai.datamodel.task import Task
 
+DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS = [
+    ChatStrategy.two_message_cot_legacy,
+    ChatStrategy.two_message_cot,
+]
+
 
 class Finetune(KilnParentedModel):
     """
@@ -75,8 +81,8 @@ class Finetune(KilnParentedModel):
         default={},
         description="Properties of the fine-tune. Different providers may use different properties.",
     )
-    data_strategy: FinetuneDataStrategy = Field(
-        default=FinetuneDataStrategy.final_only,
+    data_strategy: ChatStrategy = Field(
+        default=ChatStrategy.single_turn,
         description="The strategy to use for training the model. 'final_only' will only train on the final response. 'final_and_intermediate' will train on the final response and intermediate outputs (chain of thought or reasoning).",
     )
 
@@ -90,16 +96,16 @@ class Finetune(KilnParentedModel):
     def validate_thinking_instructions(self) -> Self:
         if (
             self.thinking_instructions is not None
-            and self.data_strategy != FinetuneDataStrategy.final_and_intermediate
+            and self.data_strategy not in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS
         ):
             raise ValueError(
-                "Thinking instructions can only be used when data_strategy is final_and_intermediate"
+                f"Thinking instructions can only be used when data_strategy is one of the following: {DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS}"
             )
         if (
             self.thinking_instructions is None
-            and self.data_strategy == FinetuneDataStrategy.final_and_intermediate
+            and self.data_strategy in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS
         ):
             raise ValueError(
-                "Thinking instructions are required when data_strategy is final_and_intermediate"
+                f"Thinking instructions are required when data_strategy is one of the following: {DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS}"
             )
         return self
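
The validator now keys off which chat strategies need separate thinking instructions. A short sketch of the resulting rule, assuming the module-level constant added above is importable from kiln_ai.datamodel.finetune (spelling as in the source):

    from kiln_ai.datamodel.datamodel_enums import ChatStrategy
    from kiln_ai.datamodel.finetune import DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS

    # Two-message chain-of-thought strategies require thinking_instructions on the Finetune.
    assert ChatStrategy.two_message_cot in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS
    assert ChatStrategy.two_message_cot_legacy in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS

    # R1-style thinking is inlined in <think> tags, so it does not require them; neither does single_turn.
    assert ChatStrategy.single_turn_r1_thinking not in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS
    assert ChatStrategy.single_turn not in DATA_STRATIGIES_REQUIRED_THINKING_INSTRUCTIONS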
kiln_ai/datamodel/prompt_id.py CHANGED
@@ -13,6 +13,7 @@ class PromptGenerators(str, Enum):
     SIMPLE_CHAIN_OF_THOUGHT = "simple_chain_of_thought_prompt_builder"
     FEW_SHOT_CHAIN_OF_THOUGHT = "few_shot_chain_of_thought_prompt_builder"
     MULTI_SHOT_CHAIN_OF_THOUGHT = "multi_shot_chain_of_thought_prompt_builder"
+    SHORT = "short_prompt_builder"
 
 
 prompt_generator_values = [pg.value for pg in PromptGenerators]
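
A small sketch of how the new generator id appears to callers, using only the names visible in this hunk:

    from kiln_ai.datamodel.prompt_id import PromptGenerators, prompt_generator_values

    # The new generator is addressed by its string id, like the other prompt builders.
    assert PromptGenerators.SHORT.value == "short_prompt_builder"
    assert "short_prompt_builder" in prompt_generator_values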
kiln_ai/datamodel/task.py CHANGED
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, Dict, List, Union
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationInfo, model_validator
+from typing_extensions import Self
 
 from kiln_ai.datamodel import Finetune
 from kiln_ai.datamodel.basemodel import (
@@ -11,7 +12,12 @@ from kiln_ai.datamodel.basemodel import (
     KilnParentedModel,
     KilnParentModel,
 )
-from kiln_ai.datamodel.datamodel_enums import Priority, TaskOutputRatingType
+from kiln_ai.datamodel.datamodel_enums import (
+    ModelProviderName,
+    Priority,
+    StructuredOutputMode,
+    TaskOutputRatingType,
+)
 from kiln_ai.datamodel.dataset_split import DatasetSplit
 from kiln_ai.datamodel.eval import Eval
 from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
@@ -47,12 +53,33 @@ class RunConfigProperties(BaseModel):
     """
 
     model_name: str = Field(description="The model to use for this run config.")
-    model_provider_name: str = Field(
+    model_provider_name: ModelProviderName = Field(
         description="The provider to use for this run config."
     )
     prompt_id: PromptId = Field(
         description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.",
     )
+    top_p: float = Field(
+        default=1.0,
+        description="The top-p value to use for this run config. Defaults to 1.0.",
+    )
+    temperature: float = Field(
+        default=1.0,
+        description="The temperature to use for this run config. Defaults to 1.0.",
+    )
+    structured_output_mode: StructuredOutputMode = Field(
+        description="The structured output mode to use for this run config.",
+    )
+
+    @model_validator(mode="after")
+    def validate_required_fields(self) -> Self:
+        if not (0 <= self.top_p <= 1):
+            raise ValueError("top_p must be between 0 and 1")
+
+        elif self.temperature < 0 or self.temperature > 2:
+            raise ValueError("temperature must be between 0 and 2")
+
+        return self
 
 
 class RunConfig(RunConfigProperties):
@@ -101,12 +128,46 @@ class TaskRunConfig(KilnParentedModel):
         parent_task = self.parent_task()
         if parent_task is None:
             raise ValueError("Run config must be parented to a task")
-        return RunConfig(
+        return run_config_from_run_config_properties(
             task=parent_task,
-            model_name=self.run_config_properties.model_name,
-            model_provider_name=self.run_config_properties.model_provider_name,
-            prompt_id=self.run_config_properties.prompt_id,
+            run_config_properties=self.run_config_properties,
+        )
+
+    # Previously we didn't store structured_output_mode in the run_config_properties. Updgrade old models when loading from file.
+    @model_validator(mode="before")
+    def upgrade_old_entries(cls, data: dict, info: ValidationInfo) -> dict:
+        if not info.context or not info.context.get("loading_from_file", False):
+            # Not loading from file, so no need to upgrade
+            return data
+
+        if not isinstance(data, dict):
+            return data
+
+        structured_output_mode = data.get("run_config_properties", {}).get(
+            "structured_output_mode", None
         )
+        if structured_output_mode is None and "run_config_properties" in data:
+            # Default to unknown. Adapter will have to guess at runtime.
+            data["run_config_properties"]["structured_output_mode"] = (
+                StructuredOutputMode.unknown
+            )
+
+        return data
+
+
+def run_config_from_run_config_properties(
+    task: "Task",
+    run_config_properties: RunConfigProperties,
+) -> RunConfig:
+    return RunConfig(
+        task=task,
+        model_name=run_config_properties.model_name,
+        model_provider_name=run_config_properties.model_provider_name,
+        prompt_id=run_config_properties.prompt_id,
+        top_p=run_config_properties.top_p,
+        temperature=run_config_properties.temperature,
+        structured_output_mode=run_config_properties.structured_output_mode,
+    )
 
 
 class Task(
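
A minimal sketch of the stricter RunConfigProperties contract introduced above. The prompt id "simple_prompt_builder" is taken from the test updates earlier in this diff; everything else uses only names shown in the hunk.

    from pydantic import ValidationError

    from kiln_ai.datamodel.datamodel_enums import ModelProviderName, StructuredOutputMode
    from kiln_ai.datamodel.task import RunConfigProperties

    # structured_output_mode is now required; top_p and temperature default to 1.0.
    props = RunConfigProperties(
        model_name="gpt-4",
        model_provider_name=ModelProviderName.openai,
        prompt_id="simple_prompt_builder",
        structured_output_mode=StructuredOutputMode.json_schema,
        temperature=0.2,
    )

    # Out-of-range sampling values are rejected by the after-validator.
    try:
        RunConfigProperties(
            model_name="gpt-4",
            model_provider_name=ModelProviderName.openai,
            prompt_id="simple_prompt_builder",
            structured_output_mode=StructuredOutputMode.json_schema,
            top_p=1.5,  # top_p must be between 0 and 1
        )
    except ValidationError as exc:
        print(exc)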
kiln_ai/datamodel/task_output.py CHANGED
@@ -64,7 +64,7 @@ class TaskOutputRating(KilnBaseModel):
     )
     requirement_ratings: Dict[ID_TYPE, RequirementRating] = Field(
         default={},
-        description="The ratings of the requirements of the task.",
+        description="The ratings of the requirements of the task. The ID can be either a task_requirement_id or a named rating for an eval_output_score name (in format 'named::<name>').",
     )
 
     # Previously we stored rating values as a dict of floats, but now we store them as RequirementRating objects.