kiln-ai 0.15.0-py3-none-any.whl → 0.17.0-py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of kiln-ai has been flagged by the registry as possibly problematic.
- kiln_ai/adapters/__init__.py +2 -0
- kiln_ai/adapters/adapter_registry.py +22 -44
- kiln_ai/adapters/chat/__init__.py +8 -0
- kiln_ai/adapters/chat/chat_formatter.py +234 -0
- kiln_ai/adapters/chat/test_chat_formatter.py +131 -0
- kiln_ai/adapters/data_gen/test_data_gen_task.py +19 -6
- kiln_ai/adapters/eval/base_eval.py +8 -6
- kiln_ai/adapters/eval/eval_runner.py +9 -65
- kiln_ai/adapters/eval/g_eval.py +26 -8
- kiln_ai/adapters/eval/test_base_eval.py +166 -15
- kiln_ai/adapters/eval/test_eval_runner.py +3 -0
- kiln_ai/adapters/eval/test_g_eval.py +1 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +2 -2
- kiln_ai/adapters/fine_tune/dataset_formatter.py +153 -197
- kiln_ai/adapters/fine_tune/test_base_finetune.py +10 -10
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +402 -211
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +3 -3
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +6 -6
- kiln_ai/adapters/fine_tune/test_together_finetune.py +1 -0
- kiln_ai/adapters/fine_tune/test_vertex_finetune.py +4 -4
- kiln_ai/adapters/fine_tune/together_finetune.py +12 -1
- kiln_ai/adapters/ml_model_list.py +556 -45
- kiln_ai/adapters/model_adapters/base_adapter.py +100 -35
- kiln_ai/adapters/model_adapters/litellm_adapter.py +116 -100
- kiln_ai/adapters/model_adapters/litellm_config.py +3 -2
- kiln_ai/adapters/model_adapters/test_base_adapter.py +299 -52
- kiln_ai/adapters/model_adapters/test_litellm_adapter.py +121 -22
- kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +44 -2
- kiln_ai/adapters/model_adapters/test_structured_output.py +48 -18
- kiln_ai/adapters/parsers/base_parser.py +0 -3
- kiln_ai/adapters/parsers/parser_registry.py +5 -3
- kiln_ai/adapters/parsers/r1_parser.py +17 -2
- kiln_ai/adapters/parsers/request_formatters.py +40 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +2 -2
- kiln_ai/adapters/parsers/test_r1_parser.py +44 -1
- kiln_ai/adapters/parsers/test_request_formatters.py +76 -0
- kiln_ai/adapters/prompt_builders.py +14 -17
- kiln_ai/adapters/provider_tools.py +39 -4
- kiln_ai/adapters/repair/test_repair_task.py +27 -5
- kiln_ai/adapters/test_adapter_registry.py +88 -28
- kiln_ai/adapters/test_ml_model_list.py +158 -0
- kiln_ai/adapters/test_prompt_adaptors.py +17 -3
- kiln_ai/adapters/test_prompt_builders.py +27 -19
- kiln_ai/adapters/test_provider_tools.py +130 -12
- kiln_ai/datamodel/__init__.py +2 -2
- kiln_ai/datamodel/datamodel_enums.py +43 -4
- kiln_ai/datamodel/dataset_filters.py +69 -1
- kiln_ai/datamodel/dataset_split.py +4 -0
- kiln_ai/datamodel/eval.py +8 -0
- kiln_ai/datamodel/finetune.py +13 -7
- kiln_ai/datamodel/prompt_id.py +1 -0
- kiln_ai/datamodel/task.py +68 -7
- kiln_ai/datamodel/task_output.py +1 -1
- kiln_ai/datamodel/task_run.py +39 -7
- kiln_ai/datamodel/test_basemodel.py +5 -8
- kiln_ai/datamodel/test_dataset_filters.py +82 -0
- kiln_ai/datamodel/test_dataset_split.py +2 -8
- kiln_ai/datamodel/test_example_models.py +54 -0
- kiln_ai/datamodel/test_models.py +80 -9
- kiln_ai/datamodel/test_task.py +168 -2
- kiln_ai/utils/async_job_runner.py +106 -0
- kiln_ai/utils/config.py +3 -2
- kiln_ai/utils/dataset_import.py +81 -19
- kiln_ai/utils/logging.py +165 -0
- kiln_ai/utils/test_async_job_runner.py +199 -0
- kiln_ai/utils/test_config.py +23 -0
- kiln_ai/utils/test_dataset_import.py +272 -10
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/METADATA +1 -1
- kiln_ai-0.17.0.dist-info/RECORD +113 -0
- kiln_ai-0.15.0.dist-info/RECORD +0 -104
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.15.0.dist-info → kiln_ai-0.17.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/task_run.py
CHANGED
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Dict, List, Union
 
 import jsonschema
 import jsonschema.exceptions
-from pydantic import Field, ValidationInfo, model_validator
+from pydantic import BaseModel, Field, ValidationInfo, model_validator
 from typing_extensions import Self
 
 from kiln_ai.datamodel.basemodel import KilnParentedModel
@@ -15,6 +15,29 @@ if TYPE_CHECKING:
     from kiln_ai.datamodel.task import Task
 
 
+class Usage(BaseModel):
+    input_tokens: int | None = Field(
+        default=None,
+        description="The number of input tokens used in the task run.",
+        ge=0,
+    )
+    output_tokens: int | None = Field(
+        default=None,
+        description="The number of output tokens used in the task run.",
+        ge=0,
+    )
+    total_tokens: int | None = Field(
+        default=None,
+        description="The total number of tokens used in the task run.",
+        ge=0,
+    )
+    cost: float | None = Field(
+        default=None,
+        description="The cost of the task run in US dollars, saved at runtime (prices can change over time).",
+        ge=0,
+    )
+
+
 class TaskRun(KilnParentedModel):
     """
     Represents a single execution of a Task.
@@ -47,17 +70,26 @@ class TaskRun(KilnParentedModel):
         default=[],
         description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
     )
+    usage: Usage | None = Field(
+        default=None,
+        description="Usage information for the task run. This includes the number of input tokens, output tokens, and total tokens used.",
+    )
+
+    def thinking_training_data(self) -> str | None:
+        """
+        Get the thinking training data from the task run.
+        """
+        if self.intermediate_outputs is None:
+            return None
+        return self.intermediate_outputs.get(
+            "reasoning"
+        ) or self.intermediate_outputs.get("chain_of_thought")
 
     def has_thinking_training_data(self) -> bool:
         """
         Does this run have thinking data that we can use to train a thinking model?
         """
-        if self.intermediate_outputs is None:
-            return False
-        return (
-            "chain_of_thought" in self.intermediate_outputs
-            or "reasoning" in self.intermediate_outputs
-        )
+        return self.thinking_training_data() is not None
 
     # Workaround to return typed parent without importing Task
     def parent_task(self) -> Union["Task", None]:
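
Note: the `Usage` model and `thinking_training_data()` helper added above can be exercised roughly as in the sketch below. This is a minimal sketch based only on the fields shown in this diff; the import locations mirror the test files later in this diff and the values are illustrative.

    # Minimal sketch: attach token usage to a run and read back its thinking data.
    from kiln_ai.datamodel import TaskOutput, TaskRun, Usage

    run = TaskRun(
        input="tell me a joke",
        output=TaskOutput(output="why did the chicken cross the road?"),
        intermediate_outputs={"reasoning": "the user wants a short, clean joke"},
        usage=Usage(input_tokens=100, output_tokens=50, total_tokens=150, cost=0.002),
    )

    # The new helper prefers "reasoning" over "chain_of_thought".
    assert run.thinking_training_data() == "the user wants a short, clean joke"
    assert run.has_thinking_training_data()
    assert run.usage is not None and run.usage.total_tokens == 150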

kiln_ai/adapters/model_adapters/test_base_adapter.py
CHANGED
@@ -483,7 +483,7 @@ class MockAdapter(BaseAdapter):
     """Implementation of BaseAdapter for testing"""
 
     async def _run(self, input):
-        return RunOutput(output="test output", intermediate_outputs=None)
+        return RunOutput(output="test output", intermediate_outputs=None), None
 
     def adapter_name(self) -> str:
         return "test"
@@ -500,8 +500,9 @@ def adapter(base_task):
         run_config=RunConfig(
             task=base_task,
             model_name="test_model",
-            model_provider_name="
+            model_provider_name="openai",
             prompt_id="simple_prompt_builder",
+            structured_output_mode="json_schema",
         ),
     )
 
@@ -510,6 +511,7 @@ async def test_invoke_parsing_flow(adapter):
     # Mock dependencies
     mock_provider = MagicMock()
     mock_provider.parser = "test_parser"
+    mock_provider.formatter = None
     mock_provider.reasoning_capable = False
 
     mock_parser = MagicMock()
@@ -517,13 +519,11 @@ async def test_invoke_parsing_flow(adapter):
         output="parsed test output", intermediate_outputs={"key": "value"}
     )
 
-    mock_parser_class = MagicMock(return_value=mock_parser)
-
     with (
         patch.object(adapter, "model_provider", return_value=mock_provider),
         patch(
             "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
-            return_value=
+            return_value=mock_parser,
         ),
         patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
     ):
@@ -534,9 +534,6 @@ async def test_invoke_parsing_flow(adapter):
         # Execute
         result = await adapter.invoke("test input")
 
-        # Verify parser was created correctly
-        mock_parser_class.assert_called_once_with(structured_output=False)
-
         # Verify parsing occurred
         mock_parser.parse_output.assert_called_once()
         parsed_args = mock_parser.parse_output.call_args[1]
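
Note: two contract changes are visible in these hunks: `_run` now returns a pair rather than a bare `RunOutput` (the MockAdapter returns `(RunOutput(...), None)`), and the `RunConfig` fixture now requires `structured_output_mode`. The sketch below assumes the second tuple element is a `Usage | None`, which matches the new `TaskRun.usage` field but is not stated explicitly in this diff; the exact import locations for `BaseAdapter` and `RunOutput` are also assumptions.

    # Hypothetical custom adapter under the assumed (RunOutput, Usage | None) contract.
    from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter, RunOutput
    from kiln_ai.datamodel import Usage


    class EchoAdapter(BaseAdapter):
        async def _run(self, input):
            # Return the model output plus optional usage accounting.
            output = RunOutput(output=f"echo: {input}", intermediate_outputs=None)
            usage = Usage(input_tokens=1, output_tokens=1, total_tokens=2)
            return output, usage

        def adapter_name(self) -> str:
            return "echo"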

kiln_ai/datamodel/test_dataset_filters.py
CHANGED
@@ -1,3 +1,5 @@
+from unittest.mock import Mock
+
 import pytest
 from pydantic import BaseModel
 
@@ -5,12 +7,14 @@ from kiln_ai.datamodel.dataset_filters import (
     AllDatasetFilter,
     DatasetFilterId,
     HighRatingDatasetFilter,
+    MultiDatasetFilter,
     StaticDatasetFilters,
     TagFilter,
     ThinkingModelDatasetFilter,
     ThinkingModelHighRatedFilter,
     dataset_filter_from_id,
 )
+from kiln_ai.datamodel.task_run import TaskRun
 
 # Note: Many more filter tests in test_dataset_split.py
 
@@ -69,3 +73,81 @@ def test_tag_filter(tag, expected_error, expected_tag):
     filter = dataset_filter_from_id(tag)
     assert isinstance(filter, TagFilter)
     assert filter.tag == expected_tag
+
+
+class TestMultiDatasetFilter:
+    @pytest.mark.parametrize(
+        "filter_string,expected_filters",
+        [
+            ("multi_filter::high_rating", ["high_rating"]),
+            (
+                "multi_filter::high_rating&thinking_model",
+                ["high_rating", "thinking_model"],
+            ),
+            ("multi_filter::tag::test&high_rating", ["tag::test", "high_rating"]),
+            (
+                "multi_filter::high_rating&tag::tag\\&name",
+                ["high_rating", "tag::tag&name"],
+            ),
+        ],
+    )
+    def test_valid_filter_string_parsing(self, filter_string, expected_filters):
+        """Test that valid filter strings are parsed correctly."""
+        assert MultiDatasetFilter.parse_filter_string(filter_string) == expected_filters
+        assert MultiDatasetFilter.is_valid_filter_string(filter_string)
+
+    @pytest.mark.parametrize(
+        "filter_string,expected_error",
+        [
+            (
+                "not_multi_filter::high_rating",
+                "Filter string must start with multi_filter::",
+            ),
+            ("multi_filter::", "No filters specified after prefix"),
+            ("multi_filter::high_rating&", "Invalid dataset filter ID:"),
+            ("multi_filter::invalid_filter", "Invalid dataset filter ID:"),
+        ],
+    )
+    def test_invalid_filter_string_handling(self, filter_string, expected_error):
+        """Test that invalid filter strings raise appropriate errors."""
+        with pytest.raises(ValueError, match=expected_error):
+            MultiDatasetFilter.parse_filter_string(filter_string)
+        assert not MultiDatasetFilter.is_valid_filter_string(filter_string)
+
+    def test_filter_combination_logic(self):
+        """Test that multiple filters are combined with AND logic."""
+        # Create a mock task run
+        task_run = Mock(spec=TaskRun)
+        task_run.output = Mock()
+        task_run.output.rating = Mock()
+        task_run.output.rating.is_high_quality.return_value = True
+        task_run.tags = ["test_tag"]
+        task_run.has_thinking_training_data.return_value = True
+        task_run.repaired_output = None
+
+        # Test combining high_rating and tag filters
+        filter_id = "multi_filter::high_rating&tag::test_tag"
+        multi_filter = dataset_filter_from_id(filter_id)
+        assert multi_filter(task_run)
+
+        # Test that it fails if one filter fails
+        task_run.tags = ["wrong_tag"]
+        assert not multi_filter(task_run)
+        task_run.tags = ["test_tag"]
+        assert multi_filter(task_run)
+        task_run.output.rating.is_high_quality.return_value = False
+        assert not multi_filter(task_run)
+
+        # Verify the mock was called as expected
+        task_run.output.rating.is_high_quality.assert_called()
+
+    def test_filter_creation_from_id(self):
+        """Test that multi filters can be created via dataset_filter_from_id."""
+        filter_id = "multi_filter::high_rating&thinking_model"
+        filter = dataset_filter_from_id(filter_id)
+        assert isinstance(filter, MultiDatasetFilter)
+        assert len(filter.filters) == 2
+        assert any(isinstance(f, type(HighRatingDatasetFilter)) for f in filter.filters)
+        assert any(
+            isinstance(f, type(ThinkingModelDatasetFilter)) for f in filter.filters
+        )
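
Note: the new `MultiDatasetFilter` composes existing filter IDs with AND semantics using the `multi_filter::` prefix and `&` as a separator, with `\&` escaping a literal ampersand (exactly the strings exercised above). A minimal sketch using only IDs that appear in these tests:

    from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id

    # High-rated runs that also carry the "test_tag" tag.
    combined = dataset_filter_from_id("multi_filter::high_rating&tag::test_tag")

    # A literal '&' inside a tag id is escaped as '\&'.
    escaped = dataset_filter_from_id("multi_filter::high_rating&tag::tag\\&name")

    # Filters are callables over TaskRun, so filtering a list of runs looks like:
    # kept = [run for run in runs if combined(run)]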

kiln_ai/datamodel/test_dataset_split.py
CHANGED
@@ -17,6 +17,7 @@ from kiln_ai.datamodel.dataset_split import (
     AllSplitDefinition,
     Train60Test20Val20SplitDefinition,
     Train80Test20SplitDefinition,
+    Train80Val20SplitDefinition,
 )
 from kiln_ai.datamodel.test_dataset_filters import (
     AllDatasetFilter,
@@ -71,14 +72,6 @@ def sample_task_runs(sample_task):
     return task_runs
 
 
-@pytest.fixture
-def standard_splits():
-    return [
-        DatasetSplitDefinition(name="train", percentage=0.8),
-        DatasetSplitDefinition(name="test", percentage=0.2),
-    ]
-
-
 @pytest.fixture
 def task_run():
     return TaskRun(
@@ -174,6 +167,7 @@ def test_high_rating_dataset_filter(sample_task_runs):
     [
         (Train80Test20SplitDefinition, {"train": 8, "test": 2}),
         (AllSplitDefinition, {"all": 10}),
+        (Train80Val20SplitDefinition, {"train": 8, "val": 2}),
         (Train60Test20Val20SplitDefinition, {"train": 6, "test": 2, "val": 2}),
         (
             [
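
Note: the split test above now also covers `Train80Val20SplitDefinition`, expected to yield an 80/20 train/val split. The sketch below shows the shape that preset presumably has, inferred from the removed `standard_splits` fixture and the expected `{"train": 8, "val": 2}` counts; the actual definition lives in kiln_ai/datamodel/dataset_split.py and is not shown in this diff.

    # Assumed shape of the new preset (illustrative, not the actual source).
    from kiln_ai.datamodel.dataset_split import DatasetSplitDefinition

    Train80Val20SplitDefinition = [
        DatasetSplitDefinition(name="train", percentage=0.8),
        DatasetSplitDefinition(name="val", percentage=0.2),
    ]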

kiln_ai/datamodel/test_example_models.py
CHANGED
@@ -16,6 +16,7 @@ from kiln_ai.datamodel import (
     TaskOutputRatingType,
     TaskRequirement,
     TaskRun,
+    Usage,
 )
 
 
@@ -743,3 +744,56 @@ def test_task_run_validate_repaired_output_structured(tmp_path):
         ),
     ),
 )
+
+
+@pytest.mark.parametrize(
+    "input_tokens,output_tokens,total_tokens,cost,should_raise",
+    [
+        # Valid cases
+        (100, 50, 150, 0.002, False),  # All fields
+        (None, None, None, None, False),  # All None (defaults)
+        # Invalid cases
+        (-100, 50, 150, 0.002, True),  # Negative input_tokens
+        (100, -50, 150, 0.002, True),  # Negative output_tokens
+        (100, 50, -150, 0.002, True),  # Negative total_tokens
+        (100, 50, 150, -0.002, True),  # Negative cost
+    ],
+)
+def test_usage_model(input_tokens, output_tokens, total_tokens, cost, should_raise):
+    """Test the Usage model with various input combinations."""
+    if should_raise:
+        with pytest.raises(ValidationError):
+            Usage(
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                total_tokens=total_tokens,
+                cost=cost,
+            )
+    else:
+        usage = Usage(
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            total_tokens=total_tokens,
+            cost=cost,
+        )
+        assert usage.input_tokens == input_tokens
+        assert usage.output_tokens == output_tokens
+        assert usage.total_tokens == total_tokens
+        assert usage.cost == cost
+
+
+def test_usage_model_in_task_run(valid_task_run):
+    """Test that Usage can be properly set in a TaskRun."""
+    usage = Usage(
+        input_tokens=100,
+        output_tokens=50,
+        total_tokens=150,
+        cost=0.002,
+    )
+    task_run = valid_task_run.model_copy(deep=True)
+    task_run.usage = usage
+    assert task_run.usage == usage
+    assert task_run.usage.input_tokens == 100
+    assert task_run.usage.output_tokens == 50
+    assert task_run.usage.total_tokens == 150
+    assert task_run.usage.cost == 0.002

kiln_ai/datamodel/test_models.py
CHANGED
@@ -9,13 +9,13 @@ from kiln_ai.datamodel import (
     DataSource,
     DataSourceType,
     Finetune,
-    FinetuneDataStrategy,
     Project,
     Prompt,
     Task,
     TaskOutput,
     TaskRun,
 )
+from kiln_ai.datamodel.datamodel_enums import ChatStrategy
 from kiln_ai.datamodel.test_json_schema import json_joke_schema
 
 
@@ -536,30 +536,58 @@ def test_prompt_parent_task():
         # Test 1: Valid case - no thinking instructions with final_only
         (
             None,
-
+            ChatStrategy.single_turn,
             False,
             None,
         ),
         # Test 2: Valid case - thinking instructions with final_and_intermediate
         (
             "Think step by step",
-
+            ChatStrategy.two_message_cot_legacy,
             False,
             None,
         ),
-        # Test 3:
+        # Test 3: Valid case - no thinking instructions with final_and_intermediate_r1_compatible
+        (
+            None,
+            ChatStrategy.single_turn_r1_thinking,
+            False,
+            None,
+        ),
+        # Test 4: Invalid case - thinking instructions with final_only
+        (
+            "Think step by step",
+            ChatStrategy.single_turn,
+            True,
+            "Thinking instructions can only be used when data_strategy is",
+        ),
+        # Test 5: Invalid case - no thinking instructions with final_and_intermediate
+        (
+            None,
+            ChatStrategy.two_message_cot_legacy,
+            True,
+            "Thinking instructions are required when data_strategy is",
+        ),
+        # Test 6: Invalid case - thinking instructions with final_and_intermediate_r1_compatible
         (
             "Think step by step",
-
+            ChatStrategy.single_turn_r1_thinking,
             True,
-            "Thinking instructions can only be used when data_strategy is
+            "Thinking instructions can only be used when data_strategy is",
         ),
-        # Test
+        # Test 7: new COT format
         (
+            "Think step by step",
+            ChatStrategy.two_message_cot,
+            False,
             None,
-
+        ),
+        # Test 8: new COT format
+        (
+            None,
+            ChatStrategy.two_message_cot,
             True,
-            "Thinking instructions are required when data_strategy is
+            "Thinking instructions are required when data_strategy is",
         ),
     ],
 )
@@ -617,3 +645,46 @@ def test_task_run_has_thinking_training_data(intermediate_outputs, expected):
         intermediate_outputs=intermediate_outputs,
     )
     assert task_run.has_thinking_training_data() == expected
+
+
+@pytest.mark.parametrize(
+    "intermediate_outputs,expected",
+    [
+        # No intermediate outputs
+        (None, None),
+        # Empty intermediate outputs
+        ({}, None),
+        # Only chain_of_thought
+        ({"chain_of_thought": "thinking process"}, "thinking process"),
+        # Only reasoning
+        ({"reasoning": "reasoning process"}, "reasoning process"),
+        # Both chain_of_thought and reasoning (should return reasoning as it's checked first)
+        (
+            {"chain_of_thought": "thinking process", "reasoning": "reasoning process"},
+            "reasoning process",
+        ),
+        # Other intermediate outputs but no thinking data
+        ({"other_output": "some data"}, None),
+        # Mixed other outputs with thinking data
+        (
+            {"chain_of_thought": "thinking process", "other_output": "some data"},
+            "thinking process",
+        ),
+    ],
+)
+def test_task_run_thinking_training_data(intermediate_outputs, expected):
+    task_run = TaskRun(
+        input="test input",
+        output=TaskOutput(output="test output"),
+        intermediate_outputs=intermediate_outputs,
+    )
+    assert task_run.thinking_training_data() == expected
+
+
+def test_chat_strategy_enum():
+    # This has to align to the old FinetuneDataStrategy enum
+    assert ChatStrategy.single_turn == "final_only"
+    assert ChatStrategy.two_message_cot_legacy == "final_and_intermediate"
+    assert (
+        ChatStrategy.single_turn_r1_thinking == "final_and_intermediate_r1_compatible"
+    )
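
Note: `FinetuneDataStrategy` is replaced in these tests by the new `ChatStrategy` enum, and `test_chat_strategy_enum` above pins its serialized values to the old strategy names. Assuming `ChatStrategy` is a string-valued enum (which those equality assertions imply), previously persisted values keep resolving to the new members:

    from kiln_ai.datamodel.datamodel_enums import ChatStrategy

    # Old persisted strings map onto the renamed members.
    assert ChatStrategy("final_only") is ChatStrategy.single_turn
    assert ChatStrategy("final_and_intermediate") is ChatStrategy.two_message_cot_legacy
    assert (
        ChatStrategy("final_and_intermediate_r1_compatible")
        is ChatStrategy.single_turn_r1_thinking
    )
    # ChatStrategy.two_message_cot is the new two-message chain-of-thought format;
    # its serialized value is not shown in this diff.
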
kiln_ai/datamodel/test_task.py
CHANGED
@@ -1,7 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
+from kiln_ai.datamodel.datamodel_enums import StructuredOutputMode, TaskOutputRatingType
 from kiln_ai.datamodel.prompt_id import PromptGenerators
 from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, Task, TaskRunConfig
 from kiln_ai.datamodel.task_output import normalize_rating
@@ -15,6 +15,7 @@ def test_runconfig_valid_creation():
         model_name="gpt-4",
         model_provider_name="openai",
         prompt_id=PromptGenerators.SIMPLE,
+        structured_output_mode="json_schema",
     )
 
     assert config.task == task
@@ -29,12 +30,13 @@ def test_runconfig_missing_required_fields():
 
     errors = exc_info.value.errors()
     assert (
-        len(errors) ==
+        len(errors) == 5
     )  # task, model_name, model_provider_name, and prompt_id are required
     assert any(error["loc"][0] == "task" for error in errors)
     assert any(error["loc"][0] == "model_name" for error in errors)
     assert any(error["loc"][0] == "model_provider_name" for error in errors)
     assert any(error["loc"][0] == "prompt_id" for error in errors)
+    assert any(error["loc"][0] == "structured_output_mode" for error in errors)
 
 
 def test_runconfig_custom_prompt_id():
@@ -45,6 +47,7 @@ def test_runconfig_custom_prompt_id():
         model_name="gpt-4",
         model_provider_name="openai",
         prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
+        structured_output_mode="json_schema",
     )
 
     assert config.prompt_id == PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT
@@ -61,6 +64,7 @@ def sample_run_config_props(sample_task):
         model_name="gpt-4",
         model_provider_name="openai",
         prompt_id=PromptGenerators.SIMPLE,
+        structured_output_mode="json_schema",
     )
 
 
@@ -157,3 +161,165 @@ def test_normalize_rating(rating_type, rating, expected):
 def test_normalize_rating_errors(rating_type, rating):
     with pytest.raises(ValueError):
         normalize_rating(rating, rating_type)
+
+
+def test_run_config_defaults():
+    """RunConfig should require top_p, temperature, and structured_output_mode to be set."""
+    task = Task(id="task1", name="Test Task", instruction="Do something")
+
+    config = RunConfig(
+        task=task,
+        model_name="gpt-4",
+        model_provider_name="openai",
+        prompt_id=PromptGenerators.SIMPLE,
+        structured_output_mode="json_schema",
+    )
+    assert config.top_p == 1.0
+    assert config.temperature == 1.0
+
+
+def test_run_config_valid_ranges():
+    """RunConfig should accept valid ranges for top_p and temperature."""
+    task = Task(id="task1", name="Test Task", instruction="Do something")
+
+    # Test valid values
+    config = RunConfig(
+        task=task,
+        model_name="gpt-4",
+        model_provider_name="openai",
+        prompt_id=PromptGenerators.SIMPLE,
+        top_p=0.9,
+        temperature=0.7,
+        structured_output_mode=StructuredOutputMode.json_schema,
+    )
+
+    assert config.top_p == 0.9
+    assert config.temperature == 0.7
+    assert config.structured_output_mode == StructuredOutputMode.json_schema
+
+
+@pytest.mark.parametrize("top_p", [0.0, 0.5, 1.0])
+def test_run_config_valid_top_p(top_p):
+    """Test that RunConfig accepts valid top_p values (0-1)."""
+    task = Task(id="task1", name="Test Task", instruction="Do something")
+
+    config = RunConfig(
+        task=task,
+        model_name="gpt-4",
+        model_provider_name="openai",
+        prompt_id=PromptGenerators.SIMPLE,
+        top_p=top_p,
+        temperature=1.0,
+        structured_output_mode=StructuredOutputMode.json_schema,
+    )
+
+    assert config.top_p == top_p
+
+
+@pytest.mark.parametrize("top_p", [-0.1, 1.1, 2.0])
+def test_run_config_invalid_top_p(top_p):
+    """Test that RunConfig rejects invalid top_p values."""
+    task = Task(id="task1", name="Test Task", instruction="Do something")
+
+    with pytest.raises(ValueError, match="top_p must be between 0 and 1"):
+        RunConfig(
+            task=task,
+            model_name="gpt-4",
+            model_provider_name="openai",
+            prompt_id=PromptGenerators.SIMPLE,
+            top_p=top_p,
+            temperature=1.0,
+            structured_output_mode=StructuredOutputMode.json_schema,
+        )
+
+
+@pytest.mark.parametrize("temperature", [0.0, 1.0, 2.0])
+def test_run_config_valid_temperature(temperature):
+    """Test that RunConfig accepts valid temperature values (0-2)."""
+    task = Task(id="task1", name="Test Task", instruction="Do something")
+
+    config = RunConfig(
+        task=task,
+        model_name="gpt-4",
+        model_provider_name="openai",
+        prompt_id=PromptGenerators.SIMPLE,
+        top_p=0.9,
+        temperature=temperature,
+        structured_output_mode=StructuredOutputMode.json_schema,
+    )
+
+    assert config.temperature == temperature
+
+
+@pytest.mark.parametrize("temperature", [-0.1, 2.1, 3.0])
+def test_run_config_invalid_temperature(temperature):
+    """Test that RunConfig rejects invalid temperature values."""
+    task = Task(id="task1", name="Test Task", instruction="Do something")
+
+    with pytest.raises(ValueError, match="temperature must be between 0 and 2"):
+        RunConfig(
+            task=task,
+            model_name="gpt-4",
+            model_provider_name="openai",
+            prompt_id=PromptGenerators.SIMPLE,
+            top_p=0.9,
+            temperature=temperature,
+            structured_output_mode=StructuredOutputMode.json_schema,
+        )
+
+
+def test_run_config_upgrade_old_entries():
+    """Test that TaskRunConfig parses old entries correctly with nested objects, filling in defaults where needed."""
+
+    data = {
+        "v": 1,
+        "name": "test name",
+        "created_at": "2025-06-09T13:33:35.276927",
+        "created_by": "scosman",
+        "run_config_properties": {
+            "model_name": "gpt_4_1_nano",
+            "model_provider_name": "openai",
+            "prompt_id": "task_run_config::189194447826::228174773209::244130257039",
+            "top_p": 0.77,
+            "temperature": 0.77,
+            "structured_output_mode": "json_instruction_and_object",
+        },
+        "prompt": {
+            "name": "Dazzling Unicorn",
+            "description": "Frozen copy of prompt 'simple_prompt_builder', created for evaluations.",
+            "generator_id": "simple_prompt_builder",
+            "prompt": "Generate a joke, given a theme. The theme will be provided as a word or phrase as the input to the model. The assistant should output a joke that is funny and relevant to the theme. If a style is provided, the joke should be in that style. The output should include a setup and punchline.\n\nYour response should respect the following requirements:\n1) Keep the joke on topic. If the user specifies a theme, the joke must be related to that theme.\n2) Avoid any jokes that are offensive or inappropriate. Keep the joke clean and appropriate for all audiences.\n3) Make the joke funny and engaging. It should be something that someone would want to tell to their friends. Something clever, not just a simple pun.\n",
+            "chain_of_thought_instructions": None,
+        },
+        "model_type": "task_run_config",
+    }
+
+    # Parse the data - this should be TaskRunConfig, not RunConfig
+    parsed = TaskRunConfig.model_validate(data)
+    assert parsed.name == "test name"
+    assert parsed.created_by == "scosman"
+    assert (
+        parsed.run_config_properties.structured_output_mode
+        == "json_instruction_and_object"
+    )
+
+    # should still work if loading from file
+    parsed = TaskRunConfig.model_validate(data, context={"loading_from_file": True})
+    assert parsed.name == "test name"
+    assert parsed.created_by == "scosman"
+    assert (
+        parsed.run_config_properties.structured_output_mode
+        == "json_instruction_and_object"
+    )
+
+    # Remove structured_output_mode from run_config_properties and parse again
+    del data["run_config_properties"]["structured_output_mode"]
+
+    with pytest.raises(ValidationError):
+        # should error if not loading from file
+        parsed = TaskRunConfig.model_validate(data)
+
+    parsed = TaskRunConfig.model_validate(data, context={"loading_from_file": True})
+    assert parsed.name == "test name"
+    assert parsed.created_by == "scosman"
+    assert parsed.run_config_properties.structured_output_mode == "unknown"