kiln-ai 0.8.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kiln-ai might be problematic.
- kiln_ai/adapters/__init__.py +7 -7
- kiln_ai/adapters/adapter_registry.py +81 -10
- kiln_ai/adapters/data_gen/data_gen_task.py +21 -3
- kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
- kiln_ai/adapters/eval/base_eval.py +164 -0
- kiln_ai/adapters/eval/eval_runner.py +267 -0
- kiln_ai/adapters/eval/g_eval.py +367 -0
- kiln_ai/adapters/eval/registry.py +16 -0
- kiln_ai/adapters/eval/test_base_eval.py +324 -0
- kiln_ai/adapters/eval/test_eval_runner.py +640 -0
- kiln_ai/adapters/eval/test_g_eval.py +497 -0
- kiln_ai/adapters/eval/test_g_eval_data.py +4 -0
- kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
- kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
- kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
- kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
- kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
- kiln_ai/adapters/fine_tune/test_dataset_formatter.py +472 -129
- kiln_ai/adapters/fine_tune/test_fireworks_finetune.py +114 -22
- kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
- kiln_ai/adapters/ml_model_list.py +434 -93
- kiln_ai/adapters/model_adapters/__init__.py +18 -0
- kiln_ai/adapters/model_adapters/base_adapter.py +250 -0
- kiln_ai/adapters/model_adapters/langchain_adapters.py +309 -0
- kiln_ai/adapters/model_adapters/openai_compatible_config.py +10 -0
- kiln_ai/adapters/model_adapters/openai_model_adapter.py +289 -0
- kiln_ai/adapters/model_adapters/test_base_adapter.py +199 -0
- kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +105 -97
- kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +216 -0
- kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +80 -30
- kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +125 -46
- kiln_ai/adapters/ollama_tools.py +0 -1
- kiln_ai/adapters/parsers/__init__.py +10 -0
- kiln_ai/adapters/parsers/base_parser.py +12 -0
- kiln_ai/adapters/parsers/json_parser.py +37 -0
- kiln_ai/adapters/parsers/parser_registry.py +19 -0
- kiln_ai/adapters/parsers/r1_parser.py +69 -0
- kiln_ai/adapters/parsers/test_json_parser.py +81 -0
- kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
- kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
- kiln_ai/adapters/prompt_builders.py +193 -49
- kiln_ai/adapters/provider_tools.py +91 -36
- kiln_ai/adapters/repair/repair_task.py +18 -19
- kiln_ai/adapters/repair/test_repair_task.py +7 -7
- kiln_ai/adapters/run_output.py +11 -0
- kiln_ai/adapters/test_adapter_registry.py +177 -0
- kiln_ai/adapters/test_generate_docs.py +69 -0
- kiln_ai/adapters/test_ollama_tools.py +0 -1
- kiln_ai/adapters/test_prompt_adaptors.py +25 -18
- kiln_ai/adapters/test_prompt_builders.py +265 -44
- kiln_ai/adapters/test_provider_tools.py +268 -46
- kiln_ai/datamodel/__init__.py +51 -772
- kiln_ai/datamodel/basemodel.py +31 -11
- kiln_ai/datamodel/datamodel_enums.py +58 -0
- kiln_ai/datamodel/dataset_filters.py +114 -0
- kiln_ai/datamodel/dataset_split.py +170 -0
- kiln_ai/datamodel/eval.py +298 -0
- kiln_ai/datamodel/finetune.py +105 -0
- kiln_ai/datamodel/json_schema.py +14 -3
- kiln_ai/datamodel/model_cache.py +8 -3
- kiln_ai/datamodel/project.py +23 -0
- kiln_ai/datamodel/prompt.py +37 -0
- kiln_ai/datamodel/prompt_id.py +83 -0
- kiln_ai/datamodel/strict_mode.py +24 -0
- kiln_ai/datamodel/task.py +181 -0
- kiln_ai/datamodel/task_output.py +321 -0
- kiln_ai/datamodel/task_run.py +164 -0
- kiln_ai/datamodel/test_basemodel.py +80 -2
- kiln_ai/datamodel/test_dataset_filters.py +71 -0
- kiln_ai/datamodel/test_dataset_split.py +127 -6
- kiln_ai/datamodel/test_datasource.py +3 -2
- kiln_ai/datamodel/test_eval_model.py +635 -0
- kiln_ai/datamodel/test_example_models.py +34 -17
- kiln_ai/datamodel/test_json_schema.py +23 -0
- kiln_ai/datamodel/test_model_cache.py +24 -0
- kiln_ai/datamodel/test_model_perf.py +125 -0
- kiln_ai/datamodel/test_models.py +131 -2
- kiln_ai/datamodel/test_prompt_id.py +129 -0
- kiln_ai/datamodel/test_task.py +159 -0
- kiln_ai/utils/config.py +6 -1
- kiln_ai/utils/exhaustive_error.py +6 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/METADATA +45 -7
- kiln_ai-0.12.0.dist-info/RECORD +100 -0
- kiln_ai/adapters/base_adapter.py +0 -191
- kiln_ai/adapters/langchain_adapters.py +0 -256
- kiln_ai-0.8.1.dist-info/RECORD +0 -58
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/WHEEL +0 -0
- {kiln_ai-0.8.1.dist-info → kiln_ai-0.12.0.dist-info}/licenses/LICENSE.txt +0 -0
@@ -11,7 +11,6 @@ from kiln_ai.datamodel import (
     Finetune,
     Project,
     Task,
-    TaskDeterminism,
     TaskOutput,
     TaskOutputRating,
     TaskOutputRatingType,
@@ -125,7 +124,6 @@ def test_structured_output_workflow(tmp_path):
         name="Structured Output Task",
         parent=project,
         instruction="Generate a JSON object with name and age",
-        determinism=TaskDeterminism.semantic_match,
         output_json_schema=json.dumps(
             {
                 "type": "object",
@@ -142,7 +140,7 @@ def test_structured_output_workflow(tmp_path):
 
     # Create runs
     runs = []
-    for source in DataSourceType:
+    for source in [DataSourceType.human, DataSourceType.synthetic]:
         for _ in range(2):
             task_run = TaskRun(
                 input="Generate info for John Doe",
@@ -157,7 +155,7 @@ def test_structured_output_workflow(tmp_path):
                         "adapter_name": "TestAdapter",
                         "model_name": "GPT-4",
                         "model_provider": "OpenAI",
-                        "
+                        "prompt_id": "simple_prompt_builder",
                     },
                 ),
                 parent=task,
@@ -216,9 +214,9 @@ def test_structured_output_workflow(tmp_path):
 
     assert loaded_task.name == "Structured Output Task"
     assert len(loaded_task.requirements) == 2
-    assert len(loaded_task.runs()) == 5
-
     loaded_runs = loaded_task.runs()
+    assert len(loaded_runs) == 5
+
     for task_run in loaded_runs:
         output = task_run.output
         assert output.rating is not None
@@ -284,6 +282,9 @@ def test_task_output_requirement_rating_keys(tmp_path):
     assert task_run.output.rating.requirement_ratings is not None
 
 
+_schema_match = "This task requires a specific output schema. While the model produced JSON, that JSON didn't meet the schema."
+
+
 def test_task_output_schema_validation(tmp_path):
     # Create a project, task, and example hierarchy
     project = Project(name="Test Project", path=(tmp_path / "test_project"))
@@ -321,12 +322,24 @@ def test_task_output_schema_validation(tmp_path):
     task_output.save_to_file()
 
     # changing to invalid output
-    with pytest.raises(
+    with pytest.raises(
+        ValueError,
+        match=_schema_match,
+    ):
         task_output.output.output = '{"name": "John Doe", "age": "thirty"}'
         task_output.save_to_file()
 
+    # changing to invalid output from loaded model
+    loaded_task_output = TaskRun.load_from_file(task_output.path)
+    with pytest.raises(
+        ValueError,
+        match=_schema_match,
+    ):
+        loaded_task_output.output.output = '{"name": "John Doe", "age": "forty"}'
+        loaded_task_output.save_to_file()
+
     # Invalid case: output does not match task output schema
-    with pytest.raises(ValueError, match=
+    with pytest.raises(ValueError, match=_schema_match):
         task_output = TaskRun(
             input="Test input",
             input_source=DataSource(
@@ -382,12 +395,18 @@ def test_task_input_schema_validation(tmp_path):
     valid_task_output.save_to_file()
 
     # Changing to invalid input
-    with pytest.raises(ValueError, match=
+    with pytest.raises(ValueError, match=_schema_match):
         valid_task_output.input = '{"name": "John Doe", "age": "thirty"}'
         valid_task_output.save_to_file()
 
+    # loading from file, then changing to invalid input
+    loaded_task_output = TaskRun.load_from_file(valid_task_output.path)
+    with pytest.raises(ValueError, match=_schema_match):
+        loaded_task_output.input = '{"name": "John Doe", "age": "thirty"}'
+        loaded_task_output.save_to_file()
+
     # Invalid case: input does not match task input schema
-    with pytest.raises(ValueError, match=
+    with pytest.raises(ValueError, match=_schema_match):
         task_output = TaskRun(
             input='{"name": "John Doe", "age": "thirty"}',
             input_source=DataSource(
@@ -451,7 +470,7 @@ def test_valid_synthetic_task_output():
                 "adapter_name": "TestAdapter",
                 "model_name": "GPT-4",
                 "model_provider": "OpenAI",
-                "
+                "prompt_id": "simple_prompt_builder",
             },
         ),
     )
@@ -459,7 +478,7 @@ def test_valid_synthetic_task_output():
     assert output.source.properties["adapter_name"] == "TestAdapter"
     assert output.source.properties["model_name"] == "GPT-4"
     assert output.source.properties["model_provider"] == "OpenAI"
-    assert output.source.properties["
+    assert output.source.properties["prompt_id"] == "simple_prompt_builder"
 
 
 def test_invalid_synthetic_task_output_missing_keys():
@@ -488,23 +507,21 @@ def test_invalid_synthetic_task_output_empty_values():
                 "adapter_name": "TestAdapter",
                 "model_name": "",
                 "model_provider": "OpenAI",
-                "
+                "prompt_id": "simple_prompt_builder",
             },
         ),
     )
 
 
 def test_invalid_synthetic_task_output_non_string_values():
-    with pytest.raises(
-        ValidationError, match="'prompt_builder_name' must be of type str"
-    ):
+    with pytest.raises(ValidationError, match="'prompt_id' must be of type str"):
         DataSource(
             type=DataSourceType.synthetic,
             properties={
                 "adapter_name": "TestAdapter",
                 "model_name": "GPT-4",
                 "model_provider": "OpenAI",
-                "
+                "prompt_id": 123,
             },
         )
 
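Taken together, these hunks rename the synthetic DataSource property key from prompt_builder_name to prompt_id. A minimal construction sketch mirroring the tests above (the values are the same placeholders the tests use):

from kiln_ai.datamodel import DataSource, DataSourceType

source = DataSource(
    type=DataSourceType.synthetic,
    properties={
        "adapter_name": "TestAdapter",
        "model_name": "GPT-4",
        "model_provider": "OpenAI",
        "prompt_id": "simple_prompt_builder",  # 0.8.1 used "prompt_builder_name" here
    },
)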
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 from kiln_ai.datamodel.json_schema import (
     JsonObjectSchema,
     schema_from_json_str,
+    string_to_json_key,
     validate_schema,
 )
 
@@ -123,3 +124,25 @@ def test_triangle_schema():
     validate_schema({"a": 1, "b": 2, "c": 3}, json_triangle_schema)
     with pytest.raises(Exception):
         validate_schema({"a": 1, "b": 2, "c": "3"}, json_triangle_schema)
+
+
+@pytest.mark.parametrize(
+    "input_str,expected",
+    [
+        ("hello world", "hello_world"),
+        ("Hello World", "hello_world"),
+        ("hello_world", "hello_world"),
+        ("HELLO WORLD", "hello_world"),
+        ("hello123", "hello123"),
+        ("hello-world", "helloworld"),
+        ("hello!@#$%^&*()world", "helloworld"),
+        (" hello world ", "hello__world"),
+        ("hello__world", "hello__world"),
+        ("", ""),
+        ("!@#$%", ""),
+        ("snake_case_string", "snake_case_string"),
+        ("camelCaseString", "camelcasestring"),
+    ],
+)
+def test_string_to_json_key(input_str: str, expected: str):
+    assert string_to_json_key(input_str) == expected
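The json_schema module now exports string_to_json_key, and the parametrized cases above pin its behavior: lowercase the input, map spaces to underscores, and drop other punctuation. A short usage sketch, reusing assertions taken directly from the test table:

from kiln_ai.datamodel.json_schema import string_to_json_key

assert string_to_json_key("Hello World") == "hello_world"
assert string_to_json_key("hello!@#$%^&*()world") == "helloworld"
assert string_to_json_key("snake_case_string") == "snake_case_string"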
@@ -242,3 +242,27 @@ def test_check_timestamp_granularity_linux_error():
         cache = ModelCache()
         assert cache._check_timestamp_granularity() is False
         assert cache._enabled is False
+
+
+def test_get_model_readonly(model_cache, test_path):
+    if not model_cache._enabled:
+        pytest.skip("Cache is disabled on this fs")
+
+    model = ModelTest(name="test", value=123)
+    mtime_ns = test_path.stat().st_mtime_ns
+
+    # Set the model in the cache
+    model_cache.set_model(test_path, model, mtime_ns)
+
+    # Get the model in readonly mode
+    readonly_model = model_cache.get_model(test_path, ModelTest, readonly=True)
+    # Get a regular (copied) model
+    copied_model = model_cache.get_model(test_path, ModelTest)
+
+    # The readonly model should be the exact same instance as the cached model
+    assert readonly_model is model_cache.model_cache[test_path][0]
+    # While the regular get should be a different instance
+    assert copied_model is not model_cache.model_cache[test_path][0]
+
+    # Both should have the same data
+    assert readonly_model == copied_model == model
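The new test covers a readonly flag on ModelCache.get_model: the readonly path hands back the cached instance itself, while the default path returns a copy that is safe to mutate. A self-contained sketch of that usage, assuming ModelCache lives in kiln_ai.datamodel.model_cache (the calls and argument order are the ones the test exercises; ExampleModel is a stand-in for the test's ModelTest):

import tempfile
from pathlib import Path

from pydantic import BaseModel

from kiln_ai.datamodel.model_cache import ModelCache  # assumed import path


class ExampleModel(BaseModel):  # stand-in for the test's ModelTest
    name: str
    value: int


with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "example.kiln"
    path.write_text("{}")  # only the file's mtime is needed for the cache entry here

    cache = ModelCache()  # may be disabled on filesystems with coarse mtime granularity
    cache.set_model(path, ExampleModel(name="demo", value=1), path.stat().st_mtime_ns)

    # readonly=True returns the cached instance (treat it as immutable);
    # the default call returns a copy that is safe to mutate.
    shared = cache.get_model(path, ExampleModel, readonly=True)
    copied = cache.get_model(path, ExampleModel)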
@@ -0,0 +1,125 @@
+import shutil
+import uuid
+
+import pytest
+
+from kiln_ai.datamodel import (
+    DataSource,
+    DataSourceType,
+    Project,
+    Task,
+    TaskOutput,
+    TaskRun,
+)
+
+test_json_schema = """{
+  "type": "object",
+  "properties": {
+    "setup": {
+      "description": "The setup of the joke",
+      "title": "Setup",
+      "type": "string"
+    },
+    "punchline": {
+      "description": "The punchline to the joke",
+      "title": "Punchline",
+      "type": "string"
+    },
+    "rating": {
+      "anyOf": [
+        {
+          "type": "integer"
+        },
+        {
+          "type": "null"
+        }
+      ],
+      "default": null,
+      "description": "How funny the joke is, from 1 to 10",
+      "title": "Rating"
+    }
+  },
+  "required": [
+    "setup",
+    "punchline"
+  ]
+}
+"""
+
+
+@pytest.fixture
+def task_run(tmp_path):
+    # setup a valid project/task/task_run for testing
+    output_source = DataSource(
+        type=DataSourceType.synthetic,
+        properties={
+            "model_name": "test-model",
+            "model_provider": "test-provider",
+            "adapter_name": "test-adapter",
+        },
+    )
+
+    project_path = tmp_path / "project.kiln"
+    project = Project(name="Test Project", path=project_path)
+    project.save_to_file()
+    task = Task(
+        name="Test Task",
+        instruction="Test Instruction",
+        parent=project,
+        output_json_schema=test_json_schema,
+        input_json_schema=test_json_schema,
+    )
+
+    task.save_to_file()
+
+    task_output = TaskOutput(
+        output='{"setup": "Why did the chicken cross the road?", "punchline": "To get to the other side"}',
+        source=DataSource(
+            type=DataSourceType.synthetic,
+            properties={
+                "model_name": "test-model",
+                "model_provider": "test-provider",
+                "adapter_name": "test-adapter",
+            },
+        ),
+    )
+
+    # Save for later usage
+    task_run = TaskRun(
+        input='{"setup": "Why did the chicken cross the road?", "punchline": "To get to the other side"}',
+        input_source=output_source,
+        output=task_output,
+    )
+    task_run.parent = task
+    task_run.save_to_file()
+
+    return task_run
+
+
+@pytest.mark.benchmark
+def test_benchmark_load_from_file(benchmark, task_run):
+    task_run_path = task_run.path
+
+    iterations = 500
+    total_time = 0
+
+    for _ in range(iterations):
+        # Copy the task to a new temp path, so we don't get warm loads/cached loads
+        temp_path = task_run.path.parent / f"temp_task_run_{uuid.uuid4()}.json"
+        shutil.copy(str(task_run_path), str(temp_path))
+
+        # only time loading the model (and one accessor for delayed validation)
+        start_time = benchmark._timer()
+        loaded = TaskRun.load_from_file(temp_path)
+        assert loaded.id == task_run.id
+        end_time = benchmark._timer()
+
+        total_time += end_time - start_time
+
+    avg_time_per_iteration = total_time / iterations
+    ops_per_second = 1.0 / avg_time_per_iteration
+
+    # I get 8k ops per second on my MBP. Lower value here for CI.
+    # Prior to optimization was 290 ops per second.
+    if ops_per_second < 1000:
+        pytest.fail(f"Ops per second: {ops_per_second:.6f}, expected more than 1k ops")
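The pass/fail criterion in the benchmark above is plain throughput arithmetic; as a standalone sketch with a hypothetical timing:

iterations = 500
total_time = 0.05  # seconds spent inside the timed region, hypothetical
avg_time_per_iteration = total_time / iterations
ops_per_second = 1.0 / avg_time_per_iteration  # 10,000 ops/s for these numbers
assert ops_per_second >= 1000  # the test fails below 1k ops/s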
kiln_ai/datamodel/test_models.py  CHANGED
@@ -9,7 +9,9 @@ from kiln_ai.datamodel import (
     DataSource,
     DataSourceType,
     Finetune,
+    FinetuneDataStrategy,
     Project,
+    Prompt,
     Task,
     TaskOutput,
     TaskRun,
@@ -70,6 +72,20 @@ def test_save_to_file(test_project_file):
     assert data["description"] == "Test Description"
 
 
+def test_save_to_file_non_ascii(test_project_file):
+    project = Project(
+        name="Test Project", description="Chúc mừng!", path=test_project_file
+    )
+    project.save_to_file()
+
+    with open(test_project_file, "r", encoding="utf-8") as file:
+        data = json.load(file)
+
+    assert data["v"] == 1
+    assert data["name"] == "Test Project"
+    assert data["description"] == "Chúc mừng!"
+
+
 def test_task_defaults():
     task = Task(name="Test Task", instruction="Test Instruction")
     assert task.description is None
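The new test_save_to_file_non_ascii test pins down that non-ASCII fields survive a save/load round trip when the file is read back as UTF-8. A compact sketch of the same round trip (the temporary path is illustrative):

import json
import tempfile
from pathlib import Path

from kiln_ai.datamodel import Project

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "project.kiln"
    Project(name="Test Project", description="Chúc mừng!", path=path).save_to_file()

    with open(path, "r", encoding="utf-8") as f:
        assert json.load(f)["description"] == "Chúc mừng!"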
@@ -369,7 +385,7 @@ def test_task_run_input_source_validation(tmp_path):
     assert task_run.input_source is not None
 
     # Test 3: Creating without input_source should fail when strict mode is on
-    with patch("kiln_ai.datamodel.strict_mode", return_value=True):
+    with patch("kiln_ai.datamodel.task_run.strict_mode", return_value=True):
         with pytest.raises(ValueError) as exc_info:
             task_run = TaskRun(
                 input="test input 3",
@@ -426,7 +442,7 @@ def test_task_output_source_validation(tmp_path):
     assert task_output.source is not None
 
     # Test 3: Creating without source should fail when strict mode is on
-    with patch("kiln_ai.datamodel.strict_mode", return_value=True):
+    with patch("kiln_ai.datamodel.task_output.strict_mode", return_value=True):
         with pytest.raises(ValueError) as exc_info:
             task_output = TaskOutput(
                 output="test output 3",
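Both strict-mode tests now patch strict_mode on the module that consumes it (kiln_ai.datamodel.task_run and kiln_ai.datamodel.task_output) rather than on the kiln_ai.datamodel package, following the usual unittest.mock rule of patching a name where it is looked up. A sketch of the pattern, with the body elided:

from unittest.mock import patch

# Patch the reference the consuming module actually uses (target taken from the diff).
with patch("kiln_ai.datamodel.task_run.strict_mode", return_value=True):
    ...  # constructing a TaskRun without an input_source is expected to raise ValueError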
@@ -488,3 +504,116 @@ def test_task_run_tags_validation():
             tags=["valid_tag", "invalid tag"],
         )
     assert "Tags cannot contain spaces. Try underscores." in str(exc_info.value)
+
+
+def test_prompt_validation():
+    prompt = Prompt(name="Test Prompt Name", prompt="Test Prompt")
+    assert prompt.name == "Test Prompt Name"
+    assert prompt.prompt == "Test Prompt"
+
+    with pytest.raises(ValidationError):
+        Prompt(name="Test Prompt")
+
+    with pytest.raises(ValidationError):
+        Prompt(name="Test Prompt", prompt=None)
+
+    with pytest.raises(ValidationError):
+        Prompt(name="Test Prompt", prompt="")
+
+    with pytest.raises(ValidationError):
+        Prompt(prompt="Test Prompt")
+
+
+def test_prompt_parent_task():
+    task = Task(name="Test Task", instruction="Test Instruction")
+    prompt = Prompt(name="Test Prompt", prompt="Test Prompt", parent=task)
+    assert prompt.parent == task
+
+
+@pytest.mark.parametrize(
+    "thinking_instructions,data_strategy,should_raise,expected_message",
+    [
+        # Test 1: Valid case - no thinking instructions with final_only
+        (
+            None,
+            FinetuneDataStrategy.final_only,
+            False,
+            None,
+        ),
+        # Test 2: Valid case - thinking instructions with final_and_intermediate
+        (
+            "Think step by step",
+            FinetuneDataStrategy.final_and_intermediate,
+            False,
+            None,
+        ),
+        # Test 3: Invalid case - thinking instructions with final_only
+        (
+            "Think step by step",
+            FinetuneDataStrategy.final_only,
+            True,
+            "Thinking instructions can only be used when data_strategy is final_and_intermediate",
+        ),
+        # Test 4: Invalid case - no thinking instructions with final_and_intermediate
+        (
+            None,
+            FinetuneDataStrategy.final_and_intermediate,
+            True,
+            "Thinking instructions are required when data_strategy is final_and_intermediate",
+        ),
+    ],
+)
+def test_finetune_thinking_instructions_validation(
+    thinking_instructions, data_strategy, should_raise, expected_message
+):
+    base_params = {
+        "name": "test-finetune",
+        "provider": "openai",
+        "base_model_id": "gpt-3.5-turbo",
+        "dataset_split_id": "split1",
+        "system_message": "test message",
+        "data_strategy": data_strategy,
+    }
+
+    if thinking_instructions is not None:
+        base_params["thinking_instructions"] = thinking_instructions
+
+    if should_raise:
+        with pytest.raises(ValueError) as exc_info:
+            Finetune(**base_params)
+        assert expected_message in str(exc_info.value)
+    else:
+        finetune = Finetune(**base_params)
+        assert finetune.thinking_instructions == thinking_instructions
+        assert finetune.data_strategy == data_strategy
+
+
+@pytest.mark.parametrize(
+    "intermediate_outputs,expected",
+    [
+        # No intermediate outputs
+        (None, False),
+        # Empty intermediate outputs
+        ({}, False),
+        # Only chain_of_thought
+        ({"chain_of_thought": "thinking process"}, True),
+        # Only reasoning
+        ({"reasoning": "reasoning process"}, True),
+        # Both chain_of_thought and reasoning
+        (
+            {"chain_of_thought": "thinking process", "reasoning": "reasoning process"},
+            True,
+        ),
+        # Other intermediate outputs but no thinking data
+        ({"other_output": "some data"}, False),
+        # Mixed other outputs with thinking data
+        ({"chain_of_thought": "thinking process", "other_output": "some data"}, True),
+    ],
+)
+def test_task_run_has_thinking_training_data(intermediate_outputs, expected):
+    task_run = TaskRun(
+        input="test input",
+        output=TaskOutput(output="test output"),
+        intermediate_outputs=intermediate_outputs,
+    )
+    assert task_run.has_thinking_training_data() == expected
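The new Finetune validation ties thinking_instructions to FinetuneDataStrategy: the field is required with final_and_intermediate and rejected with final_only. A minimal construction sketch using the same field names and placeholder values as the test's base_params:

from kiln_ai.datamodel import Finetune, FinetuneDataStrategy

finetune = Finetune(
    name="test-finetune",
    provider="openai",
    base_model_id="gpt-3.5-turbo",
    dataset_split_id="split1",
    system_message="test message",
    data_strategy=FinetuneDataStrategy.final_and_intermediate,
    # Required for final_and_intermediate; passing it with final_only raises ValueError.
    thinking_instructions="Think step by step",
)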
@@ -0,0 +1,129 @@
+import pytest
+from pydantic import BaseModel, ValidationError
+
+from kiln_ai.datamodel import (
+    PromptGenerators,
+    PromptId,
+)
+from kiln_ai.datamodel.prompt_id import is_frozen_prompt
+
+
+# Test model to validate the PromptId type
+class ModelTester(BaseModel):
+    prompt_id: PromptId
+
+
+def test_valid_prompt_generator_names():
+    """Test that valid prompt generator names are accepted"""
+    for generator in PromptGenerators:
+        model = ModelTester(prompt_id=generator.value)
+        assert model.prompt_id == generator.value
+
+
+def test_valid_saved_prompt_id():
+    """Test that valid saved prompt IDs are accepted"""
+    valid_id = "id::prompt_789"
+    model = ModelTester(prompt_id=valid_id)
+    assert model.prompt_id == valid_id
+
+
+def test_valid_fine_tune_prompt_id():
+    """Test that valid fine-tune prompt IDs are accepted"""
+    valid_id = "fine_tune_prompt::ft_123456"
+    model = ModelTester(prompt_id=valid_id)
+    assert model.prompt_id == valid_id
+
+
+@pytest.mark.parametrize(
+    "invalid_id",
+    [
+        pytest.param("id::project_123::task_456", id="missing_prompt_id"),
+        pytest.param("id::task_456::prompt_789", id="too_many_parts"),
+        pytest.param("id::", id="empty_parts"),
+    ],
+)
+def test_invalid_saved_prompt_id_format(invalid_id):
+    """Test that invalid saved prompt ID formats are rejected"""
+    with pytest.raises(ValidationError, match="Invalid saved prompt ID"):
+        ModelTester(prompt_id=invalid_id)
+
+
+@pytest.mark.parametrize(
+    "invalid_id,expected_error",
+    [
+        ("fine_tune_prompt::", "Invalid fine-tune prompt ID: fine_tune_prompt::"),
+        ("fine_tune_prompt", "Invalid prompt ID: fine_tune_prompt"),
+    ],
+)
+def test_invalid_fine_tune_prompt_id_format(invalid_id, expected_error):
+    """Test that invalid fine-tune prompt ID formats are rejected"""
+    with pytest.raises(ValidationError, match=expected_error):
+        ModelTester(prompt_id=invalid_id)
+
+
+def test_completely_invalid_formats():
+    """Test that completely invalid formats are rejected"""
+    invalid_ids = [
+        "",  # Empty string
+        "invalid_format",  # Random string
+        "id:wrong_format",  # Almost correct but wrong separator
+        "fine_tune:wrong_format",  # Almost correct but wrong prefix
+        ":::",  # Just separators
+    ]
+
+    for invalid_id in invalid_ids:
+        with pytest.raises(ValidationError, match="Invalid prompt ID"):
+            ModelTester(prompt_id=invalid_id)
+
+
+def test_prompt_generator_case_sensitivity():
+    """Test that prompt generator names are case sensitive"""
+    # Take first generator and modify its case
+    first_generator = next(iter(PromptGenerators)).value
+    wrong_case = first_generator.upper()
+    if wrong_case == first_generator:
+        wrong_case = first_generator.lower()
+
+    with pytest.raises(ValidationError):
+        ModelTester(prompt_id=wrong_case)
+
+
+@pytest.mark.parametrize(
+    "valid_id",
+    [
+        "task_run_config::project_123::task_456::config_123",  # Valid task run config prompt ID
+    ],
+)
+def test_valid_task_run_config_prompt_id(valid_id):
+    """Test that valid eval prompt IDs are accepted"""
+    model = ModelTester(prompt_id=valid_id)
+    assert model.prompt_id == valid_id
+
+
+@pytest.mark.parametrize(
+    "invalid_id,expected_error",
+    [
+        ("task_run_config::", "Invalid task run config prompt ID"),
+        ("task_run_config::p1", "Invalid task run config prompt ID"),
+        ("task_run_config::p1::t1", "Invalid task run config prompt ID"),
+        ("task_run_config::p1::t1::c1::extra", "Invalid task run config prompt ID"),
+    ],
+)
+def test_invalid_eval_prompt_id_format(invalid_id, expected_error):
+    """Test that invalid eval prompt ID formats are rejected"""
+    with pytest.raises(ValidationError, match=expected_error):
+        ModelTester(prompt_id=invalid_id)
+
+
+@pytest.mark.parametrize(
+    "id,should_be_frozen",
+    [
+        ("simple_prompt_builder", False),
+        ("id::prompt_123", True),
+        ("task_run_config::p1::t1", True),
+        ("fine_tune_prompt::ft_123", True),
+    ],
+)
+def test_is_frozen_prompt(id, should_be_frozen):
+    """Test that the is_frozen_prompt function works"""
+    assert is_frozen_prompt(id) == should_be_frozen