PyPI - kiln-ai - Versions diffs - 0.8.0__py3-none-any.whl → 0.11.1__py3-none-any.whl - Mend

kiln-ai 0.8.0 (py3-none-any.whl) → 0.11.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of kiln-ai might be problematic. Click here for more details.

Files changed (57)

kiln_ai/adapters/__init__.py +7 -7
kiln_ai/adapters/adapter_registry.py +77 -5
kiln_ai/adapters/data_gen/data_gen_task.py +3 -3
kiln_ai/adapters/data_gen/test_data_gen_task.py +23 -3
kiln_ai/adapters/fine_tune/base_finetune.py +5 -1
kiln_ai/adapters/fine_tune/dataset_formatter.py +310 -65
kiln_ai/adapters/fine_tune/fireworks_finetune.py +47 -32
kiln_ai/adapters/fine_tune/openai_finetune.py +12 -11
kiln_ai/adapters/fine_tune/test_base_finetune.py +19 -0
kiln_ai/adapters/fine_tune/test_dataset_formatter.py +469 -129
kiln_ai/adapters/fine_tune/test_fireworks_tinetune.py +113 -21
kiln_ai/adapters/fine_tune/test_openai_finetune.py +125 -14
kiln_ai/adapters/ml_model_list.py +323 -94
kiln_ai/adapters/model_adapters/__init__.py +18 -0
kiln_ai/adapters/{base_adapter.py → model_adapters/base_adapter.py} +81 -37
kiln_ai/adapters/{langchain_adapters.py → model_adapters/langchain_adapters.py} +130 -84
kiln_ai/adapters/model_adapters/openai_compatible_config.py +11 -0
kiln_ai/adapters/model_adapters/openai_model_adapter.py +246 -0
kiln_ai/adapters/model_adapters/test_base_adapter.py +190 -0
kiln_ai/adapters/{test_langchain_adapter.py → model_adapters/test_langchain_adapter.py} +103 -88
kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +225 -0
kiln_ai/adapters/{test_saving_adapter_results.py → model_adapters/test_saving_adapter_results.py} +43 -15
kiln_ai/adapters/{test_structured_output.py → model_adapters/test_structured_output.py} +93 -20
kiln_ai/adapters/parsers/__init__.py +10 -0
kiln_ai/adapters/parsers/base_parser.py +12 -0
kiln_ai/adapters/parsers/json_parser.py +37 -0
kiln_ai/adapters/parsers/parser_registry.py +19 -0
kiln_ai/adapters/parsers/r1_parser.py +69 -0
kiln_ai/adapters/parsers/test_json_parser.py +81 -0
kiln_ai/adapters/parsers/test_parser_registry.py +32 -0
kiln_ai/adapters/parsers/test_r1_parser.py +144 -0
kiln_ai/adapters/prompt_builders.py +126 -20
kiln_ai/adapters/provider_tools.py +91 -36
kiln_ai/adapters/repair/repair_task.py +17 -6
kiln_ai/adapters/repair/test_repair_task.py +4 -4
kiln_ai/adapters/run_output.py +8 -0
kiln_ai/adapters/test_adapter_registry.py +177 -0
kiln_ai/adapters/test_generate_docs.py +69 -0
kiln_ai/adapters/test_prompt_adaptors.py +8 -4
kiln_ai/adapters/test_prompt_builders.py +190 -29
kiln_ai/adapters/test_provider_tools.py +268 -46
kiln_ai/datamodel/__init__.py +199 -12
kiln_ai/datamodel/basemodel.py +31 -11
kiln_ai/datamodel/json_schema.py +8 -3
kiln_ai/datamodel/model_cache.py +8 -3
kiln_ai/datamodel/test_basemodel.py +81 -2
kiln_ai/datamodel/test_dataset_split.py +100 -3
kiln_ai/datamodel/test_example_models.py +25 -4
kiln_ai/datamodel/test_model_cache.py +24 -0
kiln_ai/datamodel/test_model_perf.py +125 -0
kiln_ai/datamodel/test_models.py +129 -0
kiln_ai/utils/exhaustive_error.py +6 -0
{kiln_ai-0.8.0.dist-info → kiln_ai-0.11.1.dist-info}/METADATA +9 -7
kiln_ai-0.11.1.dist-info/RECORD +76 -0
kiln_ai-0.8.0.dist-info/RECORD +0 -58
{kiln_ai-0.8.0.dist-info → kiln_ai-0.11.1.dist-info}/WHEEL +0 -0
{kiln_ai-0.8.0.dist-info → kiln_ai-0.11.1.dist-info}/licenses/LICENSE.txt +0 -0

kiln_ai/datamodel/__init__.py CHANGED Viewed

@@ -49,11 +49,18 @@ __all__ = [
     "DataSource",
     "DataSourceType",
     "DataSourceProperty",
+    "Finetune",
+    "FineTuneStatusType",
     "TaskOutputRatingType",
     "TaskRequirement",
     "TaskDeterminism",
+    "DatasetSplitDefinition",
+    "DatasetSplit",
+    "RequirementRating",
+    "TaskRequirement",
     "strict_mode",
     "set_strict_mode",
+    "Prompt",
 ]
@@ -268,12 +275,47 @@ class FineTuneStatusType(str, Enum):
     failed = "failed"
+class StructuredOutputMode(str, Enum):
+    """
+    Enumeration of supported structured output modes.
+    - default: let the adapter decide
+    - json_schema: request json using API capabilities for json_schema
+    - function_calling: request json using API capabilities for function calling
+    - json_mode: request json using API's JSON mode, which should return valid JSON, but isn't checking/passing the schema
+    - json_instructions: append instructions to the prompt to request json matching the schema. No API capabilities are used. You should have a custom parser on these models as they will be returning strings.
+    - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries).
+    """
+    default = "default"
+    json_schema = "json_schema"
+    function_calling = "function_calling"
+    json_mode = "json_mode"
+    json_instructions = "json_instructions"
+    json_instruction_and_object = "json_instruction_and_object"
+class FinetuneDataStrategy(str, Enum):
+    final_only = "final_only"
+    final_and_intermediate = "final_and_intermediate"
 class Finetune(KilnParentedModel):
+    """
+    The Kiln fine-tune datamodel.
+    Initially holds a reference to a training job, with needed identifiers to update the status. When complete, contains the new model ID.
+    """
     name: str = NAME_FIELD
     description: str | None = Field(
         default=None,
         description="A description of the fine-tune for you and your team. Not used in training.",
     )
+    structured_output_mode: StructuredOutputMode | None = Field(
+        default=None,
+        description="The mode to use to train the model for structured output, if it was trained with structured output. Will determine how we call the tuned model, so we call with the matching mode.",
+    )
     provider: str = Field(
         description="The provider to use for the fine-tune (e.g. 'openai')."
     )
@@ -303,9 +345,14 @@ class Finetune(KilnParentedModel):
         default={},
         description="The parameters to use for this fine-tune. These are provider-specific.",
     )
+    # These two fields are saved exactly as used for training. Even if they map exactly to a custom prompt or generator, those can change, so we want to keep a record of the training prompt.
     system_message: str = Field(
         description="The system message to use for this fine-tune.",
     )
+    thinking_instructions: str | None = Field(
+        default=None,
+        description="The thinking instructions to use for this fine-tune. Only used when data_strategy is final_and_intermediate.",
+    )
     latest_status: FineTuneStatusType = Field(
         default=FineTuneStatusType.unknown,
         description="The latest known status of this fine-tune. Not updated in real time.",
@@ -314,12 +361,34 @@ class Finetune(KilnParentedModel):
         default={},
         description="Properties of the fine-tune. Different providers may use different properties.",
     )
+    data_strategy: FinetuneDataStrategy = Field(
+        default=FinetuneDataStrategy.final_only,
+        description="The strategy to use for training the model. 'final_only' will only train on the final response. 'final_and_intermediate' will train on the final response and intermediate outputs (chain of thought or reasoning).",
+    )
     def parent_task(self) -> Task | None:
         if not isinstance(self.parent, Task):
             return None
         return self.parent
+    @model_validator(mode="after")
+    def validate_thinking_instructions(self) -> Self:
+        if (
+            self.thinking_instructions is not None
+            and self.data_strategy != FinetuneDataStrategy.final_and_intermediate
+        ):
+            raise ValueError(
+                "Thinking instructions can only be used when data_strategy is final_and_intermediate"
+            )
+        if (
+            self.thinking_instructions is None
+            and self.data_strategy == FinetuneDataStrategy.final_and_intermediate
+        ):
+            raise ValueError(
+                "Thinking instructions are required when data_strategy is final_and_intermediate"
+            )
+        return self
 class DataSourceType(str, Enum):
     """
@@ -391,6 +460,13 @@ class DataSource(BaseModel):
             type=str,
             not_allowed_for=[DataSourceType.human],
         ),
+        DataSourceProperty(
+            # Optional: an ID within the scope of the prompt_builder_name.
+            # Used for prompt builders with IDs (like saved prompts, fine-tune prompts)
+            name="prompt_id",
+            type=str,
+            not_allowed_for=[DataSourceType.human],
+        ),
     ]
     @model_validator(mode="after")
@@ -464,13 +540,39 @@ class TaskRun(KilnParentedModel):
         description="Tags for the task run. Tags are used to categorize task runs for filtering and reporting.",
     )
+    def has_thinking_training_data(self) -> bool:
+        """
+        Does this run have thinking data that we can use to train a thinking model?
+        """
+        if self.intermediate_outputs is None:
+            return False
+        return (
+            "chain_of_thought" in self.intermediate_outputs
+            or "reasoning" in self.intermediate_outputs
+        )
     def parent_task(self) -> Task | None:
         if not isinstance(self.parent, Task):
             return None
         return self.parent
     @model_validator(mode="after")
-    def validate_input_format(self) -> Self:
+    def validate_input_format(self, info: ValidationInfo) -> Self:
+        # Don't validate if loading from file (not new). Too slow.
+        # We don't allow changing task schema, so this is redundant validation.
+        # Note: we still validate if editing a loaded model
+        if self.loading_from_file(info):
+            # Consider loading an existing model as validated.
+            self._last_validated_input = self.input
+            return self
+        # Don't validate if input has not changed. Too slow to run this every time.
+        if (
+            hasattr(self, "_last_validated_input")
+            and self.input == self._last_validated_input
+        ):
+            return self
         task = self.parent_task()
         if task is None:
             # don't validate this relationship until we have a path or parent. Give them time to build it (but will catch it before saving)
@@ -484,15 +586,33 @@ class TaskRun(KilnParentedModel):
                 raise ValueError("Input is not a valid JSON object")
             except jsonschema.exceptions.ValidationError as e:
                 raise ValueError(f"Input does not match task input schema: {e}")
+        self._last_validated_input = self.input
         return self
     @model_validator(mode="after")
-    def validate_output_format(self) -> Self:
+    def validate_output_format(self, info: ValidationInfo) -> Self:
+        # Don't validate if loading from file (not new). Too slow.
+        # Note: we still validate if editing a loaded model's output.
+        if self.loading_from_file(info):
+            # Consider loading an existing model as validated.
+            self._last_validated_output = self.output.output if self.output else None
+            return self
+        # Don't validate unless output has changed since last validation.
+        # The validator is slow and costly, don't want it running when setting other fields.
+        if (
+            hasattr(self, "_last_validated_output")
+            and self.output is not None
+            and self.output.output == self._last_validated_output
+        ):
+            return self
         task = self.parent_task()
         if task is None:
             return self
         self.output.validate_output_format(task)
+        self._last_validated_output = self.output.output if self.output else None
         return self
     @model_validator(mode="after")
@@ -544,11 +664,47 @@ def AllDatasetFilter(_: TaskRun) -> bool:
 def HighRatingDatasetFilter(task_run: TaskRun) -> bool:
-    if task_run.output is None or task_run.output.rating is None:
+    if task_run.output is None:
+        return False
+    if task_run.repaired_output is not None:
+        # Repairs always considered high quality
+        return True
+    if task_run.output.rating is None:
         return False
     return task_run.output.rating.is_high_quality()
+def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool:
+    """
+    A filter that returns True if the task has intermediate outputs we can train a 'thinking' model on (reasoning or chain of thought)
+    """
+    return task_run.has_thinking_training_data()
+def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool:
+    """
+    A filter that returns True if the task has thinking data and the output is high quality
+    """
+    return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run)
+class DatasetFilterType(str, Enum):
+    """Dataset filter names."""
+    ALL = "all"
+    HIGH_RATING = "high_rating"
+    THINKING_MODEL = "thinking_model"
+    THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated"
+dataset_filters = {
+    DatasetFilterType.ALL: AllDatasetFilter,
+    DatasetFilterType.HIGH_RATING: HighRatingDatasetFilter,
+    DatasetFilterType.THINKING_MODEL: ThinkingModelDatasetFilter,
+    DatasetFilterType.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter,
+}
 class DatasetSplitDefinition(BaseModel):
     """
     A definition of a split in a dataset.
@@ -580,6 +736,11 @@ Train60Test20Val20SplitDefinition: list[DatasetSplitDefinition] = [
     DatasetSplitDefinition(name="test", percentage=0.2),
     DatasetSplitDefinition(name="val", percentage=0.2),
 ]
+Train80Test10Val10SplitDefinition: list[DatasetSplitDefinition] = [
+    DatasetSplitDefinition(name="train", percentage=0.8),
+    DatasetSplitDefinition(name="test", percentage=0.1),
+    DatasetSplitDefinition(name="val", percentage=0.1),
+]
 class DatasetSplit(KilnParentedModel):
@@ -603,6 +764,10 @@ class DatasetSplit(KilnParentedModel):
     split_contents: dict[str, list[str]] = Field(
         description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.",
     )
+    filter: DatasetFilterType | None = Field(
+        default=None,
+        description="The filter used to build the dataset.",
+    )
     @model_validator(mode="after")
     def validate_split_percentages(self) -> "DatasetSplit":
@@ -617,12 +782,13 @@ class DatasetSplit(KilnParentedModel):
         name: str,
         task: "Task",
         splits: list[DatasetSplitDefinition],
-        filter: DatasetFilter = AllDatasetFilter,
+        filter_type: DatasetFilterType = DatasetFilterType.ALL,
         description: str | None = None,
     ):
         """
         Build a dataset split from a task.
         """
+        filter = dataset_filters[filter_type]
         split_contents = cls.build_split_contents(task, splits, filter)
         return cls(
             parent=task,
@@ -630,6 +796,7 @@ class DatasetSplit(KilnParentedModel):
             description=description,
             splits=splits,
             split_contents=split_contents,
+            filter=filter_type,
         )
     @classmethod
@@ -680,7 +847,7 @@ class DatasetSplit(KilnParentedModel):
         if parent is None:
             raise ValueError("DatasetSplit has no parent task")
-        runs = parent.runs()
+        runs = parent.runs(readonly=True)
         all_ids = set(run.id for run in runs)
         all_ids_in_splits = set()
         for ids in self.split_contents.values():
@@ -689,6 +856,22 @@ class DatasetSplit(KilnParentedModel):
         return len(missing)
+class Prompt(KilnParentedModel):
+    """
+    A prompt for a task.
+    """
+    name: str = NAME_FIELD
+    prompt: str = Field(
+        description="The prompt for the task.",
+        min_length=1,
+    )
+    chain_of_thought_instructions: str | None = Field(
+        default=None,
+        description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided.",
+    )
 class TaskRequirement(BaseModel):
     """
     Defines a specific requirement that should be met by task outputs.
@@ -726,6 +909,7 @@ class Task(
         "runs": TaskRun,
         "dataset_splits": DatasetSplit,
         "finetunes": Finetune,
+        "prompts": Prompt,
     },
 ):
     """
@@ -762,15 +946,18 @@ class Task(
             return None
         return schema_from_json_str(self.input_json_schema)
-    # Needed for typechecking. TODO P2: fix this in KilnParentModel
-    def runs(self) -> list[TaskRun]:
-        return super().runs()  # type: ignore
+    # These wrappers help for typechecking. TODO P2: fix this in KilnParentModel
+    def runs(self, readonly: bool = False) -> list[TaskRun]:
+        return super().runs(readonly=readonly)  # type: ignore
+    def dataset_splits(self, readonly: bool = False) -> list[DatasetSplit]:
+        return super().dataset_splits(readonly=readonly)  # type: ignore
-    def dataset_splits(self) -> list[DatasetSplit]:
-        return super().dataset_splits()  # type: ignore
+    def finetunes(self, readonly: bool = False) -> list[Finetune]:
+        return super().finetunes(readonly=readonly)  # type: ignore
-    def finetunes(self) -> list[Finetune]:
-        return super().finetunes()  # type: ignore
+    def prompts(self, readonly: bool = False) -> list[Prompt]:
+        return super().prompts(readonly=readonly)  # type: ignore
 class Project(KilnParentModel, parent_of={"tasks": Task}):

kiln_ai/datamodel/basemodel.py CHANGED Viewed

@@ -120,11 +120,12 @@ class KilnBaseModel(BaseModel):
         return cls.load_from_file(path)
     @classmethod
-    def load_from_file(cls: Type[T], path: Path | str) -> T:
+    def load_from_file(cls: Type[T], path: Path | str, readonly: bool = False) -> T:
         """Load a model instance from a specific file path.
         Args:
             path (Path): Path to the model file
+            readonly (bool): If True, the model will be returned in readonly mode (cached instance, not a copy, not safe to mutate)
         Returns:
             T: Instance of the model
@@ -135,10 +136,10 @@ class KilnBaseModel(BaseModel):
         """
         if isinstance(path, str):
             path = Path(path)
-        cached_model = ModelCache.shared().get_model(path, cls)
+        cached_model = ModelCache.shared().get_model(path, cls, readonly=readonly)
         if cached_model is not None:
             return cached_model
-        with open(path, "r") as file:
+        with open(path, "r", encoding="utf-8") as file:
             # modified time of file for cache invalidation. From file descriptor so it's atomic w read.
             mtime_ns = os.fstat(file.fileno()).st_mtime_ns
             file_data = file.read()
@@ -168,13 +169,20 @@ class KilnBaseModel(BaseModel):
         # Two ways of indicating it's loaded from file:
         # 1) info.context.get("loading_from_file") -> During actual loading, before we can set _loaded_from_file
         # 2) self._loaded_from_file -> After loading, set by the loader
+        if self.loading_from_file(info):
+            return True
+        return self._loaded_from_file
+    # indicates the model is currently being loaded from file (not mutating it after)
+    def loading_from_file(self, info: ValidationInfo | None = None) -> bool:
+        # info.context.get("loading_from_file") -> During actual loading, before we can set _loaded_from_file
         if (
             info is not None
             and info.context is not None
             and info.context.get("loading_from_file", False)
         ):
             return True
-        return self._loaded_from_file
+        return False
     def save_to_file(self) -> None:
         """Save the model instance to a file.
@@ -190,7 +198,7 @@ class KilnBaseModel(BaseModel):
             )
         path.parent.mkdir(parents=True, exist_ok=True)
         json_data = self.model_dump_json(indent=2, exclude={"path"})
-        with open(path, "w") as file:
+        with open(path, "w", encoding="utf-8") as file:
             file.write(json_data)
         # save the path so even if something like name changes, the file doesn't move
         self.path = path
@@ -342,16 +350,28 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):
             return []
         # Collect all /relationship/{id}/{base_filename.kiln} files in the relationship folder
-        for child_file in relationship_folder.glob(f"**/{cls.base_filename()}"):
-            yield child_file
+        # manual code instead of glob for performance (5x speedup over glob)
+        base_filename = cls.base_filename()
+        # Iterate through immediate subdirectories using scandir for better performance
+        # Benchmark: scandir is 10x faster than glob, so worth the extra code
+        with os.scandir(relationship_folder) as entries:
+            for entry in entries:
+                if not entry.is_dir():
+                    continue
+                child_file = Path(entry.path) / base_filename
+                if child_file.is_file():
+                    yield child_file
     @classmethod
     def all_children_of_parent_path(
-        cls: Type[PT], parent_path: Path | None
+        cls: Type[PT], parent_path: Path | None, readonly: bool = False
     ) -> list[PT]:
         children = []
         for child_path in cls.iterate_children_paths_of_parent_path(parent_path):
-            children.append(cls.load_from_file(child_path))
+            item = cls.load_from_file(child_path, readonly=readonly)
+            children.append(item)
         return children
     @classmethod
@@ -394,8 +414,8 @@ class KilnParentModel(KilnBaseModel, metaclass=ABCMeta):
     def _create_child_method(
         cls, relationship_name: str, child_class: Type[KilnParentedModel]
     ):
-        def child_method(self) -> list[child_class]:
-            return child_class.all_children_of_parent_path(self.path)
+        def child_method(self, readonly: bool = False) -> list[child_class]:
+            return child_class.all_children_of_parent_path(self.path, readonly=readonly)
         child_method.__name__ = relationship_name
         child_method.__annotations__ = {"return": List[child_class]}

kiln_ai/datamodel/json_schema.py CHANGED Viewed

@@ -42,9 +42,14 @@ def validate_schema(instance: Dict, schema_str: str) -> None:
         jsonschema.exceptions.ValidationError: If validation fails
         ValueError: If the schema is invalid
     """
-    schema = schema_from_json_str(schema_str)
-    v = jsonschema.Draft202012Validator(schema)
-    return v.validate(instance)
+    try:
+        schema = schema_from_json_str(schema_str)
+        v = jsonschema.Draft202012Validator(schema)
+        v.validate(instance)
+    except jsonschema.exceptions.ValidationError as e:
+        raise ValueError(
+            f"This task requires a specific output schema. While the model produced JSON, that JSON didn't meet the schema. Search 'Troubleshooting Structured Data Issues' in our docs for more information. The error from the schema check was: {e.message}"
+        ) from e
 def schema_from_json_str(v: str) -> Dict:

kiln_ai/datamodel/model_cache.py CHANGED Viewed

@@ -62,12 +62,17 @@ class ModelCache:
             raise ValueError(f"Model at {path} is not of type {model_type.__name__}")
         return model
-    def get_model(self, path: Path, model_type: Type[T]) -> Optional[T]:
-        # We return a copy so in-memory edits don't impact the cache until they are saved
+    def get_model(
+        self, path: Path, model_type: Type[T], readonly: bool = False
+    ) -> Optional[T]:
+        # We return a copy by default, so in-memory edits don't impact the cache until they are saved
         # Benchmark shows about 2x slower, but much more foolproof
         model = self._get_model(path, model_type)
         if model:
-            return model.model_copy(deep=True)
+            if readonly:
+                return model
+            else:
+                return model.model_copy(deep=True)
         return None
     def get_model_id(self, path: Path, model_type: Type[T]) -> Optional[str]:

kiln_ai/datamodel/test_basemodel.py CHANGED Viewed

@@ -6,6 +6,9 @@ from unittest.mock import MagicMock, patch
 import pytest
+from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter
+from kiln_ai.adapters.run_output import RunOutput
+from kiln_ai.datamodel import Task, TaskRun
 from kiln_ai.datamodel.basemodel import (
     KilnBaseModel,
     KilnParentedModel,
@@ -356,7 +359,9 @@ def test_load_from_file_with_cache(test_base_file, tmp_model_cache):
     model = KilnBaseModel.load_from_file(test_base_file)
     # Check that the cache was checked and set
-    tmp_model_cache.get_model.assert_called_once_with(test_base_file, KilnBaseModel)
+    tmp_model_cache.get_model.assert_called_once_with(
+        test_base_file, KilnBaseModel, readonly=False
+    )
     tmp_model_cache.set_model.assert_called_once()
     # Ensure the model is correctly loaded
@@ -407,7 +412,9 @@ def test_load_from_file_with_cached_model(test_base_file, tmp_model_cache):
         model = KilnBaseModel.load_from_file(test_base_file)
         # Check that the cache was checked and the cached model was returned
-        tmp_model_cache.get_model.assert_called_once_with(test_base_file, KilnBaseModel)
+        tmp_model_cache.get_model.assert_called_once_with(
+            test_base_file, KilnBaseModel, readonly=False
+        )
         assert model is cached_model
         # Assert that open was not called (we used the cached model, not file)
@@ -469,3 +476,75 @@ def test_from_id_and_parent_path_without_parent():
     # Test with None parent_path
     not_found = DefaultParentedModel.from_id_and_parent_path("any-id", None)
     assert not_found is None
+class MockAdapter(BaseAdapter):
+    """Implementation of BaseAdapter for testing"""
+    async def _run(self, input):
+        return RunOutput(output="test output", intermediate_outputs=None)
+    def adapter_info(self) -> AdapterInfo:
+        return AdapterInfo(
+            adapter_name="test",
+            model_name=self.model_name,
+            model_provider=self.model_provider_name,
+            prompt_builder_name="test",
+        )
+@pytest.fixture
+def base_task():
+    return Task(name="test_task", instruction="test_instruction")
+@pytest.fixture
+def adapter(base_task):
+    return MockAdapter(
+        kiln_task=base_task,
+        model_name="test_model",
+        model_provider_name="test_provider",
+    )
+async def test_invoke_parsing_flow(adapter):
+    # Mock dependencies
+    mock_provider = MagicMock()
+    mock_provider.parser = "test_parser"
+    mock_parser = MagicMock()
+    mock_parser.parse_output.return_value = RunOutput(
+        output="parsed test output", intermediate_outputs={"key": "value"}
+    )
+    mock_parser_class = MagicMock(return_value=mock_parser)
+    with (
+        patch.object(adapter, "model_provider", return_value=mock_provider),
+        patch(
+            "kiln_ai.adapters.model_adapters.base_adapter.model_parser_from_id",
+            return_value=mock_parser_class,
+        ),
+        patch("kiln_ai.adapters.model_adapters.base_adapter.Config") as mock_config,
+    ):
+        # Disable autosaving for this test
+        mock_config.shared.return_value.autosave_runs = False
+        mock_config.shared.return_value.user_id = "test_user_id"
+        # Execute
+        result = await adapter.invoke("test input")
+        # Verify parser was created correctly
+        mock_parser_class.assert_called_once_with(structured_output=False)
+        # Verify parsing occurred
+        mock_parser.parse_output.assert_called_once()
+        parsed_args = mock_parser.parse_output.call_args[1]
+        assert isinstance(parsed_args["original_output"], RunOutput)
+        assert parsed_args["original_output"].output == "test output"
+        # Verify result contains parsed output
+        assert isinstance(result, TaskRun)
+        assert result.output.output == "parsed test output"
+        assert result.intermediate_outputs == {"key": "value"}
+        assert result.input == "test input"

kiln-ai 0.8.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

Potentially problematic release.

kiln-ai 0.8.0 (py3-none-any.whl) → 0.11.1 (py3-none-any.whl)