kiln-ai 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kiln-ai might be problematic.
- kiln_ai/adapters/adapter_registry.py +2 -0
- kiln_ai/adapters/base_adapter.py +6 -1
- kiln_ai/adapters/langchain_adapters.py +5 -1
- kiln_ai/adapters/ml_model_list.py +43 -12
- kiln_ai/adapters/ollama_tools.py +4 -3
- kiln_ai/adapters/provider_tools.py +63 -2
- kiln_ai/adapters/repair/repair_task.py +4 -2
- kiln_ai/adapters/test_langchain_adapter.py +183 -0
- kiln_ai/adapters/test_provider_tools.py +315 -1
- kiln_ai/datamodel/__init__.py +162 -19
- kiln_ai/datamodel/basemodel.py +90 -42
- kiln_ai/datamodel/model_cache.py +116 -0
- kiln_ai/datamodel/test_basemodel.py +138 -3
- kiln_ai/datamodel/test_dataset_split.py +1 -1
- kiln_ai/datamodel/test_model_cache.py +244 -0
- kiln_ai/datamodel/test_models.py +173 -0
- kiln_ai/datamodel/test_output_rating.py +377 -10
- kiln_ai/utils/config.py +33 -10
- kiln_ai/utils/test_config.py +48 -0
- kiln_ai-0.8.0.dist-info/METADATA +237 -0
- {kiln_ai-0.7.0.dist-info → kiln_ai-0.8.0.dist-info}/RECORD +23 -21
- {kiln_ai-0.7.0.dist-info → kiln_ai-0.8.0.dist-info}/WHEEL +1 -1
- kiln_ai-0.7.0.dist-info/METADATA +0 -90
- {kiln_ai-0.7.0.dist-info → kiln_ai-0.8.0.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/basemodel.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 import re
 import shutil
 import uuid
@@ -7,7 +8,6 @@ from builtins import classmethod
 from datetime import datetime
 from pathlib import Path
 from typing import (
-    TYPE_CHECKING,
     Any,
     Dict,
     List,
@@ -21,12 +21,14 @@ from pydantic import (
     ConfigDict,
     Field,
     ValidationError,
+    ValidationInfo,
     computed_field,
     model_validator,
 )
 from pydantic_core import ErrorDetails
 from typing_extensions import Self

+from kiln_ai.datamodel.model_cache import ModelCache
 from kiln_ai.utils.config import Config
 from kiln_ai.utils.formatting import snake_case

@@ -39,6 +41,7 @@ ID_TYPE = Optional[str]
 T = TypeVar("T", bound="KilnBaseModel")
 PT = TypeVar("PT", bound="KilnParentedModel")

+
 # Naming conventions:
 # 1) Names are filename safe as they may be used as file names. They are informational and not to be used in prompts/training/validation.
 # 2) Descrptions are for Kiln users to describe/understanding the purpose of this object. They must never be used in prompts/training/validation. Use "instruction/requirements" instead.
@@ -87,6 +90,8 @@ class KilnBaseModel(BaseModel):
     created_at: datetime = Field(default_factory=datetime.now)
     created_by: str = Field(default_factory=lambda: Config.shared().user_id)

+    _loaded_from_file: bool = False
+
     @computed_field()
     def model_type(self) -> str:
         return self.type_name()
@@ -115,7 +120,7 @@ class KilnBaseModel(BaseModel):
         return cls.load_from_file(path)

     @classmethod
-    def load_from_file(cls: Type[T], path: Path) -> T:
+    def load_from_file(cls: Type[T], path: Path | str) -> T:
         """Load a model instance from a specific file path.

         Args:
@@ -128,14 +133,20 @@ class KilnBaseModel(BaseModel):
             ValueError: If the loaded model is not of the expected type or version
             FileNotFoundError: If the file does not exist
         """
+        if isinstance(path, str):
+            path = Path(path)
+        cached_model = ModelCache.shared().get_model(path, cls)
+        if cached_model is not None:
+            return cached_model
         with open(path, "r") as file:
+            # modified time of file for cache invalidation. From file descriptor so it's atomic w read.
+            mtime_ns = os.fstat(file.fileno()).st_mtime_ns
             file_data = file.read()
-            # TODO P2 perf: parsing the JSON twice here.
-            # Once for model_type, once for model. Can't call model_validate with parsed json because enum types break; they get strings instead of enums.
             parsed_json = json.loads(file_data)
-            m = cls.
+            m = cls.model_validate(parsed_json, context={"loading_from_file": True})
             if not isinstance(m, cls):
                 raise ValueError(f"Loaded model is not of type {cls.__name__}")
+            m._loaded_from_file = True
             file_data = None
         m.path = path
         if m.v > m.max_schema_version():
@@ -150,8 +161,21 @@
                 f"Class: {m.__class__.__name__}, id: {getattr(m, 'id', None)}, path: {path}, "
                 f"version: {m.v}, max version: {m.max_schema_version()}"
             )
+        ModelCache.shared().set_model(path, m, mtime_ns)
         return m

+    def loaded_from_file(self, info: ValidationInfo | None = None) -> bool:
+        # Two methods of indicated it's loaded from file:
+        # 1) info.context.get("loading_from_file") -> During actual loading, before we can set _loaded_from_file
+        # 2) self._loaded_from_file -> After loading, set by the loader
+        if (
+            info is not None
+            and info.context is not None
+            and info.context.get("loading_from_file", False)
+        ):
+            return True
+        return self._loaded_from_file
+
     def save_to_file(self) -> None:
         """Save the model instance to a file.

@@ -170,6 +194,9 @@
             file.write(json_data)
         # save the path so even if something like name changes, the file doesn't move
         self.path = path
+        # We could save, but invalidating will trigger load on next use.
+        # This ensures everything in cache is loaded from disk, and the cache perfectly reflects what's on disk
+        ModelCache.shared().invalidate(path)

     def delete(self) -> None:
         if self.path is None:
@@ -178,6 +205,7 @@
         if dir_path is None:
             raise ValueError("Cannot delete model because path is not set")
         shutil.rmtree(dir_path)
+        ModelCache.shared().invalidate(self.path)
         self.path = None

     def build_path(self) -> Path | None:
@@ -197,51 +225,44 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):
     including parent reference handling and file system organization.

     Attributes:
-
+        parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
     """

-
+    # Parent is an in memory only reference to parent. If it's set we use that. If not we'll try to load it from disk based on the path.
+    # We don't persist the parent reference to disk. See the accessors below for how we make it a clean api (parent accessor will lazy load from disk)
+    parent: Optional[KilnBaseModel] = Field(default=None, exclude=True)

-
-
-
+    def __getattribute__(self, name: str) -> Any:
+        if name == "parent":
+            return self.load_parent()
+        return super().__getattribute__(name)

-    def
-
-        if "parent" in data:
-            self.parent = data["parent"]
+    def cached_parent(self) -> Optional[KilnBaseModel]:
+        return object.__getattribute__(self, "parent")

-
-    def parent(self) -> Optional[KilnBaseModel]:
+    def load_parent(self) -> Optional[KilnBaseModel]:
         """Get the parent model instance, loading it from disk if necessary.

         Returns:
             Optional[KilnBaseModel]: The parent model instance or None if not set
         """
-
-
+        cached_parent = self.cached_parent()
+        if cached_parent is not None:
+            return cached_parent
+
         # lazy load parent from path
         if self.path is None:
             return None
-        #
+        # Note: this only works with base_filename. If we every support custom names, we need to change this.
         parent_path = (
             self.path.parent.parent.parent
             / self.__class__.parent_type().base_filename()
         )
         if parent_path is None:
             return None
-
-
-
-    @parent.setter
-    def parent(self, value: Optional[KilnBaseModel]):
-        if value is not None:
-            expected_parent_type = self.__class__.parent_type()
-            if not isinstance(value, expected_parent_type):
-                raise ValueError(
-                    f"Parent must be of type {expected_parent_type}, but was {type(value)}"
-                )
-        self._parent = value
+        loaded_parent = self.__class__.parent_type().load_from_file(parent_path)
+        self.parent = loaded_parent
+        return loaded_parent

     # Dynamically implemented by KilnParentModel method injection
     @classmethod
@@ -255,11 +276,12 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):

     @model_validator(mode="after")
     def check_parent_type(self) -> Self:
-
+        cached_parent = self.cached_parent()
+        if cached_parent is not None:
             expected_parent_type = self.__class__.parent_type()
-            if not isinstance(
+            if not isinstance(cached_parent, expected_parent_type):
                 raise ValueError(
-                    f"Parent must be of type {expected_parent_type}, but was {type(
+                    f"Parent must be of type {expected_parent_type}, but was {type(cached_parent)}"
                 )
         return self

@@ -298,9 +320,7 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):
         )

     @classmethod
-    def
-        cls: Type[PT], parent_path: Path | None
-    ) -> list[PT]:
+    def iterate_children_paths_of_parent_path(cls: Type[PT], parent_path: Path | None):
         if parent_path is None:
             # children are disk based. If not saved, they don't exist
             return []
@@ -322,13 +342,41 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):
             return []

         # Collect all /relationship/{id}/{base_filename.kiln} files in the relationship folder
-        children = []
         for child_file in relationship_folder.glob(f"**/{cls.base_filename()}"):
-
-            children.append(child)
+            yield child_file

+    @classmethod
+    def all_children_of_parent_path(
+        cls: Type[PT], parent_path: Path | None
+    ) -> list[PT]:
+        children = []
+        for child_path in cls.iterate_children_paths_of_parent_path(parent_path):
+            children.append(cls.load_from_file(child_path))
         return children

+    @classmethod
+    def from_id_and_parent_path(
+        cls: Type[PT], id: str, parent_path: Path | None
+    ) -> PT | None:
+        """
+        Fast search by ID using the cache. Avoids the model_copy overhead on all but the exact match.
+
+        Uses cache so still slow on first load.
+        """
+        if parent_path is None:
+            return None
+
+        # Note: we're using the in-file ID. We could make this faster using the path-ID if this becomes perf bottleneck, but it's better to have 1 source of truth.
+        for child_path in cls.iterate_children_paths_of_parent_path(parent_path):
+            child_id = ModelCache.shared().get_model_id(child_path, cls)
+            if child_id == id:
+                return cls.load_from_file(child_path)
+            if child_id is None:
+                child = cls.load_from_file(child_path)
+                if child.id == id:
+                    return child
+        return None
+

     # Parent create methods for all child relationships
     # You must pass in parent_of in the subclass definition, defining the child relationships
@@ -417,7 +465,7 @@ class KilnParentModel(KilnBaseModel, metaclass=ABCMeta):
         validation_errors = []

         try:
-            instance = cls.model_validate(data
+            instance = cls.model_validate(data)
             if path is not None:
                 instance.path = path
             if parent is not None and isinstance(instance, KilnParentedModel):
@@ -445,7 +493,7 @@ class KilnParentModel(KilnBaseModel, metaclass=ABCMeta):
             parent_type._validate_nested(**kwargs)
         elif issubclass(parent_type, KilnParentedModel):
             # Root node
-            subinstance = parent_type.model_validate(value
+            subinstance = parent_type.model_validate(value)
             if instance is not None:
                 subinstance.parent = instance
             if save:
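Taken together, the basemodel.py changes route load_from_file through a shared ModelCache keyed by path and invalidated by file mtime, and the method now accepts plain strings as well as Path objects. The following is a minimal caller-side sketch of that behavior, assuming a hypothetical project file that already exists on disk (the path is illustrative, not from the package):

    from kiln_ai.datamodel.basemodel import KilnBaseModel

    # Hypothetical path, used only for illustration.
    project_file = "/tmp/example_project/project.kiln"

    # First load reads and validates the JSON, then populates the shared ModelCache.
    model = KilnBaseModel.load_from_file(project_file)  # str paths are now accepted

    # While the file's mtime is unchanged, a repeat load returns a deep copy from the
    # cache and skips file I/O and pydantic validation entirely.
    model_again = KilnBaseModel.load_from_file(project_file)

    # Saving writes to disk and invalidates the cache entry, so the next load re-reads the file.
    model.save_to_file()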
kiln_ai/datamodel/model_cache.py
ADDED
@@ -0,0 +1,116 @@
+"""
+A simple cache for our datamodel.
+
+Works at the file level, caching the pydantic model based on the file path.
+
+Keeping this really simple. Our goal is to really be "disk-backed" data model, so using disk primitives.
+
+ - Use disk mtime to determine if the cached model is stale.
+ - Still using glob for iterating over projects, just caching at the file level
+ - Use path as the cache key
+ - Cache always populated from a disk read, so we know it refects what's on disk. Even if we had a memory-constructed version, we don't cache that.
+ - Cache the parsed model, not the raw file contents. Parsing and validating is what's expensive. >99% speedup when measured.
+"""
+
+import os
+import sys
+import warnings
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Type, TypeVar
+
+from pydantic import BaseModel
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class ModelCache:
+    _shared_instance = None
+
+    def __init__(self):
+        # Store both the model and the modified time of the cached file contents
+        self.model_cache: Dict[Path, Tuple[BaseModel, int]] = {}
+        self._enabled = self._check_timestamp_granularity()
+        if not self._enabled:
+            warnings.warn(
+                "File system does not support fine-grained timestamps. "
+                "Model caching has been disabled to ensure consistency."
+            )
+
+    @classmethod
+    def shared(cls):
+        if cls._shared_instance is None:
+            cls._shared_instance = cls()
+        return cls._shared_instance
+
+    def _is_cache_valid(self, path: Path, cached_mtime_ns: int) -> bool:
+        try:
+            current_mtime_ns = path.stat().st_mtime_ns
+        except Exception:
+            return False
+        return cached_mtime_ns == current_mtime_ns
+
+    def _get_model(self, path: Path, model_type: Type[T]) -> Optional[T]:
+        if path not in self.model_cache:
+            return None
+        model, cached_mtime_ns = self.model_cache[path]
+        if not self._is_cache_valid(path, cached_mtime_ns):
+            self.invalidate(path)
+            return None
+
+        if not isinstance(model, model_type):
+            self.invalidate(path)
+            raise ValueError(f"Model at {path} is not of type {model_type.__name__}")
+        return model
+
+    def get_model(self, path: Path, model_type: Type[T]) -> Optional[T]:
+        # We return a copy so in-memory edits don't impact the cache until they are saved
+        # Benchmark shows about 2x slower, but much more foolproof
+        model = self._get_model(path, model_type)
+        if model:
+            return model.model_copy(deep=True)
+        return None
+
+    def get_model_id(self, path: Path, model_type: Type[T]) -> Optional[str]:
+        model = self._get_model(path, model_type)
+        if model and hasattr(model, "id"):
+            id = model.id  # type: ignore
+            if isinstance(id, str):
+                return id
+        return None
+
+    def set_model(self, path: Path, model: BaseModel, mtime_ns: int):
+        # disable caching if the filesystem doesn't support fine-grained timestamps
+        if not self._enabled:
+            return
+        self.model_cache[path] = (model, mtime_ns)
+
+    def invalidate(self, path: Path):
+        if path in self.model_cache:
+            del self.model_cache[path]
+
+    def clear(self):
+        self.model_cache.clear()
+
+    def _check_timestamp_granularity(self) -> bool:
+        """Check if filesystem supports fine-grained timestamps (microseconds or better)."""
+
+        # MacOS and Windows support fine-grained timestamps
+        if sys.platform in ["darwin", "win32"]:
+            return True
+
+        # Linux supports fine-grained timestamps SOMETIMES. ext4 should work.
+        try:
+            # Get filesystem stats for the current directory
+            stats = os.statvfs(Path(__file__).parent)
+
+            # f_timespec was added in Linux 5.6 (2020)
+            # Returns nanoseconds precision as a power of 10
+            # e.g., 1 = decisecond, 2 = centisecond, 3 = millisecond, etc.
+            timespec = getattr(stats, "f_timespec", 0)
+
+            # Consider microsecond precision (6) or better as "fine-grained"
+            return timespec >= 6
+        except (AttributeError, OSError):
+            # If f_timespec isn't available or other errors occur,
+            # assume poor granularity to be safe
+            return False
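ModelCache stores (model, mtime_ns) tuples keyed by Path and hands out deep copies so in-memory edits never reach the cache. Below is a minimal sketch of the intended interaction, assuming an illustrative pydantic model and file path that are not part of kiln_ai:

    from pathlib import Path
    from pydantic import BaseModel
    from kiln_ai.datamodel.model_cache import ModelCache

    class Example(BaseModel):  # illustrative model, not part of kiln_ai
        id: str = "abc123"

    path = Path("/tmp/example.kiln")  # illustrative file, assumed to exist on disk
    cache = ModelCache.shared()

    # Record a model along with the file's modification time in nanoseconds.
    cache.set_model(path, Example(), path.stat().st_mtime_ns)

    # Returns a deep copy while the stored mtime still matches the file on disk.
    cached = cache.get_model(path, Example)

    # A later write to the file changes st_mtime_ns, so the next get_model evicts the
    # stale entry and returns None; invalidate(path) drops the entry explicitly.
    cache.invalidate(path)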
kiln_ai/datamodel/test_basemodel.py
CHANGED
@@ -2,6 +2,7 @@ import datetime
 import json
 from pathlib import Path
 from typing import Optional
+from unittest.mock import MagicMock, patch

 import pytest

@@ -10,6 +11,7 @@ from kiln_ai.datamodel.basemodel import (
     KilnParentedModel,
     string_to_valid_name,
 )
+from kiln_ai.datamodel.model_cache import ModelCache


 @pytest.fixture
@@ -45,6 +47,17 @@ def test_newer_file(tmp_path) -> Path:
     return test_file_path


+@pytest.fixture
+def tmp_model_cache():
+    temp_cache = ModelCache()
+    # We're testing integration, not cache functions, in this file
+    temp_cache._enabled = True
+    with (
+        patch("kiln_ai.datamodel.basemodel.ModelCache.shared", return_value=temp_cache),
+    ):
+        yield temp_cache
+
+
 def test_load_from_file(test_base_file):
     model = KilnBaseModel.load_from_file(test_base_file)
     assert model.v == 1
@@ -277,9 +290,8 @@ def test_lazy_load_parent(tmp_path):
     assert loaded_parent.name == "Parent"
     assert loaded_parent.path == parent.path

-    # Verify that the
-    assert
-    assert loaded_child._parent is loaded_parent
+    # Verify that the parent is cached
+    assert loaded_child.cached_parent() is loaded_parent


 def test_delete(tmp_path):
@@ -334,3 +346,126 @@ def test_string_to_valid_name():
     # Test empty string and whitespace
     assert string_to_valid_name("") == ""
     assert string_to_valid_name(" ") == ""
+
+
+def test_load_from_file_with_cache(test_base_file, tmp_model_cache):
+    tmp_model_cache.get_model = MagicMock(return_value=None)
+    tmp_model_cache.set_model = MagicMock()
+
+    # Load the model
+    model = KilnBaseModel.load_from_file(test_base_file)
+
+    # Check that the cache was checked and set
+    tmp_model_cache.get_model.assert_called_once_with(test_base_file, KilnBaseModel)
+    tmp_model_cache.set_model.assert_called_once()
+
+    # Ensure the model is correctly loaded
+    assert model.v == 1
+    assert model.path == test_base_file
+
+
+def test_save_to_file_invalidates_cache(test_base_file, tmp_model_cache):
+    # Create and save the model
+    model = KilnBaseModel(path=test_base_file)
+
+    # Set mock after to ignore any previous calls, we want to see save calls it
+    tmp_model_cache.invalidate = MagicMock()
+    model.save_to_file()
+
+    # Check that the cache was invalidated. Might be called multiple times for setting props like path. but must be called at least once.
+    tmp_model_cache.invalidate.assert_called_with(test_base_file)
+
+
+def test_delete_invalidates_cache(tmp_path, tmp_model_cache):
+    # Create and save the model
+    file_path = tmp_path / "test.kiln"
+    model = KilnBaseModel(path=file_path)
+    model.save_to_file()
+
+    # populate and check cache
+    model = KilnBaseModel.load_from_file(file_path)
+    cached_model = tmp_model_cache.get_model(file_path, KilnBaseModel)
+    assert cached_model.id == model.id
+
+    tmp_model_cache.invalidate = MagicMock()
+
+    # Delete the model
+    model.delete()
+
+    # Check that the cache was invalidated
+    tmp_model_cache.invalidate.assert_called_with(file_path)
+    assert tmp_model_cache.get_model(file_path, KilnBaseModel) is None
+
+
+def test_load_from_file_with_cached_model(test_base_file, tmp_model_cache):
+    # Set up the mock to return a cached model
+    cached_model = KilnBaseModel(v=1, path=test_base_file)
+    tmp_model_cache.get_model = MagicMock(return_value=cached_model)
+
+    with patch("builtins.open", create=True) as mock_open:
+        # Load the model
+        model = KilnBaseModel.load_from_file(test_base_file)
+
+    # Check that the cache was checked and the cached model was returned
+    tmp_model_cache.get_model.assert_called_once_with(test_base_file, KilnBaseModel)
+    assert model is cached_model
+
+    # Assert that open was not called (we used the cached model, not file)
+    mock_open.assert_not_called()
+
+
+def test_from_id_and_parent_path(test_base_parented_file, tmp_model_cache):
+    # Set up parent and children models
+    parent = BaseParentExample.load_from_file(test_base_parented_file)
+
+    child1 = DefaultParentedModel(parent=parent, name="Child1")
+    child2 = DefaultParentedModel(parent=parent, name="Child2")
+    child3 = DefaultParentedModel(parent=parent, name="Child3")
+
+    # Save all children
+    child1.save_to_file()
+    child2.save_to_file()
+    child3.save_to_file()
+
+    # Test finding existing child by ID
+    found_child = DefaultParentedModel.from_id_and_parent_path(
+        child2.id, test_base_parented_file
+    )
+    assert found_child is not None
+    assert found_child.id == child2.id
+    assert found_child.name == "Child2"
+    assert found_child is not child2  # not same instance (deep copy)
+
+    # Test non-existent ID returns None
+    not_found = DefaultParentedModel.from_id_and_parent_path(
+        "nonexistent", test_base_parented_file
+    )
+    assert not_found is None
+
+
+def test_from_id_and_parent_path_with_cache(test_base_parented_file, tmp_model_cache):
+    # Set up parent and child
+    parent = BaseParentExample.load_from_file(test_base_parented_file)
+    child = DefaultParentedModel(parent=parent, name="Child")
+    child.save_to_file()
+
+    # First load to populate cache
+    _ = DefaultParentedModel.from_id_and_parent_path(child.id, test_base_parented_file)
+
+    # Mock cache to verify it's used
+    tmp_model_cache.get_model_id = MagicMock(return_value=child.id)
+
+    # Load again - should use cache
+    found_child = DefaultParentedModel.from_id_and_parent_path(
+        child.id, test_base_parented_file
+    )
+
+    assert found_child is not None
+    assert found_child.id == child.id
+    tmp_model_cache.get_model_id.assert_called()
+
+
+def test_from_id_and_parent_path_without_parent():
+    # Test with None parent_path
+    not_found = DefaultParentedModel.from_id_and_parent_path("any-id", None)
+    assert not_found is None