kiln-ai 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kiln-ai might be problematic.
- kiln_ai/adapters/ml_model_list.py +34 -12
- kiln_ai/adapters/ollama_tools.py +4 -3
- kiln_ai/adapters/provider_tools.py +15 -2
- kiln_ai/adapters/repair/repair_task.py +4 -2
- kiln_ai/adapters/test_langchain_adapter.py +183 -0
- kiln_ai/adapters/test_provider_tools.py +220 -1
- kiln_ai/datamodel/__init__.py +55 -5
- kiln_ai/datamodel/basemodel.py +92 -38
- kiln_ai/datamodel/model_cache.py +116 -0
- kiln_ai/datamodel/test_basemodel.py +138 -3
- kiln_ai/datamodel/test_model_cache.py +244 -0
- kiln_ai/datamodel/test_models.py +124 -0
- kiln_ai/utils/config.py +5 -1
- kiln_ai-0.7.1.dist-info/METADATA +237 -0
- {kiln_ai-0.7.0.dist-info → kiln_ai-0.7.1.dist-info}/RECORD +17 -15
- {kiln_ai-0.7.0.dist-info → kiln_ai-0.7.1.dist-info}/WHEEL +1 -1
- kiln_ai-0.7.0.dist-info/METADATA +0 -90
- {kiln_ai-0.7.0.dist-info → kiln_ai-0.7.1.dist-info}/licenses/LICENSE.txt +0 -0
kiln_ai/datamodel/__init__.py
CHANGED
@@ -1,3 +1,7 @@
+"""
+See our docs for details about our datamodel: https://kiln-ai.github.io/Kiln/kiln_core_docs/kiln_ai.html
+"""
+
 from __future__ import annotations

 import json
@@ -8,7 +12,12 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Type, Union

 import jsonschema
 import jsonschema.exceptions
-from pydantic import
+from pydantic import (
+    BaseModel,
+    Field,
+    ValidationInfo,
+    model_validator,
+)
 from typing_extensions import Self

 from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str
@@ -43,9 +52,25 @@ __all__ = [
     "TaskOutputRatingType",
     "TaskRequirement",
     "TaskDeterminism",
+    "strict_mode",
+    "set_strict_mode",
 ]


+# We want to be hard on ourselves for data completeness generated by the Kiln App, but don't want to make it hard for users to use the datamodel/library.
+# Strict mode enables extra validations that we want to enforce in Kiln App (and any other client that wants best practices), but not in the library (unless they opt in)
+_strict_mode: bool = False
+
+
+def strict_mode() -> bool:
+    return _strict_mode
+
+
+def set_strict_mode(value: bool) -> None:
+    global _strict_mode
+    _strict_mode = value
+
+
 class Priority(IntEnum):
     """Defines priority levels for tasks and requirements, where P0 is highest priority."""

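Example: the new strict-mode toggle is opt-in for library users. Below is a minimal sketch of flipping it and what it enforces; the field value is illustrative, and pydantic surfaces the validator's ValueError as a ValidationError (itself a ValueError subclass):

from kiln_ai.datamodel import TaskOutput, set_strict_mode, strict_mode

assert strict_mode() is False  # library default: permissive
set_strict_mode(True)  # opt in to app-grade validation

try:
    TaskOutput(output="some plaintext result")  # no source provided
except ValueError as err:
    print(err)  # complains that the output source is required in strict mode

set_strict_mode(False)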
@@ -121,8 +146,9 @@ class TaskOutput(KilnBaseModel):
     output: str = Field(
         description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
     )
-    source: DataSource = Field(
-        description="The source of the output: human or synthetic."
+    source: DataSource | None = Field(
+        description="The source of the output: human or synthetic.",
+        default=None,
     )
     rating: TaskOutputRating | None = Field(
         default=None, description="The rating of the output"
@@ -139,6 +165,18 @@ class TaskOutput(KilnBaseModel):
             raise ValueError(f"Output does not match task output schema: {e}")
         return self

+    @model_validator(mode="after")
+    def validate_output_source(self, info: ValidationInfo) -> Self:
+        # On strict mode and not loaded from file, we validate output_source is not None.
+        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
+        if not strict_mode():
+            return self
+        if self.loaded_from_file(info):
+            return self
+        if self.source is None:
+            raise ValueError("Output source is required when strict mode is enabled")
+        return self
+

 class FineTuneStatusType(str, Enum):
     """
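Example: the validator is lenient with data already on disk via the validation context. A sketch, assuming strict mode is on and using an illustrative JSON payload; this mirrors the context load_from_file passes internally:

from kiln_ai.datamodel import TaskOutput, set_strict_mode

set_strict_mode(True)

# The loader's context flag skips the strict source check for persisted data.
out = TaskOutput.model_validate_json(
    '{"output": "hello"}',
    context={"loading_from_file": True},
)
assert out.source is None  # tolerated for data loaded from file

set_strict_mode(False)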
@@ -326,8 +364,8 @@ class TaskRun(KilnParentedModel):
     input: str = Field(
         description="The inputs to the task. JSON formatted for structured input, plaintext for unstructured input."
     )
-    input_source: DataSource = Field(
-        description="The source of the input: human or synthetic."
+    input_source: DataSource | None = Field(
+        default=None, description="The source of the input: human or synthetic."
     )

     output: TaskOutput = Field(description="The output of the task run.")
@@ -392,6 +430,18 @@ class TaskRun(KilnParentedModel):
         )
         return self

+    @model_validator(mode="after")
+    def validate_input_source(self, info: ValidationInfo) -> Self:
+        # On strict mode and not loaded from file, we validate input_source is not None.
+        # We want to be able to load any data, even if it's not perfect. But we want to create perfect data when adding new data.
+        if not strict_mode():
+            return self
+        if self.loaded_from_file(info):
+            return self
+        if self.input_source is None:
+            raise ValueError("input_source is required when strict mode is enabled")
+        return self
+

 # Define the type alias for clarity
 DatasetFilter = Callable[[TaskRun], bool]
kiln_ai/datamodel/basemodel.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 import re
 import shutil
 import uuid
@@ -7,7 +8,6 @@ from builtins import classmethod
 from datetime import datetime
 from pathlib import Path
 from typing import (
-    TYPE_CHECKING,
     Any,
     Dict,
     List,
@@ -21,12 +21,14 @@ from pydantic import (
     ConfigDict,
     Field,
     ValidationError,
+    ValidationInfo,
     computed_field,
     model_validator,
 )
 from pydantic_core import ErrorDetails
 from typing_extensions import Self

+from kiln_ai.datamodel.model_cache import ModelCache
 from kiln_ai.utils.config import Config
 from kiln_ai.utils.formatting import snake_case

@@ -39,6 +41,7 @@ ID_TYPE = Optional[str]
 T = TypeVar("T", bound="KilnBaseModel")
 PT = TypeVar("PT", bound="KilnParentedModel")

+
 # Naming conventions:
 # 1) Names are filename safe as they may be used as file names. They are informational and not to be used in prompts/training/validation.
 # 2) Descrptions are for Kiln users to describe/understanding the purpose of this object. They must never be used in prompts/training/validation. Use "instruction/requirements" instead.
@@ -87,6 +90,8 @@ class KilnBaseModel(BaseModel):
     created_at: datetime = Field(default_factory=datetime.now)
     created_by: str = Field(default_factory=lambda: Config.shared().user_id)

+    _loaded_from_file: bool = False
+
     @computed_field()
     def model_type(self) -> str:
         return self.type_name()
@@ -115,7 +120,7 @@
         return cls.load_from_file(path)

     @classmethod
-    def load_from_file(cls: Type[T], path: Path) -> T:
+    def load_from_file(cls: Type[T], path: Path | str) -> T:
         """Load a model instance from a specific file path.

         Args:
@@ -128,14 +133,26 @@
             ValueError: If the loaded model is not of the expected type or version
             FileNotFoundError: If the file does not exist
         """
+        if isinstance(path, str):
+            path = Path(path)
+        cached_model = ModelCache.shared().get_model(path, cls)
+        if cached_model is not None:
+            return cached_model
         with open(path, "r") as file:
+            # modified time of file for cache invalidation. From file descriptor so it's atomic w read.
+            mtime_ns = os.fstat(file.fileno()).st_mtime_ns
             file_data = file.read()
             # TODO P2 perf: parsing the JSON twice here.
             # Once for model_type, once for model. Can't call model_validate with parsed json because enum types break; they get strings instead of enums.
             parsed_json = json.loads(file_data)
-            m = cls.model_validate_json(
+            m = cls.model_validate_json(
+                file_data,
+                strict=True,
+                context={"loading_from_file": True},
+            )
             if not isinstance(m, cls):
                 raise ValueError(f"Loaded model is not of type {cls.__name__}")
+            m._loaded_from_file = True
             file_data = None
             m.path = path
             if m.v > m.max_schema_version():
@@ -150,8 +167,21 @@
                 f"Class: {m.__class__.__name__}, id: {getattr(m, 'id', None)}, path: {path}, "
                 f"version: {m.v}, max version: {m.max_schema_version()}"
             )
+        ModelCache.shared().set_model(path, m, mtime_ns)
         return m

+    def loaded_from_file(self, info: ValidationInfo | None = None) -> bool:
+        # Two methods of indicated it's loaded from file:
+        # 1) info.context.get("loading_from_file") -> During actual loading, before we can set _loaded_from_file
+        # 2) self._loaded_from_file -> After loading, set by the loader
+        if (
+            info is not None
+            and info.context is not None
+            and info.context.get("loading_from_file", False)
+        ):
+            return True
+        return self._loaded_from_file
+
     def save_to_file(self) -> None:
         """Save the model instance to a file.

@@ -170,6 +200,9 @@
             file.write(json_data)
         # save the path so even if something like name changes, the file doesn't move
         self.path = path
+        # We could save, but invalidating will trigger load on next use.
+        # This ensures everything in cache is loaded from disk, and the cache perfectly reflects what's on disk
+        ModelCache.shared().invalidate(path)

     def delete(self) -> None:
         if self.path is None:
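Example: with the new cache, repeated loads of the same path are served from memory (keyed by path and checked against mtime), and each cache hit returns a deep copy. A sketch; the .kiln path is illustrative, and load_from_file now also accepts plain strings:

from kiln_ai.datamodel.basemodel import KilnBaseModel

a = KilnBaseModel.load_from_file("project/project.kiln")  # parses from disk, populates cache
b = KilnBaseModel.load_from_file("project/project.kiln")  # cache hit, no file parse
assert a is not b    # get_model returns a deep copy, so edits to `a` don't leak into the cache
assert a.id == b.id  # both reflect the same on-disk content

a.save_to_file()  # writes to disk, then invalidates the cache entry for this path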
@@ -178,6 +211,7 @@
         if dir_path is None:
             raise ValueError("Cannot delete model because path is not set")
         shutil.rmtree(dir_path)
+        ModelCache.shared().invalidate(self.path)
         self.path = None

     def build_path(self) -> Path | None:
@@ -197,51 +231,44 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):
     including parent reference handling and file system organization.

     Attributes:
-
+        parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.
     """

-
+    # Parent is an in memory only reference to parent. If it's set we use that. If not we'll try to load it from disk based on the path.
+    # We don't persist the parent reference to disk. See the accessors below for how we make it a clean api (parent accessor will lazy load from disk)
+    parent: Optional[KilnBaseModel] = Field(default=None, exclude=True)

-
-
-
+    def __getattribute__(self, name: str) -> Any:
+        if name == "parent":
+            return self.load_parent()
+        return super().__getattribute__(name)

-    def
-
-        if "parent" in data:
-            self.parent = data["parent"]
+    def cached_parent(self) -> Optional[KilnBaseModel]:
+        return object.__getattribute__(self, "parent")

-
-    def parent(self) -> Optional[KilnBaseModel]:
+    def load_parent(self) -> Optional[KilnBaseModel]:
         """Get the parent model instance, loading it from disk if necessary.

         Returns:
             Optional[KilnBaseModel]: The parent model instance or None if not set
         """
-
-
+        cached_parent = self.cached_parent()
+        if cached_parent is not None:
+            return cached_parent
+
         # lazy load parent from path
         if self.path is None:
             return None
-        #
+        # Note: this only works with base_filename. If we every support custom names, we need to change this.
         parent_path = (
             self.path.parent.parent.parent
             / self.__class__.parent_type().base_filename()
         )
         if parent_path is None:
             return None
-
-
-
-    @parent.setter
-    def parent(self, value: Optional[KilnBaseModel]):
-        if value is not None:
-            expected_parent_type = self.__class__.parent_type()
-            if not isinstance(value, expected_parent_type):
-                raise ValueError(
-                    f"Parent must be of type {expected_parent_type}, but was {type(value)}"
-                )
-        self._parent = value
+        loaded_parent = self.__class__.parent_type().load_from_file(parent_path)
+        self.parent = loaded_parent
+        return loaded_parent

     # Dynamically implemented by KilnParentModel method injection
     @classmethod
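Example: after this refactor, reads of `parent` go through __getattribute__ and lazy-load from disk, while cached_parent() peeks at the raw field without triggering a load. A sketch with a hypothetical on-disk layout and an illustrative child type:

from kiln_ai.datamodel import TaskRun

run = TaskRun.load_from_file("project/tasks/task-123/runs/run-1/task_run.kiln")  # hypothetical path
task = run.parent                   # first access loads the parent file from disk
assert run.cached_parent() is task  # later reads return the in-memory reference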
@@ -255,11 +282,12 @@ class KilnParentedModel(KilnBaseModel, metaclass=ABCMeta):

     @model_validator(mode="after")
     def check_parent_type(self) -> Self:
-
+        cached_parent = self.cached_parent()
+        if cached_parent is not None:
             expected_parent_type = self.__class__.parent_type()
-            if not isinstance(
+            if not isinstance(cached_parent, expected_parent_type):
                 raise ValueError(
-                    f"Parent must be of type {expected_parent_type}, but was {type(
+                    f"Parent must be of type {expected_parent_type}, but was {type(cached_parent)}"
                 )
         return self

@@ -298,9 +326,7 @@
         )

     @classmethod
-    def
-        cls: Type[PT], parent_path: Path | None
-    ) -> list[PT]:
+    def iterate_children_paths_of_parent_path(cls: Type[PT], parent_path: Path | None):
         if parent_path is None:
             # children are disk based. If not saved, they don't exist
             return []
@@ -322,13 +348,41 @@
             return []

         # Collect all /relationship/{id}/{base_filename.kiln} files in the relationship folder
-        children = []
         for child_file in relationship_folder.glob(f"**/{cls.base_filename()}"):
-
-            children.append(child)
+            yield child_file

+    @classmethod
+    def all_children_of_parent_path(
+        cls: Type[PT], parent_path: Path | None
+    ) -> list[PT]:
+        children = []
+        for child_path in cls.iterate_children_paths_of_parent_path(parent_path):
+            children.append(cls.load_from_file(child_path))
         return children

+    @classmethod
+    def from_id_and_parent_path(
+        cls: Type[PT], id: str, parent_path: Path | None
+    ) -> PT | None:
+        """
+        Fast search by ID using the cache. Avoids the model_copy overhead on all but the exact match.
+
+        Uses cache so still slow on first load.
+        """
+        if parent_path is None:
+            return None
+
+        # Note: we're using the in-file ID. We could make this faster using the path-ID if this becomes perf bottleneck, but it's better to have 1 source of truth.
+        for child_path in cls.iterate_children_paths_of_parent_path(parent_path):
+            child_id = ModelCache.shared().get_model_id(child_path, cls)
+            if child_id == id:
+                return cls.load_from_file(child_path)
+            if child_id is None:
+                child = cls.load_from_file(child_path)
+                if child.id == id:
+                    return child
+        return None
+

     # Parent create methods for all child relationships
     # You must pass in parent_of in the subclass definition, defining the child relationships
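Example: children can now be enumerated lazily by path (no parsing) or materialized as models, and from_id_and_parent_path uses the cache's get_model_id to avoid deep-copying every candidate. A sketch; the parent path and child type are illustrative:

from pathlib import Path
from kiln_ai.datamodel import TaskRun

task_path = Path("project/tasks/task-123/task.kiln")  # hypothetical saved Task

for run_path in TaskRun.iterate_children_paths_of_parent_path(task_path):
    print(run_path)  # just globbed paths, nothing parsed yet

runs = TaskRun.all_children_of_parent_path(task_path)  # parses every child
if runs:
    same = TaskRun.from_id_and_parent_path(runs[0].id, task_path)  # cache-assisted lookup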
kiln_ai/datamodel/model_cache.py
ADDED
@@ -0,0 +1,116 @@
+"""
+A simple cache for our datamodel.
+
+Works at the file level, caching the pydantic model based on the file path.
+
+Keeping this really simple. Our goal is to really be "disk-backed" data model, so using disk primitives.
+
+ - Use disk mtime to determine if the cached model is stale.
+ - Still using glob for iterating over projects, just caching at the file level
+ - Use path as the cache key
+ - Cache always populated from a disk read, so we know it refects what's on disk. Even if we had a memory-constructed version, we don't cache that.
+ - Cache the parsed model, not the raw file contents. Parsing and validating is what's expensive. >99% speedup when measured.
+"""
+
+import os
+import sys
+import warnings
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Type, TypeVar
+
+from pydantic import BaseModel
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class ModelCache:
+    _shared_instance = None
+
+    def __init__(self):
+        # Store both the model and the modified time of the cached file contents
+        self.model_cache: Dict[Path, Tuple[BaseModel, int]] = {}
+        self._enabled = self._check_timestamp_granularity()
+        if not self._enabled:
+            warnings.warn(
+                "File system does not support fine-grained timestamps. "
+                "Model caching has been disabled to ensure consistency."
+            )
+
+    @classmethod
+    def shared(cls):
+        if cls._shared_instance is None:
+            cls._shared_instance = cls()
+        return cls._shared_instance
+
+    def _is_cache_valid(self, path: Path, cached_mtime_ns: int) -> bool:
+        try:
+            current_mtime_ns = path.stat().st_mtime_ns
+        except Exception:
+            return False
+        return cached_mtime_ns == current_mtime_ns
+
+    def _get_model(self, path: Path, model_type: Type[T]) -> Optional[T]:
+        if path not in self.model_cache:
+            return None
+        model, cached_mtime_ns = self.model_cache[path]
+        if not self._is_cache_valid(path, cached_mtime_ns):
+            self.invalidate(path)
+            return None
+
+        if not isinstance(model, model_type):
+            self.invalidate(path)
+            raise ValueError(f"Model at {path} is not of type {model_type.__name__}")
+        return model
+
+    def get_model(self, path: Path, model_type: Type[T]) -> Optional[T]:
+        # We return a copy so in-memory edits don't impact the cache until they are saved
+        # Benchmark shows about 2x slower, but much more foolproof
+        model = self._get_model(path, model_type)
+        if model:
+            return model.model_copy(deep=True)
+        return None
+
+    def get_model_id(self, path: Path, model_type: Type[T]) -> Optional[str]:
+        model = self._get_model(path, model_type)
+        if model and hasattr(model, "id"):
+            id = model.id  # type: ignore
+            if isinstance(id, str):
+                return id
+        return None
+
+    def set_model(self, path: Path, model: BaseModel, mtime_ns: int):
+        # disable caching if the filesystem doesn't support fine-grained timestamps
+        if not self._enabled:
+            return
+        self.model_cache[path] = (model, mtime_ns)
+
+    def invalidate(self, path: Path):
+        if path in self.model_cache:
+            del self.model_cache[path]
+
+    def clear(self):
+        self.model_cache.clear()
+
+    def _check_timestamp_granularity(self) -> bool:
+        """Check if filesystem supports fine-grained timestamps (microseconds or better)."""
+
+        # MacOS and Windows support fine-grained timestamps
+        if sys.platform in ["darwin", "win32"]:
+            return True
+
+        # Linux supports fine-grained timestamps SOMETIMES. ext4 should work.
+        try:
+            # Get filesystem stats for the current directory
+            stats = os.statvfs(Path(__file__).parent)
+
+            # f_timespec was added in Linux 5.6 (2020)
+            # Returns nanoseconds precision as a power of 10
+            # e.g., 1 = decisecond, 2 = centisecond, 3 = millisecond, etc.
+            timespec = getattr(stats, "f_timespec", 0)
+
+            # Consider microsecond precision (6) or better as "fine-grained"
+            return timespec >= 6
+        except (AttributeError, OSError):
+            # If f_timespec isn't available or other errors occur,
+            # assume poor granularity to be safe
+            return False
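Example: the cache contract in isolation. Store a parsed model with the mtime captured at read time; a later write moves the file's mtime, so the next get detects the stale entry and evicts it. A sketch with an illustrative pydantic model; we force _enabled on (as the package's own tests do) so the demo runs even on coarse-timestamp filesystems:

import os
import time
from pathlib import Path
from pydantic import BaseModel
from kiln_ai.datamodel.model_cache import ModelCache

class Doc(BaseModel):
    name: str

path = Path("doc.json")  # illustrative path
path.write_text('{"name": "a"}')

cache = ModelCache()
cache._enabled = True  # bypass the granularity check for the demo
cache.set_model(path, Doc(name="a"), os.stat(path).st_mtime_ns)
assert cache.get_model(path, Doc) is not None  # fresh: mtimes match

time.sleep(0.01)  # ensure the next write lands on a new timestamp
path.write_text('{"name": "b"}')  # mtime changes on disk
assert cache.get_model(path, Doc) is None  # stale entry detected and evicted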
kiln_ai/datamodel/test_basemodel.py
CHANGED
@@ -2,6 +2,7 @@ import datetime
 import json
 from pathlib import Path
 from typing import Optional
+from unittest.mock import MagicMock, patch

 import pytest

@@ -10,6 +11,7 @@ from kiln_ai.datamodel.basemodel import (
     KilnParentedModel,
     string_to_valid_name,
 )
+from kiln_ai.datamodel.model_cache import ModelCache


 @pytest.fixture
@@ -45,6 +47,17 @@ def test_newer_file(tmp_path) -> Path:
     return test_file_path


+@pytest.fixture
+def tmp_model_cache():
+    temp_cache = ModelCache()
+    # We're testing integration, not cache functions, in this file
+    temp_cache._enabled = True
+    with (
+        patch("kiln_ai.datamodel.basemodel.ModelCache.shared", return_value=temp_cache),
+    ):
+        yield temp_cache
+
+
 def test_load_from_file(test_base_file):
     model = KilnBaseModel.load_from_file(test_base_file)
     assert model.v == 1
@@ -277,9 +290,8 @@ def test_lazy_load_parent(tmp_path):
     assert loaded_parent.name == "Parent"
     assert loaded_parent.path == parent.path

-    # Verify that the
-    assert
-    assert loaded_child._parent is loaded_parent
+    # Verify that the parent is cached
+    assert loaded_child.cached_parent() is loaded_parent


 def test_delete(tmp_path):
@@ -334,3 +346,126 @@ def test_string_to_valid_name():
     # Test empty string and whitespace
     assert string_to_valid_name("") == ""
     assert string_to_valid_name(" ") == ""
+
+
+def test_load_from_file_with_cache(test_base_file, tmp_model_cache):
+    tmp_model_cache.get_model = MagicMock(return_value=None)
+    tmp_model_cache.set_model = MagicMock()
+
+    # Load the model
+    model = KilnBaseModel.load_from_file(test_base_file)
+
+    # Check that the cache was checked and set
+    tmp_model_cache.get_model.assert_called_once_with(test_base_file, KilnBaseModel)
+    tmp_model_cache.set_model.assert_called_once()
+
+    # Ensure the model is correctly loaded
+    assert model.v == 1
+    assert model.path == test_base_file
+
+
+def test_save_to_file_invalidates_cache(test_base_file, tmp_model_cache):
+    # Create and save the model
+    model = KilnBaseModel(path=test_base_file)
+
+    # Set mock after to ignore any previous calls, we want to see save calls it
+    tmp_model_cache.invalidate = MagicMock()
+    model.save_to_file()
+
+    # Check that the cache was invalidated. Might be called multiple times for setting props like path. but must be called at least once.
+    tmp_model_cache.invalidate.assert_called_with(test_base_file)
+
+
+def test_delete_invalidates_cache(tmp_path, tmp_model_cache):
+    # Create and save the model
+    file_path = tmp_path / "test.kiln"
+    model = KilnBaseModel(path=file_path)
+    model.save_to_file()
+
+    # populate and check cache
+    model = KilnBaseModel.load_from_file(file_path)
+    cached_model = tmp_model_cache.get_model(file_path, KilnBaseModel)
+    assert cached_model.id == model.id
+
+    tmp_model_cache.invalidate = MagicMock()
+
+    # Delete the model
+    model.delete()
+
+    # Check that the cache was invalidated
+    tmp_model_cache.invalidate.assert_called_with(file_path)
+    assert tmp_model_cache.get_model(file_path, KilnBaseModel) is None
+
+
+def test_load_from_file_with_cached_model(test_base_file, tmp_model_cache):
+    # Set up the mock to return a cached model
+    cached_model = KilnBaseModel(v=1, path=test_base_file)
+    tmp_model_cache.get_model = MagicMock(return_value=cached_model)
+
+    with patch("builtins.open", create=True) as mock_open:
+        # Load the model
+        model = KilnBaseModel.load_from_file(test_base_file)
+
+    # Check that the cache was checked and the cached model was returned
+    tmp_model_cache.get_model.assert_called_once_with(test_base_file, KilnBaseModel)
+    assert model is cached_model
+
+    # Assert that open was not called (we used the cached model, not file)
+    mock_open.assert_not_called()
+
+
+def test_from_id_and_parent_path(test_base_parented_file, tmp_model_cache):
+    # Set up parent and children models
+    parent = BaseParentExample.load_from_file(test_base_parented_file)
+
+    child1 = DefaultParentedModel(parent=parent, name="Child1")
+    child2 = DefaultParentedModel(parent=parent, name="Child2")
+    child3 = DefaultParentedModel(parent=parent, name="Child3")
+
+    # Save all children
+    child1.save_to_file()
+    child2.save_to_file()
+    child3.save_to_file()
+
+    # Test finding existing child by ID
+    found_child = DefaultParentedModel.from_id_and_parent_path(
+        child2.id, test_base_parented_file
+    )
+    assert found_child is not None
+    assert found_child.id == child2.id
+    assert found_child.name == "Child2"
+    assert found_child is not child2  # not same instance (deep copy)
+
+    # Test non-existent ID returns None
+    not_found = DefaultParentedModel.from_id_and_parent_path(
+        "nonexistent", test_base_parented_file
+    )
+    assert not_found is None
+
+
+def test_from_id_and_parent_path_with_cache(test_base_parented_file, tmp_model_cache):
+    # Set up parent and child
+    parent = BaseParentExample.load_from_file(test_base_parented_file)
+    child = DefaultParentedModel(parent=parent, name="Child")
+    child.save_to_file()
+
+    # First load to populate cache
+    _ = DefaultParentedModel.from_id_and_parent_path(child.id, test_base_parented_file)
+
+    # Mock cache to verify it's used
+    tmp_model_cache.get_model_id = MagicMock(return_value=child.id)
+
+    # Load again - should use cache
+    found_child = DefaultParentedModel.from_id_and_parent_path(
+        child.id, test_base_parented_file
+    )
+
+    assert found_child is not None
+    assert found_child.id == child.id
+    tmp_model_cache.get_model_id.assert_called()
+
+
+def test_from_id_and_parent_path_without_parent():
+    # Test with None parent_path
+    not_found = DefaultParentedModel.from_id_and_parent_path("any-id", None)
+    assert not_found is None