yes3 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yes3/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .s3 import S3Location, is_s3_url
@@ -0,0 +1,6 @@
1
+ from .base import Cache, CacheCore, CachedItemMeta, Serializer, check_meta_mismatches
2
+ from .local_cache import LocalDiskCache
3
+ from .memory_cache import MemoryCache
4
+ from .multi_cache import MultiCache
5
+ from .s3_cache import S3Cache
6
+ from .setup_helpers import setup_cache
yes3/caching/base.py ADDED
@@ -0,0 +1,302 @@
1
+ from abc import ABCMeta, abstractmethod
2
+ from collections.abc import Callable
3
+ from dataclasses import dataclass
4
+ from datetime import datetime, UTC
5
+ from typing import Iterable, Iterator, Optional, Self
6
+
7
# Unique sentinel used as the default for ``default`` parameters, so that a
# caller-supplied ``None`` can be told apart from "no default given".
_NotSpecified = object()
8
+
9
+
10
def raise_not_found(key) -> KeyError:
    """Raise the ``KeyError`` reported when ``key`` is missing from a cache.

    This helper never returns normally: it always raises.

    Raises:
        KeyError: always, with a message naming ``key``.
    """
    message = f"key '{key}' not found in cache"
    raise KeyError(message)
12
+
13
+
14
+ @dataclass
15
+ class CachedItemMeta:
16
+ key: str
17
+ path: Optional[str]
18
+ size: Optional[int]
19
+ timestamp: Optional[datetime]
20
+
21
+ _ts_format = '%Y-%m-%d %H:%M:%S.%f %z'
22
+
23
+ def __post_init__(self):
24
+ if isinstance(self.timestamp, float):
25
+ self.timestamp = datetime.fromtimestamp(self.timestamp, UTC)
26
+ if isinstance(self.timestamp, str):
27
+ self.timestamp = datetime.strptime(self.timestamp, self._ts_format)
28
+
29
+ def to_dict(self) -> dict:
30
+ return {
31
+ 'key': self.key,
32
+ 'path': self.path,
33
+ 'size': self.size,
34
+ 'timestamp': self.timestamp.strftime(self._ts_format) if self.timestamp else None,
35
+ }
36
+
37
+
38
class CacheCore(metaclass=ABCMeta):
    """Abstract base class defining the mapping-like interface shared by all caches.

    Subclasses implement membership, retrieval, metadata lookup, insertion,
    removal and key listing. This class supplies item-access syntax, the
    active / read-only flags, and a few convenience methods built on top of
    the abstract ones.
    """

    def __init__(self, active=True, read_only=False):
        self._read_only = read_only
        self._active = active

    @abstractmethod
    def __contains__(self, key):
        """Return whether ``key`` is present in the cache."""

    @abstractmethod
    def get(self, key, default=_NotSpecified):
        """Return the object cached under ``key``; see subclasses for ``default`` handling."""

    @abstractmethod
    def get_meta(self, key) -> CachedItemMeta:
        """Return the metadata record for ``key``."""

    @abstractmethod
    def put(self, key, obj, update=False, meta: Optional[CachedItemMeta] = None):
        """Store ``obj`` under ``key``."""

    @abstractmethod
    def remove(self, key):
        """Remove ``key`` from the cache."""

    @abstractmethod
    def keys(self):
        """Return the keys currently held by the cache."""

    def __getitem__(self, key: str):
        return self.get(key)

    def __setitem__(self, key: str, obj) -> None:
        self.put(key, obj)

    def __delitem__(self, key: str) -> None:
        self.remove(key)

    def is_active(self) -> bool:
        """Return the current value of the active flag."""
        return self._active

    def activate(self):
        """Set the active flag and return ``self``."""
        self._active = True
        return self

    def deactivate(self):
        """Clear the active flag and return ``self``."""
        self._active = False
        return self

    def is_read_only(self) -> bool:
        """Return the current value of the read-only flag."""
        return self._read_only

    def set_read_only(self, value: bool) -> Self:
        """Set the read-only flag to ``value`` and return ``self``."""
        self._read_only = value
        return self

    def update(self, key: str, obj):
        """Overwrite the object stored under an existing ``key``.

        Raises:
            KeyError: if ``key`` is not in the cache.
        """
        present = key in self
        if not present:
            raise_not_found(key)
        self.put(key, obj, update=True)

    def pop(self, key: str, default=_NotSpecified):
        """Fetch the object for ``key`` via ``get``, call ``remove`` on the key, and return the object."""
        value = self.get(key, default=default)
        self.remove(key)
        return value

    def list(self) -> dict[str, CachedItemMeta]:
        """Return a dict mapping every key from ``keys()`` to its metadata."""
        return {name: self.get_meta(name) for name in self.keys()}

    def subcache(self, *args, **kwargs) -> Self:
        """Not supported by default; subclasses may override.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        class_name = type(self).__name__
        raise NotImplementedError(f"`subcache` method is not defined for class {class_name}")
112
+
113
+
114
class CacheReaderWriter(metaclass=ABCMeta):
    """Abstract storage backend interface: reads, writes and deletes cached objects by key."""

    @abstractmethod
    def read(self, key: str):
        """Return the object stored under ``key``."""
        ...

    @abstractmethod
    def get_meta(self, key: str) -> CachedItemMeta:
        """Return the metadata record for ``key``."""
        ...

    @abstractmethod
    def write(self, key: str, obj, meta=None) -> CachedItemMeta:
        """Store ``obj`` under ``key`` and return the item's metadata record."""
        ...

    @abstractmethod
    def delete(self, key: str, meta_only=False):
        """Delete the stored item for ``key``; ``meta_only`` limits deletion to its metadata."""
        ...
130
+
131
+
132
class CacheCatalog(metaclass=ABCMeta):
    """Abstract index interface mapping cache keys to their metadata records."""

    @abstractmethod
    def contains(self, key: str):
        """Return whether ``key`` is listed in the catalog."""
        ...

    @abstractmethod
    def add(self, key: str, info: CachedItemMeta):
        """Register the metadata record ``info`` under ``key``."""
        ...

    @abstractmethod
    def get(self, key: str) -> CachedItemMeta:
        """Return the metadata record registered under ``key``."""
        ...

    @abstractmethod
    def remove(self, key: str):
        """Drop ``key`` from the catalog."""
        ...

    @abstractmethod
    def keys(self):
        """Return the keys listed in the catalog."""
        ...

    @abstractmethod
    def items(self):
        """Return the (key, metadata) pairs listed in the catalog."""
        ...
156
+
157
+
158
# Mapping from cache key to that item's metadata record.
_CatalogT = dict[str, CachedItemMeta]
# Zero-argument callable that produces a catalog mapping.
_CatalogBuilderT = Callable[[], _CatalogT]
160
+
161
+
162
class CacheDictCatalog(CacheCatalog):
    """Catalog backed by an in-memory dict; keys are coerced with ``str()`` on access."""

    def __init__(
            self,
            catalog: Optional[dict[str, CachedItemMeta]] = None,
            catalog_builder: Optional[_CatalogBuilderT] = None,
    ):
        """Wrap ``catalog`` if given, otherwise build one with ``catalog_builder``.

        When no builder is supplied, ``dict`` is used, so ``rebuild`` yields an
        empty catalog.
        """
        self._catalog = catalog
        self._build_catalog = dict if catalog_builder is None else catalog_builder
        if self._catalog is None:
            self.rebuild()

    def rebuild(self):
        """Replace the catalog with a copy of a freshly built mapping."""
        fresh = self._build_catalog()
        self._catalog = fresh.copy()

    def contains(self, key: str):
        """Return whether ``str(key)`` is in the catalog."""
        name = str(key)
        return name in self._catalog

    def add(self, key: str, meta: CachedItemMeta):
        """Store ``meta`` under ``str(key)``, replacing any existing entry."""
        name = str(key)
        self._catalog[name] = meta

    def get(self, key: str) -> CachedItemMeta:
        """Return the metadata stored under ``str(key)``; raises ``KeyError`` if absent."""
        name = str(key)
        return self._catalog[name]

    def remove(self, key: str):
        """Remove ``str(key)`` from the catalog; raises ``KeyError`` if absent."""
        name = str(key)
        self._catalog.pop(name)

    def keys(self):
        """Return the catalog keys as a new list."""
        return [*self._catalog.keys()]

    def items(self) -> Iterator[tuple[str, CachedItemMeta]]:
        """Return an iterator over (key, metadata) pairs."""
        pairs = self._catalog.items()
        return iter(pairs)
195
+
196
+
197
class Cache(CacheCore, metaclass=ABCMeta):
    """Cache that combines a catalog (key -> metadata index) with a reader/writer backend.

    While the cache is inactive, lookups behave as if it were empty and
    ``put`` only prints a warning. While it is read-only, ``put`` and the
    removal of an existing key raise ``TypeError``.
    """

    def __init__(self, catalog: CacheCatalog, reader_writer: CacheReaderWriter, active=True, read_only=False):
        super().__init__(active=active, read_only=read_only)
        self._catalog = catalog
        self._reader_writer = reader_writer

    @classmethod
    @abstractmethod
    def create(cls, *args, **kwargs):
        """Alternate constructor to be provided by concrete subclasses."""

    def __contains__(self, key: str) -> bool:
        return self._catalog.contains(key) if self.is_active() else False

    def get(self, key: str, default=_NotSpecified):
        """Return the object cached under ``key``.

        If the cache is inactive or the key is absent, return ``default`` when
        one was supplied, otherwise raise ``KeyError``.
        """
        if self.is_active() and key in self:
            return self._reader_writer.read(key)
        if default is _NotSpecified:
            raise_not_found(key)
        return default

    def get_meta(self, key: str) -> CachedItemMeta:
        """Return the catalog's metadata for ``key``; raise ``KeyError`` if unavailable."""
        if not (self.is_active() and key in self):
            raise_not_found(key)
        return self._catalog.get(key)

    def put(self, key: str, obj, *, update=False, meta: Optional[CachedItemMeta] = None) -> Self:
        """Write ``obj`` under ``key`` and register its metadata in the catalog.

        Raises:
            TypeError: if the cache is read-only.
            ValueError: if ``key`` already exists and ``update`` is false.
        """
        if self.is_read_only():
            raise TypeError('Cache is in read only mode')
        if not self.is_active():
            print(f'WARNING: {type(self).__name__} is not active')
            return self
        if key in self and not update:
            raise ValueError(f"key '{key}' already exists in cache; use 'update' to overwrite")
        written_meta = self._reader_writer.write(key, obj, meta=meta)
        self._catalog.add(key, written_meta)
        return self

    def remove(self, key: str, meta_only=False) -> Self:
        """Remove ``key`` from the catalog and delete it from the backend.

        Does nothing when the cache is inactive or the key is absent.

        Raises:
            TypeError: if the key exists and the cache is read-only.
        """
        if not (self.is_active() and key in self):
            return self
        if self.is_read_only():
            raise TypeError('Cache is in read only mode')
        self._catalog.remove(key)
        self._reader_writer.delete(key, meta_only=meta_only)
        return self

    def remove_meta(self, key: str) -> Self:
        """Shorthand for ``remove(key, meta_only=True)``."""
        return self.remove(key, meta_only=True)

    def keys(self) -> list[str]:
        """Return the catalog's keys as a list, or an empty list when inactive."""
        return list(self._catalog.keys()) if self.is_active() else []

    def _repr_params(self) -> list[str]:
        """Return the text fragments shown inside the parentheses of ``repr``."""
        fragments = [f'{len(self.keys())} items']
        if not self.is_active():
            fragments.append('NOT ACTIVE')
        if self.is_read_only():
            fragments.append('READ ONLY')
        return fragments

    def __repr__(self):
        joined = ', '.join(self._repr_params())
        return f"{type(self).__name__}({joined})"
265
+
266
+
267
def check_meta_mismatches(caches: Iterable[CacheCore], key=None) -> dict[str, tuple[CachedItemMeta, ...]]:
    """Find keys whose metadata differs between the given caches.

    Args:
        caches: cache instances to compare. Any iterable is accepted,
            including one-shot iterators such as generators.
        key: a single key to check; when ``None``, every key found in any of
            the caches is checked.

    Returns:
        A dict mapping each mismatching key to a tuple of the metadata records
        from the caches that contain it. A key held by fewer than two caches
        is never reported.

    Raises:
        TypeError: if ``key`` is neither ``None`` nor a string, or if any
            element of ``caches`` is not a ``CacheCore`` instance.
    """
    if key is not None and not isinstance(key, str):
        raise TypeError('key is not a string')
    # ``caches`` is traversed several times below; materialize it once so a
    # one-shot iterator is not exhausted by the validation pass.
    caches = list(caches)
    for cache in caches:
        if not isinstance(cache, CacheCore):
            raise TypeError('caches must be an iterable containing Cache instances')
    if key is None:
        keys = {k for cache in caches for k in cache.keys()}
    else:
        keys = [key]
    mismatches = {}
    for k in keys:
        metas = [cache.get_meta(k) for cache in caches if k in cache]
        if len(metas) > 1 and any(meta != metas[0] for meta in metas[1:]):
            mismatches[k] = tuple(metas)
    return mismatches
285
+
286
+
287
class Serializer(metaclass=ABCMeta):
    """Abstract base for objects that read and write a value at a path.

    ``ext`` holds the file extension associated with the serializer; it falls
    back to the class-level ``default_ext`` when none is passed.
    """
    default_ext = None

    def __init__(self, ext=None):
        self.ext = self.default_ext if ext is None else ext

    @abstractmethod
    def read(self, path):
        """Load and return the object stored at ``path``."""

    @abstractmethod
    def write(self, path, obj):
        """Store ``obj`` at ``path``."""
@@ -0,0 +1,236 @@
1
+ import json
2
+ import os
3
+ import pickle
4
+ from datetime import datetime, UTC
5
+ from functools import partial
6
+ from glob import glob
7
+ from pathlib import Path
8
+ from typing import Optional, Self
9
+
10
+ from yes3.caching.base import Cache, CacheDictCatalog, CachedItemMeta, Serializer, CacheReaderWriter
11
+
12
+
13
class PickleSerializer(Serializer):
    """Serializer that stores objects with ``pickle``."""
    default_ext = 'pkl'

    def read(self, path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE: unpickling can execute arbitrary code; only read files from a
        # trusted source.
        with open(path, 'rb') as stream:
            loaded = pickle.load(stream)
        return loaded

    def write(self, path, obj):
        """Pickle ``obj`` to ``path``, creating the parent directory if needed."""
        parent_dir = os.path.dirname(os.path.abspath(path))
        os.makedirs(parent_dir, exist_ok=True)
        with open(path, 'wb') as stream:
            pickle.dump(obj, stream)
24
+
25
+
26
class JsonSerializer(Serializer):
    """Serializer that stores objects as JSON text files encoded as UTF-8."""
    default_ext = 'json'

    def read(self, path) -> dict:
        """Parse and return the JSON document stored at ``path``."""
        # JSON text is UTF-8 by specification; do not depend on the locale's
        # default encoding, which differs between platforms.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def write(self, path, obj: dict):
        """Write ``obj`` as JSON to ``path``, creating the parent directory if needed."""
        os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
37
+
38
+
39
class JsonMetaSerializer(JsonSerializer):
    """JSON serializer specialized for ``CachedItemMeta`` records."""
    default_ext = 'meta'

    def read(self, path) -> CachedItemMeta:
        """Load the JSON document at ``path`` and build a ``CachedItemMeta`` from its fields."""
        fields = super().read(path)
        return CachedItemMeta(**fields)

    def write(self, path, meta: CachedItemMeta):
        """Write ``meta.to_dict()`` as JSON to ``path``."""
        payload = meta.to_dict()
        super().write(path, payload)
48
+
49
+
50
def _get_serializer(serializer: str | Serializer, ext=None) -> Serializer:
    """Resolve ``serializer`` to a ``Serializer`` instance.

    Accepts a class (instantiated with ``ext``), a file-type name
    (``'pkl'``/``'pickle'`` or ``'json'``, case-insensitive, leading dots
    ignored), or an existing ``Serializer`` instance. When an instance is
    given and ``ext`` is not ``None``, the instance's ``ext`` attribute is
    overwritten in place.

    Raises:
        NotImplementedError: for an unrecognized file-type name.
        TypeError: for any other kind of argument.
    """
    if isinstance(serializer, type):
        serializer = serializer(ext)

    if isinstance(serializer, str):
        kind = serializer.lstrip('.').lower()
        if kind in {'pkl', 'pickle'}:
            return PickleSerializer(ext)
        if kind == 'json':
            return JsonSerializer(ext)
        raise NotImplementedError(f"Serializer not implemented for file type '{serializer}'")

    if isinstance(serializer, Serializer):
        if ext is not None:
            serializer.ext = ext
        return serializer

    raise TypeError(
        f'file_type must be a string or a Serializer subclass, but got type {type(serializer).__name__}')
68
+
69
+
70
def _with_ext(path, ext: Optional[str]):
    """Return ``path`` with the extension ``ext`` appended unless it already ends with it.

    ``ext`` may be given with or without its leading dot; ``None`` leaves
    ``path`` untouched. The result is built with ``type(path)`` where that
    works, falling back to ``Path`` otherwise.
    """
    if ext is None:
        return path
    suffix = ext if ext.startswith('.') else f'.{ext}'
    text = str(path)
    if text.endswith(suffix):
        return path
    extended = text + suffix
    try:
        return type(path)(extended)
    except (ValueError, TypeError):
        return Path(extended)
83
+
84
+
85
class LocalReaderWriter(CacheReaderWriter):
    """Reader/writer that stores each cached item as files in a local directory.

    For every key there is a data file (written by ``obj_serializer``) and a
    metadata file (written by ``meta_serializer``). Both are named after the
    key plus the respective serializer's extension, directly under ``path``.
    """

    def __init__(
            self, path: str | Path,
            object_serializer: str | Serializer | None = None,
            meta_serializer: str | Serializer | None = None,
    ):
        """Create a reader/writer rooted at ``path``.

        Args:
            path: directory holding the cache files.
            object_serializer: serializer (or file-type name) for cached
                objects; defaults to a new ``PickleSerializer``.
            meta_serializer: serializer (or file-type name) for metadata;
                defaults to a new ``JsonMetaSerializer``.
        """
        # Build the defaults per instance rather than in the signature, so
        # separate reader/writers never share (and cannot mutate) one
        # serializer object.
        if object_serializer is None:
            object_serializer = PickleSerializer()
        if meta_serializer is None:
            meta_serializer = JsonMetaSerializer()
        self.path = Path(path)
        self.obj_serializer = _get_serializer(object_serializer)
        self.meta_serializer = _get_serializer(meta_serializer)

    def clone(self, path: str | Path) -> Self:
        """Return a new reader/writer for ``path`` that reuses this one's serializers."""
        return type(self)(path, object_serializer=self.obj_serializer, meta_serializer=self.meta_serializer)

    def key2path(self, key: str, meta=False) -> Path:
        """Return the file path for ``key``: the metadata file if ``meta`` is true, else the data file."""
        serializer = self.meta_serializer if meta else self.obj_serializer
        return self.path / _with_ext(key, serializer.ext)

    def path2key(self, path: str | Path) -> str:
        """Return the key for a file under ``self.path`` (its name without the final suffix)."""
        rel_path = Path(path).relative_to(self.path)
        return rel_path.stem

    def read(self, key: str):
        """Print a notice and return the object deserialized from the data file of ``key``."""
        path = self.key2path(key)
        print(f"Reading cached item '{key}' at {path}")
        return self.obj_serializer.read(path)

    def _build_meta(self, path, key=None) -> CachedItemMeta:
        """Build a metadata record from the file at ``path`` using ``os.stat``.

        ``key`` defaults to the key derived from ``path``. The recorded path is
        relative to ``self.path`` and the timestamp is the file's modification
        time in UTC.
        """
        # Accept str as well as Path, like ``path2key`` does.
        path = Path(path)
        if key is None:
            key = self.path2key(path)
        file_stat = os.stat(path)
        rel_path = path.relative_to(self.path)
        return CachedItemMeta(
            key=key,
            path=str(rel_path),
            size=file_stat.st_size,
            timestamp=datetime.fromtimestamp(file_stat.st_mtime, UTC),
        )

    def get_meta(self, key: str, rebuild=False) -> CachedItemMeta:
        """Return the metadata for ``key``.

        With ``rebuild`` true, the metadata is regenerated from the data file
        and written to the metadata file; otherwise it is read from the
        existing metadata file.
        """
        meta_path = self.key2path(key, meta=True)
        if rebuild:
            meta = self._build_meta(path=self.key2path(key), key=key)
            self.meta_serializer.write(meta_path, meta)
        else:
            meta = self.meta_serializer.read(meta_path)
        return meta

    def write(self, key: str, obj, meta: Optional[CachedItemMeta] = None) -> CachedItemMeta:
        """Write ``obj`` and its metadata for ``key`` and return the metadata.

        When ``meta`` is ``None`` it is derived from the freshly written data
        file.
        """
        obj_path = self.key2path(key)
        print(f"Caching item '{key}' at {obj_path}")
        self.obj_serializer.write(obj_path, obj)

        meta_path = self.key2path(key, meta=True)
        if meta is None:
            meta = self._build_meta(path=obj_path, key=key)
        self.meta_serializer.write(meta_path, meta)
        return meta

    def delete(self, key: str, meta_only=False):
        """Delete the files for ``key``.

        The metadata file is always removed; the data file is removed too
        unless ``meta_only`` is true.
        """
        path = self.key2path(key)
        meta_path = self.key2path(key, meta=True)
        if meta_only:
            print(f"Deleting cached item '{key}' metadata at {meta_path}")
        else:
            print(f"Deleting cached item '{key}' at {path}")
            os.remove(path)
        os.remove(meta_path)
157
+
158
+
159
class LocalDiskCache(Cache):
    """Cache whose items live as files in a local directory.

    The catalog is built by scanning the directory for data and metadata
    files whose names differ only by extension.
    """

    @staticmethod
    def _build_catalog_dict(reader_writer: LocalReaderWriter, rebuild_missing_meta=False) -> dict:
        """Scan the cache directory and return a dict mapping each key to its metadata.

        Args:
            reader_writer: backend whose ``path`` and serializer extensions
                define which files are scanned.
            rebuild_missing_meta: when data and metadata files do not line up,
                regenerate metadata for data files lacking it instead of
                raising.

        Raises:
            RuntimeError: if the data and metadata file sets differ and
                ``rebuild_missing_meta`` is false.
        """
        # ``escape`` is imported locally: the module imports only ``glob`` itself.
        from glob import escape as glob_escape

        catalog_dict = {}
        if os.path.exists(reader_writer.path):
            data_ext = reader_writer.obj_serializer.ext.lstrip('.')
            meta_ext = reader_writer.meta_serializer.ext.lstrip('.')
            # Escape the directory so glob metacharacters in its name (e.g.
            # '[' or '*') are matched literally rather than as a pattern.
            base_dir = glob_escape(str(reader_writer.path))
            data_files = glob(os.path.join(base_dir, f'*.{data_ext}'))
            meta_files = glob(os.path.join(base_dir, f'*.{meta_ext}'))
            data_map = {Path(p).stem: p for p in data_files}
            meta_map = {Path(p).stem: p for p in meta_files}
            if data_map.keys() != meta_map.keys():
                if rebuild_missing_meta:
                    print(f'WARNING: data and metadata files are not aligned for cache at {reader_writer.path}, '
                          'rebuilding missing metadata files')
                else:
                    raise RuntimeError(f'data and metadata files are not aligned for cache at {reader_writer.path}')
            for key in data_map:
                catalog_dict[key] = reader_writer.get_meta(key, rebuild=(key not in meta_map and rebuild_missing_meta))
            if len(catalog_dict.keys()) > 0:
                print(f'{len(catalog_dict.keys())} cached items discovered at {reader_writer.path}')
        return catalog_dict

    @classmethod
    def create(
            cls,
            path: str | Path,
            obj_serializer: str | Serializer | None = None,
            meta_serializer: str | Serializer | None = None,
            reader_writer: Optional[CacheReaderWriter] = None,
            rebuild_missing_meta=False,
            **kwargs,
    ):
        """Build a cache for the directory ``path``.

        Args:
            path: cache directory.
            obj_serializer: serializer (or file-type name) for objects;
                defaults to a new ``PickleSerializer``. Used only when
                ``reader_writer`` is not given.
            meta_serializer: serializer (or file-type name) for metadata;
                defaults to a new ``JsonMetaSerializer``. Used only when
                ``reader_writer`` is not given.
            reader_writer: existing ``LocalReaderWriter`` to reuse; it is
                cloned when it points at a different directory.
            rebuild_missing_meta: forwarded to the catalog builder.
            **kwargs: forwarded to the class constructor.

        Raises:
            TypeError: if ``reader_writer`` is given but is not a
                ``LocalReaderWriter``.
        """
        if reader_writer is None:
            # Fresh default serializers per cache; never share one instance
            # created at definition time.
            if obj_serializer is None:
                obj_serializer = PickleSerializer()
            if meta_serializer is None:
                meta_serializer = JsonMetaSerializer()
            reader_writer = LocalReaderWriter(path, obj_serializer, meta_serializer)
        elif not isinstance(reader_writer, LocalReaderWriter):
            raise TypeError(f'`reader_writer` must be a {LocalReaderWriter.__name__} instance')
        elif reader_writer.path != Path(path):
            # Compare as Path objects: a Path never equals a plain str, which
            # would force a needless clone whenever ``path`` is a string.
            reader_writer = reader_writer.clone(path)
        catalog_builder = partial(cls._build_catalog_dict, reader_writer=reader_writer,
                                  rebuild_missing_meta=rebuild_missing_meta)
        catalog = CacheDictCatalog(catalog_builder=catalog_builder)
        return cls(catalog, reader_writer, **kwargs)

    @property
    def path(self) -> Path:
        """Directory used by the underlying reader/writer."""
        return self._reader_writer.path

    def subcache(self, rel_path: str) -> Self:
        """Return a cache for the sub-directory ``rel_path``, keeping the serializers and flags."""
        path = self.path / rel_path
        kwargs = dict(active=self.is_active(), read_only=self.is_read_only())
        return type(self).create(path, reader_writer=self._reader_writer, **kwargs)

    def clear(self, force=False) -> Self:
        """Remove every item from the cache, then re-scan the directory.

        Does nothing when the cache is inactive or empty.

        Raises:
            RuntimeError: if there are items to delete and ``force`` is false.
            TypeError: if the cache is read-only (raised by ``remove``).
        """
        if self.is_active() and len(self.keys()) > 0:
            if not force:
                raise RuntimeError(f'Clearing this {type(self).__name__} ({self.path}) requires specifying force=True')
            print(f'Deleting {len(self.keys())} item(s) from cache at {self.path}')
            for key in self.keys():
                self.remove(key)
            # Re-create the catalog from disk and adopt it, keeping the flags.
            new_cache = type(self).create(self.path, reader_writer=self._reader_writer)
            self.__init__(new_cache._catalog, new_cache._reader_writer, active=self._active, read_only=self._read_only)
        return self

    def clear_meta(self, force=False) -> Self:
        """Remove every item's metadata (via ``remove(key, meta_only=True)``).

        Does nothing when the cache is inactive or empty.

        Raises:
            RuntimeError: if there are items and ``force`` is false.
            TypeError: if the cache is read-only (raised by ``remove``).
        """
        if self.is_active() and len(self.keys()) > 0:
            if not force:
                raise RuntimeError(f'Clearing this {type(self).__name__} metadata ({self.path}) requires specifying '
                                   'force=True')
            print(f'Deleting {len(self.keys())} item(s) from cache at {self.path}')
            for key in self.keys():
                self.remove(key, meta_only=True)
        return self

    def _repr_params(self) -> list[str]:
        """Prefix the base class's repr fragments with the cache directory."""
        params = super()._repr_params()
        params.insert(0, str(self.path))
        return params
@@ -0,0 +1,74 @@
1
+ from datetime import datetime, UTC
2
+ from typing import Any, Optional, Self
3
+
4
+ from yes3.caching.base import CacheCore, CachedItemMeta, raise_not_found, _NotSpecified
5
+
6
+
7
class MemoryCache(CacheCore):
    """Cache that keeps objects and their metadata in in-process dictionaries.

    While inactive, lookups behave as if the cache were empty and mutating
    calls only print a warning. While read-only, ``put``, the removal of an
    existing key, and clearing a non-empty cache raise ``TypeError``.
    """

    def __init__(self, active=True, read_only=False):
        super().__init__(active=active, read_only=read_only)
        self._data: dict[str, Any] = {}
        self._meta: dict[str, CachedItemMeta] = {}

    def __contains__(self, key: str):
        """Return whether ``key`` is stored; always ``False`` while inactive.

        Raises:
            RuntimeError: if data exists for ``key`` but its metadata is missing.
        """
        if not self.is_active():
            return False
        found = key in self._data
        if found and key not in self._meta:
            raise RuntimeError(f"data exists, but no metadata found, for key '{key}' in {type(self).__name__}")
        return found

    def get(self, key: str, default=_NotSpecified):
        """Return the object stored under ``key``.

        If the cache is inactive or the key is absent, return ``default`` when
        one was supplied, otherwise raise ``KeyError``.
        """
        if not self.is_active() or key not in self:
            if default is _NotSpecified:
                raise_not_found(key)
            else:
                return default
        return self._data[key]

    def get_meta(self, key: str) -> CachedItemMeta:
        """Return the metadata for ``key``; raise ``KeyError`` if unavailable."""
        if not self.is_active() or key not in self:
            raise_not_found(key)
        return self._meta[key]

    def put(self, key: str, obj, *, update=False, meta: Optional[CachedItemMeta] = None) -> Self:
        """Store ``obj`` under ``key``.

        When ``meta`` is ``None``, a record with the current UTC time and no
        path or size is created.

        Raises:
            TypeError: if the cache is read-only.
            ValueError: if ``key`` already exists and ``update`` is false.
        """
        if self.is_read_only():
            raise TypeError('Cache is in read only mode')
        if self.is_active():
            if key in self and not update:
                raise ValueError(f"key '{key}' already exists in cache; use 'update' to overwrite")
            if meta is None:
                meta = CachedItemMeta(key=key, timestamp=datetime.now(UTC), path=None, size=None)
            self._meta[key] = meta
            self._data[key] = obj
        else:
            print(f"WARNING: {type(self).__name__} is not active")
        return self

    def remove(self, key: str) -> Self:
        """Remove ``key`` if present; an absent key is ignored.

        Raises:
            TypeError: if the key exists and the cache is read-only.
        """
        if self.is_active():
            if key in self:
                if self.is_read_only():
                    raise TypeError('Cache is in read only mode')
                self._data.pop(key)
                self._meta.pop(key)
        else:
            print(f"WARNING: {type(self).__name__} is not active")
        return self

    def keys(self) -> list[Any]:
        """Return the stored keys as a list, or an empty list while inactive."""
        if not self.is_active():
            return []
        else:
            return list(self._data.keys())

    def clear(self, force=False) -> Self:
        """Drop every stored item.

        Raises:
            RuntimeError: if the cache holds items and ``force`` is false.
            TypeError: if the cache holds items and is read-only.
        """
        if self.is_active():
            if len(self.keys()) > 0:
                if not force:
                    raise RuntimeError(f'Clearing this {type(self).__name__} requires specifying force=True')
                # Honor read-only mode like ``put`` and ``remove`` do; clearing
                # used to bypass it silently.
                if self.is_read_only():
                    raise TypeError('Cache is in read only mode')
            self._data = {}
            self._meta = {}
        else:
            print(f"WARNING: {type(self).__name__} is not active")
        return self