PyperCache 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+ """Disk-backed dictionary that splits its contents across multiple pickle chunk files.
2
+
3
+ A JSON manifest file tracks which chunk holds each key, allowing large datasets
4
+ to be stored and accessed without loading everything into memory at once.
5
+
6
+ Typical usage::
7
+
8
+ # Build from an in-memory dict
9
+ store = ChunkedDictionary.from_dict(data, "/path/to/dir", chunk_size_in_bytes=1_000_000)
10
+
11
+ # Re-open an existing store
12
+ store = ChunkedDictionary.from_disk("/path/to/dir/chunks.manifest")
13
+
14
+ # Use like a regular dict
15
+ store["my_key"] = {"some": "value"}
16
+ value = store["my_key"]
17
+ """
18
+
19
+ import json
20
+ import math
21
+ import os
22
+ import sys
23
+ import threading
24
+ from functools import cached_property as lazy_property
25
+ from pathlib import Path
26
+ from typing import Any, Dict, Generator, Iterator, List, Optional
27
+
28
+ from PyperCache.utils.fs import ensure_dirs_exist
29
+ from PyperCache.utils.serialization import PickleStore
30
+
31
+ # Private sentinel: distinguishes "no default supplied" from None in get().
32
+ _UNSET = object()
33
+
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Helpers
37
+ # ---------------------------------------------------------------------------
38
+
39
def get_size_of_dict(d: dict) -> int:
    """Return the length of *d* serialised with ``json.dumps`` default settings.

    With the default ``ensure_ascii=True`` the output is pure ASCII, so the
    character count equals the UTF-8 byte count.  The default separators
    include spaces, so this slightly over-estimates a truly compact encoding.
    """
    return len(json.dumps(d))


def chunk_dictionary(
    data: dict,
    chunk_size_in_bytes: int,
) -> Generator[dict, None, None]:
    """Split *data* into sub-dictionaries whose estimated size stays under the limit.

    Item sizes are estimated as ``sys.getsizeof(key) + get_size_of_dict(value)``.
    The limit is a soft bound: a single item larger than *chunk_size_in_bytes*
    still gets a chunk of its own.

    Args:
        data: Source mapping; values must be JSON-serialisable.
        chunk_size_in_bytes: Soft upper bound on each chunk's estimated size.

    Yields:
        Non-empty sub-dictionaries that together cover all of *data*; a single
        empty dict is yielded when *data* itself is empty, so callers always
        receive at least one mapping.
    """
    chunk: dict = {}
    total_size: int = 0

    for key, value in data.items():
        item_size = sys.getsizeof(key) + get_size_of_dict(value)

        # Flush the current chunk before it would overflow.  The ``chunk``
        # guard fixes a bug where an empty chunk was yielded whenever a
        # single item exceeded the limit (e.g. the very first item).
        if chunk and total_size + item_size > chunk_size_in_bytes:
            yield chunk
            chunk = {}
            total_size = 0

        chunk[key] = value
        total_size += item_size

    # Emit the final partial chunk; for empty input still yield one (empty)
    # chunk so the caller sees a consistent "at least one chunk" contract.
    if chunk or not data:
        yield chunk
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Manifest
69
+ # ---------------------------------------------------------------------------
70
+
71
class ChunkedDictionaryManifest:
    """Loads, mutates, and persists the JSON manifest of a :class:`ChunkedDictionary`.

    The manifest records the chunks directory, the number of chunk files, the
    target chunk size, and which chunk file owns each key.
    """

    def __init__(self, manifest_filepath: str) -> None:
        """Read the manifest JSON at *manifest_filepath* into memory."""
        self.lock = threading.Lock()
        self.filepath: Path = Path(manifest_filepath)

        with open(manifest_filepath, "r") as fp:
            raw: dict = json.load(fp)

        # Mapping of key -> chunk filename that stores it.
        self.chunks_map: Dict[str, str] = raw["chunks_map"]
        self.chunk_size_in_bytes: int = raw["chunk_size_in_bytes"]
        self.chunks_path: Path = Path(raw["chunks_path"])
        self.chunks_count: int = raw["chunks_count"]

    def is_chunk_filepath(self, file: str) -> bool:
        """Return True when *file* is a path under the chunks directory."""
        return file.startswith(str(self.chunks_path))

    @staticmethod
    def get_chunk_filename(index: int) -> str:
        """Canonical filename for the chunk at position *index*."""
        return f"{index}-chunk.pkl"

    @staticmethod
    def get_chunk_index_from_filename(filename: str) -> int:
        """Inverse of :meth:`get_chunk_filename`: extract the numeric index."""
        return int(filename.replace("-chunk.pkl", ""))

    def remove_unused_chunks(self) -> None:
        """Delete chunk files whose index lies outside the current chunk count."""
        with self.lock:
            stale = [
                name
                for name in os.listdir(self.chunks_path)
                if name.endswith("-chunk.pkl")
                and ChunkedDictionaryManifest.get_chunk_index_from_filename(name) + 1
                > self.chunks_count
            ]
            for name in stale:
                os.remove(self.chunks_path / name)

    def erase_all_chunks_nonreversable(self) -> None:
        """Delete every pickle file in the chunks directory and reset bookkeeping."""
        with self.lock:
            for name in os.listdir(self.chunks_path):
                if name.endswith(".pkl"):
                    os.remove(self.chunks_path / name)
            self.chunks_map = {}
            self.chunks_count = 0

    def save(self) -> None:
        """Serialise the current in-memory state back to :attr:`filepath`."""
        with self.lock:
            payload = {
                "chunk_size_in_bytes": self.chunk_size_in_bytes,
                "chunks_path": str(self.chunks_path),
                "chunks_count": self.chunks_count,
                "chunks_map": self.chunks_map,
            }
            with open(self.filepath, "w") as fp:
                json.dump(payload, fp, indent=2)
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # ChunkedDictionary
130
+ # ---------------------------------------------------------------------------
131
+
132
class ChunkedDictionary:
    """A disk-backed dictionary whose entries are spread across pickle chunk files.

    Lookups go through the manifest's ``chunks_map`` (key -> chunk filename);
    chunk contents are lazily loaded into :attr:`chunks` and cached there.

    NOTE(review): loaded chunks are never evicted from :attr:`chunks`, so
    memory usage grows with the number of distinct chunks touched.
    """

    def __init__(self, manifest_filepath: str) -> None:
        """Open the store described by the manifest file (must already exist)."""
        self.lock = threading.Lock()
        self.manifest = ChunkedDictionaryManifest(manifest_filepath)
        # Drop chunk files with index >= chunks_count — leftovers from a
        # previous, larger incarnation of this store.
        self.manifest.remove_unused_chunks()
        # Lazy in-memory cache: chunk filename -> loaded chunk dict.
        self.chunks: Dict[str, dict] = {}

    # ------------------------------------------------------------------
    # Class-level constructors
    # ------------------------------------------------------------------

    @classmethod
    def from_dict(
        cls,
        data: dict,
        directory: str | Path,
        chunk_size_in_bytes: int,
    ) -> "ChunkedDictionary":
        """Build a new ChunkedDictionary on disk from an in-memory dict.

        Args:
            data: Source mapping; values must be JSON-serialisable (sizes are
                estimated via ``json.dumps`` inside ``chunk_dictionary``).
            directory: Target directory for both the chunk files and the
                ``chunks.manifest`` file; created if missing.
            chunk_size_in_bytes: Soft upper bound per chunk's estimated size.

        Returns:
            A ChunkedDictionary opened on the freshly written manifest.
        """
        directory = Path(directory)
        ensure_dirs_exist(str(directory))
        manifest_filepath = str(directory / "chunks.manifest")

        chunks_path = directory
        chunks = list(chunk_dictionary(data, chunk_size_in_bytes))

        # Write each chunk to its own pickle file, and record per key which
        # chunk filename owns it.
        chunks_map: Dict[str, str] = {}
        for i, chunk in enumerate(chunks):
            chunk_filename = ChunkedDictionaryManifest.get_chunk_filename(i)
            chunk_filepath = chunks_path / chunk_filename
            PickleStore.save_object(chunk, str(chunk_filepath))
            for key in chunk:
                chunks_map[key] = chunk_filename

        manifest = {
            "chunk_size_in_bytes": chunk_size_in_bytes,
            "chunks_path": str(chunks_path),
            "chunks_count": len(chunks),
            "chunks_map": chunks_map,
        }
        with open(manifest_filepath, "w") as fp:
            json.dump(manifest, fp, indent=2)

        return cls(manifest_filepath)

    @classmethod
    def from_disk(cls, manifest_filepath: str | Path) -> "ChunkedDictionary":
        """Open an existing ChunkedDictionary from its manifest file."""
        return cls(str(manifest_filepath))

    @staticmethod
    def directory_contains_chunked_dictionary(directory: str | Path) -> bool:
        """Return True if *directory* contains a valid chunks.manifest file."""
        # Only checks existence; manifest content is not validated here.
        return (Path(directory) / "chunks.manifest").exists()

    # ------------------------------------------------------------------
    # Bulk access
    # ------------------------------------------------------------------

    def data(self) -> dict:
        """Materialise the whole store as a plain dict (loads every chunk)."""
        return {k: self[k] for k in self.keys()}

    def erase_everything(self) -> None:
        """Irreversibly delete all chunk files, reset and persist the manifest."""
        with self.lock:
            self.chunks = {}
            self.manifest.erase_all_chunks_nonreversable()
            self.manifest.save()

    # ------------------------------------------------------------------
    # dict-like interface
    # ------------------------------------------------------------------

    def __contains__(self, key: str) -> bool:
        return key in self.keys()

    def __len__(self) -> int:
        return len(self.keys())

    def items(self) -> Iterator:
        # Loads the entire store into memory via data().
        return self.data().items()

    def keys(self) -> List[str]:
        return list(self.manifest.chunks_map.keys())

    def get(self, key: str, default_value: Any = _UNSET) -> Any:
        """Return ``self[key]``, or *default_value* when the key is absent.

        Without an explicit default, a missing key raises ``KeyError`` (via
        ``__getitem__``), mirroring ``dict[...]`` rather than ``dict.get``.
        """
        if default_value is not _UNSET and key not in self.keys():
            return default_value
        return self[key]

    def __getitem__(self, key: str) -> Any:
        # Raises KeyError when the key is unknown to the manifest.
        chunk_filename: str = self.manifest.chunks_map[key]
        chunk = self.get_chunk(chunk_filename)
        return chunk[key]

    def __setitem__(self, key: str, value: Any) -> None:
        """Insert or update *key*, persisting the affected chunk and manifest."""
        chunk_filename: Optional[str] = None

        if key in self.manifest.chunks_map:
            # Existing key: overwrite in place inside its current chunk.
            chunk_filename = self.manifest.chunks_map[key]
            chunk = self.get_chunk(chunk_filename)
            with self.lock:
                chunk[key] = value
        else:
            # New key: append to the last chunk, or open a fresh one when the
            # last chunk would overflow (or no chunk exists yet).
            last_chunk_index = self.manifest.chunks_count - 1

            if last_chunk_index == -1:
                last_chunk_index = 0
                last_chunk_filename = self.create_new_chunk()
            else:
                last_chunk_filename = ChunkedDictionaryManifest.get_chunk_filename(last_chunk_index)

            last_chunk = self.get_chunk(last_chunk_filename)
            last_chunk_size = get_size_of_dict(last_chunk)

            # Estimated size check mirrors the one used by chunk_dictionary.
            if last_chunk_size + get_size_of_dict({key: value}) < self.manifest.chunk_size_in_bytes:
                chunk_filename = last_chunk_filename
            else:
                chunk_filename = self.create_new_chunk()

            with self.lock:
                self.manifest.chunks_map[key] = chunk_filename
                self.chunks[chunk_filename][key] = value

        # Both branches assign chunk_filename; persist the chunk file and the
        # manifest after every write.
        assert chunk_filename is not None
        self.save_chunk(chunk_filename)

    # ------------------------------------------------------------------
    # Chunk management
    # ------------------------------------------------------------------

    def create_new_chunk(self) -> str:
        """Create an empty chunk file on disk, register it, return its filename."""
        with self.lock:
            index = self.manifest.chunks_count
            chunk_filename = ChunkedDictionaryManifest.get_chunk_filename(index)
            chunk_filepath = self.manifest.chunks_path / chunk_filename
            PickleStore.save_object({}, str(chunk_filepath))
            self.chunks[chunk_filename] = {}
            self.manifest.chunks_count += 1
            return chunk_filename

    def get_chunk(self, chunk_filename: str) -> dict:
        """Return the chunk dict for *chunk_filename*, loading it on first use."""
        chunk_filename = str(chunk_filename)
        # Guard against absolute chunk paths being passed instead of bare
        # filenames.  NOTE(review): ``assert`` disappears under ``python -O``.
        assert not self.manifest.is_chunk_filepath(chunk_filename)
        if chunk_filename not in self.chunks:
            chunk_filepath = str(self.manifest.chunks_path / chunk_filename)
            self.chunks[chunk_filename] = PickleStore.load_object(chunk_filepath)
        return self.chunks[chunk_filename]

    def save_chunk(self, chunk_filename: str) -> None:
        """Write the cached chunk back to disk and persist the manifest."""
        with self.lock:
            chunk = self.chunks[chunk_filename]
            chunk_filepath = self.manifest.chunks_path / chunk_filename
            PickleStore.save_object(chunk, str(chunk_filepath))
            self.manifest.save()

    def resize_data_chunks(self, chunk_size_in_bytes: int) -> None:
        """Rewrite the whole store on disk using a new chunk size limit."""
        all_data = self.data()
        # NOTE(review): this local is never used afterwards — dead assignment?
        manifest_filepath = str(self.manifest.filepath)
        chunks_path = str(self.manifest.chunks_path)
        self.erase_everything()
        # NOTE(review): this write is immediately shadowed by the manifest
        # swap below — from_dict already records the new chunk size.
        self.manifest.chunk_size_in_bytes = chunk_size_in_bytes
        new = ChunkedDictionary.from_dict(all_data, chunks_path, chunk_size_in_bytes)
        self.manifest = new.manifest
        self.chunks = new.chunks
@@ -0,0 +1,40 @@
1
+ """Factory: map a file extension to the appropriate storage backend class."""
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, Type
5
+
6
+ from PyperCache.storage.base import StorageMechanism
7
+ from PyperCache.storage.backends import ChunkedStorage, JSONStorage, PickleStorage
8
+ from PyperCache.storage.sqlite_storage import SQLiteStorage
9
+
10
+
11
# Maps lowercase file extensions (dot included) to their corresponding
# storage backend class.  Extend this table to register a new backend.
_EXTENSION_TO_STORAGE: Dict[str, Type[StorageMechanism]] = {
    ".manifest": ChunkedStorage,
    ".json": JSONStorage,
    ".pkl": PickleStorage,
    ".db": SQLiteStorage,  # SQLite — zero-cost, stdlib, no server required
}
18
+
19
+
20
def get_storage_mechanism(filepath: str) -> Type[StorageMechanism]:
    """Return the :class:`StorageMechanism` subclass appropriate for *filepath*.

    The backend is chosen purely from the (case-insensitive) file extension.

    Args:
        filepath: Path to the cache store file.

    Returns:
        The matching :class:`StorageMechanism` subclass (*not* an instance).

    Raises:
        ValueError: If no backend supports the given file extension.
    """
    extension = Path(filepath).suffix.lower()
    if (mechanism := _EXTENSION_TO_STORAGE.get(extension)) is not None:
        return mechanism
    raise ValueError(
        f"No storage mechanism found for file extension: {extension!r}"
    )