persidict 0.38.0__py3-none-any.whl → 0.103.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of persidict might be problematic. Click here for more details.
- persidict/__init__.py +41 -24
- persidict/basic_s3_dict.py +595 -0
- persidict/cached_appendonly_dict.py +247 -0
- persidict/cached_mutable_dict.py +248 -0
- persidict/empty_dict.py +171 -0
- persidict/file_dir_dict.py +130 -122
- persidict/local_dict.py +502 -0
- persidict/overlapping_multi_dict.py +23 -15
- persidict/persi_dict.py +281 -148
- persidict/s3_dict_file_dir_cached.py +215 -0
- persidict/{s3_dict.py → s3_dict_legacy.py} +111 -90
- persidict/safe_chars.py +13 -0
- persidict/safe_str_tuple.py +28 -6
- persidict/singletons.py +232 -0
- persidict/write_once_dict.py +47 -30
- {persidict-0.38.0.dist-info → persidict-0.103.0.dist-info}/METADATA +34 -24
- persidict-0.103.0.dist-info/RECORD +19 -0
- {persidict-0.38.0.dist-info → persidict-0.103.0.dist-info}/WHEEL +1 -1
- persidict/.DS_Store +0 -0
- persidict/jokers.py +0 -99
- persidict-0.38.0.dist-info/RECORD +0 -14
persidict/file_dir_dict.py
CHANGED
|
@@ -2,15 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
FileDirDict stores each key-value pair in a separate file under a base
|
|
4
4
|
directory. Keys determine directory structure and filename; values are
|
|
5
|
-
serialized depending on ``
|
|
5
|
+
serialized depending on ``serialization_format``.
|
|
6
6
|
|
|
7
|
-
-
|
|
7
|
+
- serialization_format="pkl" or "json": arbitrary Python objects via pickle/jsonpickle.
|
|
8
8
|
- any other value: strings are stored as plain text.
|
|
9
9
|
"""
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
12
|
import os
|
|
13
|
-
import pathlib
|
|
14
13
|
import random
|
|
15
14
|
import tempfile
|
|
16
15
|
import time
|
|
@@ -23,12 +22,10 @@ import jsonpickle.ext.pandas as jsonpickle_pandas
|
|
|
23
22
|
import parameterizable
|
|
24
23
|
from parameterizable import sort_dict_by_keys
|
|
25
24
|
|
|
26
|
-
from .
|
|
27
|
-
from .
|
|
28
|
-
from .safe_str_tuple import SafeStrTuple
|
|
25
|
+
from .singletons import Joker, EXECUTION_IS_COMPLETE
|
|
26
|
+
from .safe_str_tuple import SafeStrTuple, NonEmptySafeStrTuple
|
|
29
27
|
from .safe_str_tuple_signing import sign_safe_str_tuple, unsign_safe_str_tuple
|
|
30
|
-
from .persi_dict import PersiDict, PersiDictKey,
|
|
31
|
-
|
|
28
|
+
from .persi_dict import PersiDict, PersiDictKey, NonEmptyPersiDictKey
|
|
32
29
|
|
|
33
30
|
if os.name == 'nt':
|
|
34
31
|
import msvcrt
|
|
@@ -52,30 +49,42 @@ if os.name == 'nt':
|
|
|
52
49
|
|
|
53
50
|
def add_long_path_prefix(path: str) -> str:
|
|
54
51
|
"""Add the '\\\\?\\' prefix to a path on Windows to support long paths.
|
|
52
|
+
|
|
53
|
+
Handles both regular paths and UNC paths correctly.
|
|
55
54
|
|
|
56
55
|
Args:
|
|
57
56
|
path (str): The original file or directory path.
|
|
58
57
|
|
|
59
58
|
Returns:
|
|
60
|
-
str: The modified path with the
|
|
59
|
+
str: The modified path with the appropriate prefix if on Windows
|
|
61
60
|
and not already present; otherwise, the original path.
|
|
61
|
+
UNC paths get '\\\\?\\UNC\\' prefix, regular paths get '\\\\?\\'.
|
|
62
62
|
"""
|
|
63
|
-
if
|
|
64
|
-
return f'\\\\?\\{path}'
|
|
65
|
-
else:
|
|
63
|
+
if path.startswith('\\\\?\\'):
|
|
66
64
|
return path
|
|
65
|
+
elif path.startswith('\\\\'):
|
|
66
|
+
# UNC path: \\server\share -> \\?\UNC\server\share
|
|
67
|
+
return f'\\\\?\\UNC\\{path[2:]}'
|
|
68
|
+
else:
|
|
69
|
+
return f'\\\\?\\{path}'
|
|
67
70
|
|
|
68
71
|
def drop_long_path_prefix(path: str) -> str:
|
|
69
72
|
"""Remove the '\\\\?\\' prefix from a path on Windows if present.
|
|
73
|
+
|
|
74
|
+
Handles both regular paths and UNC paths correctly.
|
|
70
75
|
|
|
71
76
|
Args:
|
|
72
77
|
path (str): The file or directory path, possibly with the '\\\\?\\' prefix.
|
|
73
78
|
|
|
74
79
|
Returns:
|
|
75
80
|
str: The path without the '\\\\?\\' prefix if it was present; otherwise,
|
|
76
|
-
the original path.
|
|
81
|
+
the original path. UNC paths are converted back from '\\\\?\\UNC\\'
|
|
82
|
+
format to '\\\\' format.
|
|
77
83
|
"""
|
|
78
|
-
if path.startswith('\\\\?\\'):
|
|
84
|
+
if path.startswith('\\\\?\\UNC\\'):
|
|
85
|
+
# UNC path: \\?\UNC\server\share -> \\server\share
|
|
86
|
+
return f'\\\\{path[8:]}'
|
|
87
|
+
elif path.startswith('\\\\?\\'):
|
|
79
88
|
return path[4:]
|
|
80
89
|
else:
|
|
81
90
|
return path
|
|
@@ -106,50 +115,45 @@ class FileDirDict(PersiDict):
|
|
|
106
115
|
"""
|
|
107
116
|
|
|
108
117
|
_base_dir:str
|
|
109
|
-
|
|
118
|
+
digest_len:int
|
|
110
119
|
|
|
111
120
|
def __init__(self
|
|
112
121
|
, base_dir: str = FILEDIRDICT_DEFAULT_BASE_DIR
|
|
113
|
-
,
|
|
114
|
-
,
|
|
115
|
-
, digest_len:int =
|
|
122
|
+
, serialization_format: str = "pkl"
|
|
123
|
+
, append_only:bool = False
|
|
124
|
+
, digest_len:int = 1
|
|
116
125
|
, base_class_for_values: Optional[type] = None):
|
|
117
126
|
"""Initialize a filesystem-backed persistent dictionary.
|
|
118
127
|
|
|
119
128
|
Args:
|
|
120
129
|
base_dir (str): Base directory where all files are stored. Created
|
|
121
130
|
if it does not exist.
|
|
122
|
-
|
|
131
|
+
serialization_format (str): File extension/format to use for stored values.
|
|
123
132
|
- "pkl" or "json": arbitrary Python objects are supported.
|
|
124
133
|
- any other value: only strings are supported and stored as text.
|
|
125
|
-
|
|
134
|
+
append_only (bool): If True, existing items cannot be modified
|
|
126
135
|
or deleted.
|
|
127
136
|
digest_len (int): Length of a hash suffix appended to each key path
|
|
128
137
|
element to avoid case-insensitive collisions. Use 0 to disable.
|
|
129
138
|
If you decide to enable it (not 0), we recommend at least 4.
|
|
130
139
|
base_class_for_values (Optional[type]): Optional base class that all
|
|
131
140
|
stored values must be instances of. If provided and not ``str``,
|
|
132
|
-
then
|
|
141
|
+
then serialization_format must be either "pkl" or "json".
|
|
133
142
|
|
|
134
143
|
Raises:
|
|
135
|
-
ValueError: If
|
|
144
|
+
ValueError: If serialization_format contains unsafe characters; or
|
|
136
145
|
if configuration is inconsistent (e.g., non-str values
|
|
137
|
-
with unsupported
|
|
146
|
+
with unsupported serialization_format).
|
|
138
147
|
RuntimeError: If base_dir cannot be created or is not a directory.
|
|
139
148
|
"""
|
|
140
149
|
|
|
141
|
-
super().__init__(
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
if file_type != replace_unsafe_chars(file_type, ""):
|
|
146
|
-
raise ValueError("file_type contains unsafe characters")
|
|
147
|
-
self.file_type = file_type
|
|
150
|
+
super().__init__(append_only=append_only,
|
|
151
|
+
base_class_for_values=base_class_for_values,
|
|
152
|
+
serialization_format=serialization_format)
|
|
148
153
|
|
|
149
|
-
if
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
raise ValueError("For non-string values file_type must be either 'pkl' or 'json'.")
|
|
154
|
+
if digest_len < 0:
|
|
155
|
+
raise ValueError("digest_len must be non-negative")
|
|
156
|
+
self.digest_len = digest_len
|
|
153
157
|
|
|
154
158
|
base_dir = str(base_dir)
|
|
155
159
|
self._base_dir = os.path.abspath(base_dir)
|
|
@@ -170,32 +174,20 @@ class FileDirDict(PersiDict):
|
|
|
170
174
|
in the standard dict API.
|
|
171
175
|
|
|
172
176
|
Returns:
|
|
173
|
-
dict: A mapping of parameter names to values including base_dir
|
|
174
|
-
|
|
177
|
+
dict: A mapping of parameter names to values including base_dir
|
|
178
|
+
merged with the base PersiDict parameters.
|
|
175
179
|
"""
|
|
176
180
|
params = PersiDict.get_params(self)
|
|
177
181
|
additional_params = dict(
|
|
178
|
-
base_dir=self.base_dir
|
|
179
|
-
|
|
182
|
+
base_dir=self.base_dir,
|
|
183
|
+
digest_len=self.digest_len)
|
|
180
184
|
params.update(additional_params)
|
|
181
185
|
sorted_params = sort_dict_by_keys(params)
|
|
182
186
|
return sorted_params
|
|
183
187
|
|
|
184
188
|
|
|
185
189
|
@property
|
|
186
|
-
def
|
|
187
|
-
"""Return dictionary's URL.
|
|
188
|
-
|
|
189
|
-
This property is absent in the original dict API.
|
|
190
|
-
|
|
191
|
-
Returns:
|
|
192
|
-
str: URL of the underlying storage in the form "file://<abs_path>".
|
|
193
|
-
"""
|
|
194
|
-
return pathlib.Path(self._base_dir).resolve().as_uri()
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
@property
|
|
198
|
-
def base_dir(self) -> str:
|
|
190
|
+
def base_dir(self) -> str|None:
|
|
199
191
|
"""Return dictionary's base directory.
|
|
200
192
|
|
|
201
193
|
This property is absent in the original dict API.
|
|
@@ -220,7 +212,7 @@ class FileDirDict(PersiDict):
|
|
|
220
212
|
code paths.
|
|
221
213
|
"""
|
|
222
214
|
|
|
223
|
-
suffix = "." + self.
|
|
215
|
+
suffix = "." + self.serialization_format
|
|
224
216
|
return sum(1 for _, _, files in os.walk(self._base_dir)
|
|
225
217
|
for f in files if f.endswith(suffix))
|
|
226
218
|
|
|
@@ -229,21 +221,24 @@ class FileDirDict(PersiDict):
|
|
|
229
221
|
"""Remove all elements from the dictionary.
|
|
230
222
|
|
|
231
223
|
Raises:
|
|
232
|
-
KeyError: If
|
|
224
|
+
KeyError: If append_only is True.
|
|
233
225
|
"""
|
|
234
226
|
|
|
235
|
-
if self.
|
|
227
|
+
if self.append_only:
|
|
236
228
|
raise KeyError("Can't clear a dict that contains immutable items")
|
|
237
229
|
|
|
238
230
|
# we can't use shutil.rmtree() because
|
|
239
231
|
# there may be overlapping dictionaries
|
|
240
|
-
# with different
|
|
232
|
+
# with different serialization_format-s
|
|
241
233
|
for subdir_info in os.walk(self._base_dir, topdown=False):
|
|
242
234
|
(subdir_name, _, files) = subdir_info
|
|
243
|
-
suffix = "." + self.
|
|
235
|
+
suffix = "." + self.serialization_format
|
|
244
236
|
for f in files:
|
|
245
237
|
if f.endswith(suffix):
|
|
246
|
-
|
|
238
|
+
try:
|
|
239
|
+
os.remove(os.path.join(subdir_name, f))
|
|
240
|
+
except OSError:
|
|
241
|
+
continue
|
|
247
242
|
if (subdir_name != self._base_dir) and (
|
|
248
243
|
len(os.listdir(subdir_name)) == 0 ):
|
|
249
244
|
try:
|
|
@@ -262,7 +257,7 @@ class FileDirDict(PersiDict):
|
|
|
262
257
|
|
|
263
258
|
Transforms a SafeStrTuple into either a directory path or a file path
|
|
264
259
|
inside this dictionary's base directory. When is_file_path is True, the
|
|
265
|
-
final component is treated as a filename with the configured
|
|
260
|
+
final component is treated as a filename with the configured serialization_format
|
|
266
261
|
extension. When create_subdirs is True, missing intermediate directories
|
|
267
262
|
are created.
|
|
268
263
|
|
|
@@ -272,7 +267,7 @@ class FileDirDict(PersiDict):
|
|
|
272
267
|
create_subdirs (bool): If True, create any missing intermediate
|
|
273
268
|
directories.
|
|
274
269
|
is_file_path (bool): If True, return a file path ending with
|
|
275
|
-
".{
|
|
270
|
+
".{serialization_format}"; otherwise return just the directory path for
|
|
276
271
|
the key prefix.
|
|
277
272
|
|
|
278
273
|
Returns:
|
|
@@ -293,7 +288,7 @@ class FileDirDict(PersiDict):
|
|
|
293
288
|
os.makedirs(path_for_makedirs, exist_ok=True)
|
|
294
289
|
|
|
295
290
|
if is_file_path:
|
|
296
|
-
file_name = key_components[-1] + "." + self.
|
|
291
|
+
file_name = key_components[-1] + "." + self.serialization_format
|
|
297
292
|
final_path = os.path.join(dir_path, file_name)
|
|
298
293
|
else:
|
|
299
294
|
final_path = dir_path
|
|
@@ -305,7 +300,7 @@ class FileDirDict(PersiDict):
|
|
|
305
300
|
"""Convert an absolute filesystem path back into a SafeStrTuple key.
|
|
306
301
|
|
|
307
302
|
This function reverses _build_full_path, stripping base_dir, removing the
|
|
308
|
-
|
|
303
|
+
serialization_format extension if the path points to a file, and unsigning the key
|
|
309
304
|
components according to digest_len.
|
|
310
305
|
|
|
311
306
|
Args:
|
|
@@ -321,7 +316,8 @@ class FileDirDict(PersiDict):
|
|
|
321
316
|
|
|
322
317
|
# Remove the base directory from the path
|
|
323
318
|
if not full_path.startswith(self._base_dir):
|
|
324
|
-
raise ValueError(f"Path {full_path} is not
|
|
319
|
+
raise ValueError(f"Path {full_path} is not "
|
|
320
|
+
f"within base directory {self._base_dir}")
|
|
325
321
|
|
|
326
322
|
# Get the relative path
|
|
327
323
|
rel_path = os.path.relpath(
|
|
@@ -336,7 +332,7 @@ class FileDirDict(PersiDict):
|
|
|
336
332
|
path_components = rel_path.split(os.sep)
|
|
337
333
|
|
|
338
334
|
# If it's a file path, remove the file extension from the last component
|
|
339
|
-
suffix = "." + self.
|
|
335
|
+
suffix = "." + self.serialization_format
|
|
340
336
|
if path_components[-1].endswith(suffix):
|
|
341
337
|
path_components[-1] = path_components[-1][:-len(suffix)]
|
|
342
338
|
|
|
@@ -353,6 +349,7 @@ class FileDirDict(PersiDict):
|
|
|
353
349
|
"""Get a subdictionary containing items with the same prefix key.
|
|
354
350
|
|
|
355
351
|
For non-existing prefix key, an empty sub-dictionary is returned.
|
|
352
|
+
If the prefix is empty, the entire dictionary is returned.
|
|
356
353
|
This method is absent in the original dict API.
|
|
357
354
|
|
|
358
355
|
Args:
|
|
@@ -370,8 +367,8 @@ class FileDirDict(PersiDict):
|
|
|
370
367
|
is_file_path = False)
|
|
371
368
|
return FileDirDict(
|
|
372
369
|
base_dir= full_dir_path
|
|
373
|
-
,
|
|
374
|
-
,
|
|
370
|
+
, serialization_format=self.serialization_format
|
|
371
|
+
, append_only= self.append_only
|
|
375
372
|
, digest_len=self.digest_len
|
|
376
373
|
, base_class_for_values=self.base_class_for_values)
|
|
377
374
|
|
|
@@ -383,9 +380,9 @@ class FileDirDict(PersiDict):
|
|
|
383
380
|
file_name (str): Absolute path to the file to read.
|
|
384
381
|
|
|
385
382
|
Returns:
|
|
386
|
-
Any: The deserialized value according to
|
|
383
|
+
Any: The deserialized value according to serialization_format.
|
|
387
384
|
"""
|
|
388
|
-
file_open_mode = 'rb' if self.
|
|
385
|
+
file_open_mode = 'rb' if self.serialization_format == "pkl" else 'r'
|
|
389
386
|
if os.name == 'nt':
|
|
390
387
|
handle = CreateFileW(file_name, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_DELETE | FILE_SHARE_WRITE, None, OPEN_EXISTING, 0, None)
|
|
391
388
|
if int(handle) == INVALID_HANDLE_VALUE:
|
|
@@ -394,27 +391,27 @@ class FileDirDict(PersiDict):
|
|
|
394
391
|
|
|
395
392
|
fd = None
|
|
396
393
|
try:
|
|
397
|
-
if self.
|
|
394
|
+
if self.serialization_format == "pkl":
|
|
398
395
|
fd_open_mode = os.O_RDONLY | os.O_BINARY
|
|
399
396
|
else:
|
|
400
397
|
fd_open_mode = os.O_RDONLY
|
|
401
398
|
fd = msvcrt.open_osfhandle(int(handle),fd_open_mode)
|
|
402
|
-
except
|
|
399
|
+
except:
|
|
403
400
|
CloseHandle(handle)
|
|
404
401
|
raise
|
|
405
402
|
|
|
406
403
|
try:
|
|
407
404
|
f = os.fdopen(fd, file_open_mode)
|
|
408
405
|
fd = None
|
|
409
|
-
except
|
|
406
|
+
except:
|
|
410
407
|
if fd is not None:
|
|
411
408
|
os.close(fd)
|
|
412
409
|
raise
|
|
413
410
|
|
|
414
411
|
with f:
|
|
415
|
-
if self.
|
|
412
|
+
if self.serialization_format == "pkl":
|
|
416
413
|
result = joblib.load(f)
|
|
417
|
-
elif self.
|
|
414
|
+
elif self.serialization_format == "json":
|
|
418
415
|
result = jsonpickle.loads(f.read())
|
|
419
416
|
else:
|
|
420
417
|
result = f.read()
|
|
@@ -422,9 +419,9 @@ class FileDirDict(PersiDict):
|
|
|
422
419
|
return result
|
|
423
420
|
else:
|
|
424
421
|
with open(file_name, file_open_mode) as f:
|
|
425
|
-
if self.
|
|
422
|
+
if self.serialization_format == "pkl":
|
|
426
423
|
result = joblib.load(f)
|
|
427
|
-
elif self.
|
|
424
|
+
elif self.serialization_format == "json":
|
|
428
425
|
result = jsonpickle.loads(f.read())
|
|
429
426
|
else:
|
|
430
427
|
result = f.read()
|
|
@@ -434,7 +431,7 @@ class FileDirDict(PersiDict):
|
|
|
434
431
|
def _read_from_file(self,file_name:str) -> Any:
|
|
435
432
|
"""Read a value from a file with retry/backoff for concurrency.
|
|
436
433
|
|
|
437
|
-
Validates that the configured
|
|
434
|
+
Validates that the configured serialization_format is compatible with the allowed
|
|
438
435
|
value types, then attempts to read the file using an exponential backoff
|
|
439
436
|
to better tolerate concurrent writers.
|
|
440
437
|
|
|
@@ -442,17 +439,17 @@ class FileDirDict(PersiDict):
|
|
|
442
439
|
file_name (str): Absolute path of the file to read.
|
|
443
440
|
|
|
444
441
|
Returns:
|
|
445
|
-
Any: The deserialized value according to
|
|
442
|
+
Any: The deserialized value according to serialization_format.
|
|
446
443
|
|
|
447
444
|
Raises:
|
|
448
|
-
ValueError: If
|
|
445
|
+
ValueError: If serialization_format is incompatible with non-string values.
|
|
449
446
|
Exception: Propagates the last exception if all retries fail.
|
|
450
447
|
"""
|
|
451
448
|
|
|
452
|
-
if not (self.
|
|
449
|
+
if not (self.serialization_format in {"pkl", "json"} or issubclass(
|
|
453
450
|
self.base_class_for_values, str)):
|
|
454
451
|
raise ValueError("When base_class_for_values is not str,"
|
|
455
|
-
+ "
|
|
452
|
+
+ " serialization_format must be pkl or json.")
|
|
456
453
|
|
|
457
454
|
n_retries = 12
|
|
458
455
|
# extra protections to better handle concurrent writes
|
|
@@ -461,7 +458,7 @@ class FileDirDict(PersiDict):
|
|
|
461
458
|
return self._read_from_file_impl(file_name)
|
|
462
459
|
except Exception as e:
|
|
463
460
|
if i < n_retries - 1:
|
|
464
|
-
time.sleep(random.uniform(0.01, 0.2) * (1.
|
|
461
|
+
time.sleep(random.uniform(0.01, 0.2) * (1.75 ** i))
|
|
465
462
|
else:
|
|
466
463
|
raise e
|
|
467
464
|
|
|
@@ -482,12 +479,12 @@ class FileDirDict(PersiDict):
|
|
|
482
479
|
fd, temp_path = tempfile.mkstemp(dir=dir_name, prefix=".__tmp__")
|
|
483
480
|
|
|
484
481
|
try:
|
|
485
|
-
if self.
|
|
482
|
+
if self.serialization_format == "pkl":
|
|
486
483
|
with open(fd, 'wb') as f:
|
|
487
484
|
joblib.dump(value, f, compress='lz4')
|
|
488
485
|
f.flush()
|
|
489
486
|
os.fsync(f.fileno())
|
|
490
|
-
elif self.
|
|
487
|
+
elif self.serialization_format == "json":
|
|
491
488
|
with open(fd, 'w') as f:
|
|
492
489
|
f.write(jsonpickle.dumps(value, indent=4))
|
|
493
490
|
f.flush()
|
|
@@ -506,8 +503,8 @@ class FileDirDict(PersiDict):
|
|
|
506
503
|
finally:
|
|
507
504
|
os.close(dir_fd)
|
|
508
505
|
elif os.name == 'nt':
|
|
509
|
-
|
|
510
|
-
|
|
506
|
+
# On Windows, try to flush directory metadata
|
|
507
|
+
# This is less reliable than on POSIX systems
|
|
511
508
|
try:
|
|
512
509
|
handle = CreateFileW(
|
|
513
510
|
dir_name,
|
|
@@ -539,7 +536,7 @@ class FileDirDict(PersiDict):
|
|
|
539
536
|
def _save_to_file(self, file_name:str, value:Any) -> None:
|
|
540
537
|
"""Save a value to a file with retry/backoff.
|
|
541
538
|
|
|
542
|
-
Ensures the configured
|
|
539
|
+
Ensures the configured serialization_format is compatible with value types and then
|
|
543
540
|
writes the value using an exponential backoff to better tolerate
|
|
544
541
|
concurrent readers/writers.
|
|
545
542
|
|
|
@@ -548,14 +545,14 @@ class FileDirDict(PersiDict):
|
|
|
548
545
|
value (Any): Value to serialize and save.
|
|
549
546
|
|
|
550
547
|
Raises:
|
|
551
|
-
ValueError: If
|
|
548
|
+
ValueError: If serialization_format is incompatible with non-string values.
|
|
552
549
|
Exception: Propagates the last exception if all retries fail.
|
|
553
550
|
"""
|
|
554
551
|
|
|
555
|
-
if not (self.
|
|
552
|
+
if not (self.serialization_format in {"pkl", "json"} or issubclass(
|
|
556
553
|
self.base_class_for_values, str)):
|
|
557
554
|
raise ValueError("When base_class_for_values is not str,"
|
|
558
|
-
+ "
|
|
555
|
+
+ " serialization_format must be pkl or json.")
|
|
559
556
|
|
|
560
557
|
n_retries = 12
|
|
561
558
|
# extra protections to better handle concurrent writes
|
|
@@ -565,33 +562,35 @@ class FileDirDict(PersiDict):
|
|
|
565
562
|
return
|
|
566
563
|
except Exception as e:
|
|
567
564
|
if i < n_retries - 1:
|
|
568
|
-
time.sleep(random.uniform(0.01, 0.2) * (1.
|
|
565
|
+
time.sleep(random.uniform(0.01, 0.2) * (1.75 ** i))
|
|
569
566
|
else:
|
|
570
567
|
raise e
|
|
571
568
|
|
|
572
569
|
|
|
573
|
-
def __contains__(self, key:
|
|
570
|
+
def __contains__(self, key:NonEmptyPersiDictKey) -> bool:
|
|
574
571
|
"""Check whether a key exists in the dictionary.
|
|
575
572
|
|
|
576
573
|
Args:
|
|
577
|
-
key (
|
|
574
|
+
key (NonEmptyPersiDictKey): Key (string or sequence of strings
|
|
575
|
+
or NonEmptySafeStrTuple).
|
|
578
576
|
|
|
579
577
|
Returns:
|
|
580
578
|
bool: True if a file for the key exists; False otherwise.
|
|
581
579
|
"""
|
|
582
|
-
key =
|
|
580
|
+
key = NonEmptySafeStrTuple(key)
|
|
583
581
|
filename = self._build_full_path(key)
|
|
584
582
|
return os.path.isfile(filename)
|
|
585
583
|
|
|
586
584
|
|
|
587
|
-
def __getitem__(self, key:
|
|
585
|
+
def __getitem__(self, key:NonEmptyPersiDictKey) -> Any:
|
|
588
586
|
"""Retrieve the value stored for a key.
|
|
589
587
|
|
|
590
588
|
Equivalent to obj[key]. Reads the corresponding file from the disk and
|
|
591
|
-
deserializes according to
|
|
589
|
+
deserializes according to serialization_format.
|
|
592
590
|
|
|
593
591
|
Args:
|
|
594
|
-
key (
|
|
592
|
+
key (NonEmptyPersiDictKey): Key (string or sequence of strings
|
|
593
|
+
or NonEmptySafeStrTuple).
|
|
595
594
|
|
|
596
595
|
Returns:
|
|
597
596
|
Any: The stored value.
|
|
@@ -601,7 +600,7 @@ class FileDirDict(PersiDict):
|
|
|
601
600
|
TypeError: If the deserialized value does not match base_class_for_values
|
|
602
601
|
when it is set.
|
|
603
602
|
"""
|
|
604
|
-
key =
|
|
603
|
+
key = NonEmptySafeStrTuple(key)
|
|
605
604
|
filename = self._build_full_path(key)
|
|
606
605
|
if not os.path.isfile(filename):
|
|
607
606
|
raise KeyError(f"File {filename} does not exist")
|
|
@@ -614,44 +613,45 @@ class FileDirDict(PersiDict):
|
|
|
614
613
|
return result
|
|
615
614
|
|
|
616
615
|
|
|
617
|
-
def __setitem__(self, key:
|
|
616
|
+
def __setitem__(self, key:NonEmptyPersiDictKey, value:Any):
|
|
618
617
|
"""Store a value for a key on the disk.
|
|
619
618
|
|
|
620
619
|
Interprets joker values KEEP_CURRENT and DELETE_CURRENT accordingly.
|
|
621
620
|
Validates value type if base_class_for_values is set, then serializes
|
|
622
|
-
and writes to a file determined by the key and
|
|
621
|
+
and writes to a file determined by the key and serialization_format.
|
|
623
622
|
|
|
624
623
|
Args:
|
|
625
|
-
key (
|
|
624
|
+
key (NonEmptyPersiDictKey): Key (string or sequence of strings
|
|
625
|
+
or NonEmptySafeStrTuple).
|
|
626
626
|
value (Any): Value to store, or a joker command.
|
|
627
627
|
|
|
628
628
|
Raises:
|
|
629
629
|
KeyError: If attempting to modify an existing item when
|
|
630
|
-
|
|
630
|
+
append_only is True.
|
|
631
631
|
TypeError: If the value is a PersiDict or does not match
|
|
632
632
|
base_class_for_values when it is set.
|
|
633
633
|
"""
|
|
634
634
|
|
|
635
|
-
key =
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
# processed by base class
|
|
639
|
-
return
|
|
635
|
+
key = NonEmptySafeStrTuple(key)
|
|
636
|
+
if self._process_setitem_args(key, value) is EXECUTION_IS_COMPLETE:
|
|
637
|
+
return None
|
|
640
638
|
|
|
641
639
|
filename = self._build_full_path(key, create_subdirs=True)
|
|
642
640
|
self._save_to_file(filename, value)
|
|
643
641
|
|
|
644
642
|
|
|
645
|
-
def __delitem__(self, key:
|
|
643
|
+
def __delitem__(self, key:NonEmptyPersiDictKey) -> None:
|
|
646
644
|
"""Delete the stored value for a key.
|
|
647
645
|
|
|
648
646
|
Args:
|
|
649
|
-
key (
|
|
647
|
+
key (NonEmptyPersiDictKey): Key (string or sequence of strings
|
|
648
|
+
or NonEmptySafeStrTuple).
|
|
650
649
|
|
|
651
650
|
Raises:
|
|
652
|
-
KeyError: If
|
|
651
|
+
KeyError: If append_only is True or if the key does not exist.
|
|
653
652
|
"""
|
|
654
|
-
key =
|
|
653
|
+
key = NonEmptySafeStrTuple(key)
|
|
654
|
+
self._process_delitem_args(key)
|
|
655
655
|
filename = self._build_full_path(key)
|
|
656
656
|
if not os.path.isfile(filename):
|
|
657
657
|
raise KeyError(f"File {filename} does not exist")
|
|
@@ -681,9 +681,9 @@ class FileDirDict(PersiDict):
|
|
|
681
681
|
ValueError: If result_type is empty or contains unsupported labels.
|
|
682
682
|
"""
|
|
683
683
|
|
|
684
|
-
|
|
684
|
+
self._process_generic_iter_args(result_type)
|
|
685
685
|
walk_results = os.walk(self._base_dir)
|
|
686
|
-
ext_len = len(self.
|
|
686
|
+
ext_len = len(self.serialization_format) + 1
|
|
687
687
|
|
|
688
688
|
def splitter(dir_path: str):
|
|
689
689
|
"""Transform a relative dirname into SafeStrTuple components.
|
|
@@ -700,7 +700,7 @@ class FileDirDict(PersiDict):
|
|
|
700
700
|
|
|
701
701
|
def step():
|
|
702
702
|
"""Generator that yields entries based on result_type."""
|
|
703
|
-
suffix = "." + self.
|
|
703
|
+
suffix = "." + self.serialization_format
|
|
704
704
|
for dir_name, _, files in walk_results:
|
|
705
705
|
for f in files:
|
|
706
706
|
if f.endswith(suffix):
|
|
@@ -714,13 +714,21 @@ class FileDirDict(PersiDict):
|
|
|
714
714
|
to_return = []
|
|
715
715
|
|
|
716
716
|
if "keys" in result_type:
|
|
717
|
-
key_to_return= unsign_safe_str_tuple(
|
|
717
|
+
key_to_return = unsign_safe_str_tuple(
|
|
718
718
|
result_key, self.digest_len)
|
|
719
719
|
to_return.append(key_to_return)
|
|
720
720
|
|
|
721
721
|
if "values" in result_type:
|
|
722
|
+
# The file can be deleted between listing and fetching.
|
|
723
|
+
# Skip such races instead of raising to make iteration robust.
|
|
722
724
|
full_path = os.path.join(dir_name, f)
|
|
723
|
-
|
|
725
|
+
try:
|
|
726
|
+
value_to_return = self._read_from_file(full_path)
|
|
727
|
+
except:
|
|
728
|
+
if not os.path.isfile(full_path):
|
|
729
|
+
continue
|
|
730
|
+
else:
|
|
731
|
+
raise
|
|
724
732
|
to_return.append(value_to_return)
|
|
725
733
|
|
|
726
734
|
if len(result_type) == 1:
|
|
@@ -735,13 +743,13 @@ class FileDirDict(PersiDict):
|
|
|
735
743
|
return step()
|
|
736
744
|
|
|
737
745
|
|
|
738
|
-
def timestamp(self, key:
|
|
746
|
+
def timestamp(self, key:NonEmptyPersiDictKey) -> float:
|
|
739
747
|
"""Get last modification time (in seconds, Unix epoch time).
|
|
740
748
|
|
|
741
749
|
This method is absent in the original dict API.
|
|
742
750
|
|
|
743
751
|
Args:
|
|
744
|
-
key (
|
|
752
|
+
key (NonEmptyPersiDictKey): Key whose timestamp to return.
|
|
745
753
|
|
|
746
754
|
Returns:
|
|
747
755
|
float: POSIX timestamp of the underlying file.
|
|
@@ -749,25 +757,25 @@ class FileDirDict(PersiDict):
|
|
|
749
757
|
Raises:
|
|
750
758
|
FileNotFoundError: If the key does not exist.
|
|
751
759
|
"""
|
|
752
|
-
key =
|
|
760
|
+
key = NonEmptySafeStrTuple(key)
|
|
753
761
|
filename = self._build_full_path(key)
|
|
754
762
|
return os.path.getmtime(filename)
|
|
755
763
|
|
|
756
764
|
|
|
757
|
-
def random_key(self) ->
|
|
765
|
+
def random_key(self) -> NonEmptySafeStrTuple | None:
|
|
758
766
|
"""Return a uniformly random key from the dictionary, or None if empty.
|
|
759
767
|
|
|
760
768
|
Performs a full directory traversal using reservoir sampling
|
|
761
|
-
(k=1) to select a random file matching the configured
|
|
769
|
+
(k=1) to select a random file matching the configured serialization_format without
|
|
762
770
|
loading all keys into memory.
|
|
763
771
|
|
|
764
772
|
Returns:
|
|
765
|
-
|
|
773
|
+
NonEmptySafeStrTuple | None: A random key if any items exist; otherwise None.
|
|
766
774
|
"""
|
|
767
775
|
# canonicalise extension once
|
|
768
776
|
ext = None
|
|
769
|
-
if self.
|
|
770
|
-
ext = self.
|
|
777
|
+
if self.serialization_format:
|
|
778
|
+
ext = self.serialization_format
|
|
771
779
|
if not ext.startswith("."):
|
|
772
780
|
ext = "." + ext
|
|
773
781
|
|