persidict 0.32.7__tar.gz → 0.34.1__tar.gz

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of persidict might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: persidict
3
- Version: 0.32.7
3
+ Version: 0.34.1
4
4
  Summary: Simple persistent key-value store for Python. Values are stored as files on a disk or as S3 objects on AWS cloud.
5
5
  Keywords: persistence,dicts,distributed,parallel
6
6
  Author: Vlad (Volodymyr) Pavlov
@@ -21,7 +21,6 @@ Requires-Dist: joblib
21
21
  Requires-Dist: numpy
22
22
  Requires-Dist: pandas
23
23
  Requires-Dist: jsonpickle
24
- Requires-Dist: joblib
25
24
  Requires-Dist: deepdiff
26
25
  Requires-Dist: boto3 ; extra == 'aws'
27
26
  Requires-Dist: boto3 ; extra == 'dev'
@@ -4,7 +4,7 @@ build-backend = "uv_build"
4
4
 
5
5
  [project]
6
6
  name = "persidict"
7
- version = "0.32.7"
7
+ version = "0.34.1"
8
8
  description = "Simple persistent key-value store for Python. Values are stored as files on a disk or as S3 objects on AWS cloud."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -31,7 +31,6 @@ dependencies = [
31
31
  "numpy",
32
32
  "pandas",
33
33
  "jsonpickle",
34
- "joblib",
35
34
  "deepdiff"
36
35
  ]
37
36
 
@@ -11,6 +11,7 @@ from __future__ import annotations
11
11
 
12
12
  import os
13
13
  import random
14
+ import tempfile
14
15
  import time
15
16
  from typing import Any, Optional
16
17
 
@@ -79,6 +80,10 @@ class FileDirDict(PersiDict):
79
80
 
80
81
  assert file_type == replace_unsafe_chars(file_type, "")
81
82
  self.file_type = file_type
83
+ if self.file_type == "__etag__":
84
+ raise ValueError(
85
+ "file_type cannot be 'etag' as it is a reserved"
86
+ " extension for S3 caching.")
82
87
 
83
88
  if (base_class_for_values is None or
84
89
  not issubclass(base_class_for_values,str)):
@@ -90,13 +95,7 @@ class FileDirDict(PersiDict):
90
95
  if os.path.isfile(base_dir):
91
96
  raise ValueError(f"{base_dir} is a file, not a directory.")
92
97
 
93
- try: # extra protection to better handle concurrent access
94
- if not os.path.isdir(base_dir):
95
- os.mkdir(base_dir)
96
- except:
97
- time.sleep(random.random()/random.randint(1, 3))
98
- if not os.path.isdir(base_dir):
99
- os.mkdir(base_dir)
98
+ os.makedirs(base_dir, exist_ok=True)
100
99
  assert os.path.isdir(base_dir)
101
100
 
102
101
  # self.base_dir_param = _base_dir
@@ -137,25 +136,16 @@ class FileDirDict(PersiDict):
137
136
 
138
137
 
139
138
  def __len__(self) -> int:
140
- """ Get the number of key-value pairs in the dictionary."""
141
-
142
- num_files = 0
143
- suffix = "." + self.file_type
144
- stack = [self._base_dir]
139
+ """ Get the number of key-value pairs in the dictionary.
145
140
 
146
- while stack:
147
- path = stack.pop()
148
- try:
149
- with os.scandir(path) as it:
150
- for entry in it:
151
- if entry.is_dir(follow_symlinks=False):
152
- stack.append(entry.path)
153
- elif entry.is_file(follow_symlinks=False) and entry.name.endswith(suffix):
154
- num_files += 1
155
- except PermissionError:
156
- continue
141
+ WARNING: This operation can be slow on large dictionaries as it
142
+ needs to recursively walk the entire base directory.
143
+ Avoid using it in performance-sensitive code.
144
+ """
157
145
 
158
- return num_files
146
+ suffix = "." + self.file_type
147
+ return sum(1 for _, _, files in os.walk(self._base_dir)
148
+ for f in files if f.endswith(suffix))
159
149
 
160
150
 
161
151
  def clear(self) -> None:
@@ -164,6 +154,9 @@ class FileDirDict(PersiDict):
164
154
  if self.immutable_items:
165
155
  raise KeyError("Can't clear a dict that contains immutable items")
166
156
 
157
+ # we can't use shutil.rmtree() because
158
+ # there may be overlapping dictionaries
159
+ # with different file_type-s
167
160
  for subdir_info in os.walk(self._base_dir, topdown=False):
168
161
  (subdir_name, _, files) = subdir_info
169
162
  suffix = "." + self.file_type
@@ -174,6 +167,7 @@ class FileDirDict(PersiDict):
174
167
  len(os.listdir(subdir_name)) == 0 ):
175
168
  os.rmdir(subdir_name)
176
169
 
170
+
177
171
  def _build_full_path(self
178
172
  , key:SafeStrTuple
179
173
  , create_subdirs:bool=False
@@ -185,17 +179,8 @@ class FileDirDict(PersiDict):
185
179
  dir_names = key[:-1] if is_file_path else key
186
180
 
187
181
  if create_subdirs:
188
- current_dir = dir_names[0]
189
- for dir_name in dir_names[1:]:
190
- new_dir = os.path.join(current_dir, dir_name)
191
- try: # extra protection to better handle concurrent access
192
- if not os.path.isdir(new_dir):
193
- os.mkdir(new_dir)
194
- except:
195
- time.sleep(random.random()/random.randint(1, 3))
196
- if not os.path.isdir(new_dir):
197
- os.mkdir(new_dir)
198
- current_dir = new_dir
182
+ dir_path = os.path.join(*dir_names)
183
+ os.makedirs(dir_path, exist_ok=True)
199
184
 
200
185
  if is_file_path:
201
186
  file_name = key[-1] + "." + self.file_type
@@ -282,25 +267,50 @@ class FileDirDict(PersiDict):
282
267
  for i in range(n_retries):
283
268
  try:
284
269
  return self._read_from_file_impl(file_name)
285
- except:
286
- time.sleep(random.random()/random.randint(1, 10))
287
-
288
- return self._read_from_file_impl(file_name)
270
+ except Exception as e:
271
+ if i < n_retries - 1:
272
+ time.sleep(random.uniform(0.01, 0.1) * (2 ** i))
273
+ else:
274
+ raise e
289
275
 
290
276
 
291
277
  def _save_to_file_impl(self, file_name:str, value:Any) -> None:
292
278
  """Save a value to a file. """
293
279
 
294
- if self.file_type == "pkl":
295
- with open(file_name, 'wb') as f:
296
- joblib.dump(value, f, compress='lz4')
297
- elif self.file_type == "json":
298
- with open(file_name, 'w') as f:
299
- f.write(jsonpickle.dumps(value, indent=4))
300
- else:
301
- with open(file_name, 'w') as f:
302
- f.write(value)
280
+ dir_name = os.path.dirname(file_name)
281
+ # Use a temporary file and atomic rename to prevent data corruption
282
+ fd, temp_path = tempfile.mkstemp(dir=dir_name, prefix=".__tmp__")
283
+
284
+ try:
285
+ if self.file_type == "pkl":
286
+ with open(fd, 'wb') as f:
287
+ joblib.dump(value, f, compress='lz4')
288
+ f.flush()
289
+ os.fsync(f.fileno())
290
+ elif self.file_type == "json":
291
+ with open(fd, 'w') as f:
292
+ f.write(jsonpickle.dumps(value, indent=4))
293
+ f.flush()
294
+ os.fsync(f.fileno())
295
+ else:
296
+ with open(fd, 'w') as f:
297
+ f.write(value)
298
+ f.flush()
299
+ os.fsync(f.fileno())
300
+ os.replace(temp_path, file_name)
301
+ try:
302
+ if os.name == 'posix':
303
+ dir_fd = os.open(dir_name, os.O_RDONLY)
304
+ try:
305
+ os.fsync(dir_fd)
306
+ finally:
307
+ os.close(dir_fd)
308
+ except OSError:
309
+ pass
303
310
 
311
+ except:
312
+ os.remove(temp_path)
313
+ raise
304
314
 
305
315
  def _save_to_file(self, file_name:str, value:Any) -> None:
306
316
  """Save a value to a file. """
@@ -310,16 +320,17 @@ class FileDirDict(PersiDict):
310
320
  raise ValueError("When base_class_for_values is not str,"
311
321
  + " file_type must be pkl or json.")
312
322
 
313
- n_retries = 3
323
+ n_retries = 8
314
324
  # extra protections to better handle concurrent writes
315
325
  for i in range(n_retries):
316
- try: # extra protections to better handle concurrent writes
326
+ try:
317
327
  self._save_to_file_impl(file_name, value)
318
328
  return
319
- except:
320
- time.sleep(random.random()/random.randint(1, 5))
321
-
322
- self._save_to_file_impl(file_name, value)
329
+ except Exception as e:
330
+ if i < n_retries - 1:
331
+ time.sleep(random.uniform(0.01, 0.1) * (2 ** i))
332
+ else:
333
+ raise e
323
334
 
324
335
 
325
336
  def __contains__(self, key:PersiDictKey) -> bool:
@@ -394,16 +405,9 @@ class FileDirDict(PersiDict):
394
405
 
395
406
  def splitter(dir_path: str):
396
407
  """Transform a dirname into a PersiDictKey key"""
397
- splitted_str = []
398
408
  if dir_path == ".":
399
- return splitted_str
400
- while True:
401
- head, tail = os.path.split(dir_path)
402
- splitted_str = [tail] + splitted_str
403
- dir_path = head
404
- if len(head) == 0:
405
- break
406
- return tuple(splitted_str)
409
+ return []
410
+ return dir_path.split(os.sep)
407
411
 
408
412
  def step():
409
413
  suffix = "." + self.file_type
@@ -424,7 +428,8 @@ class FileDirDict(PersiDict):
424
428
  to_return.append(key_to_return)
425
429
 
426
430
  if "values" in result_type:
427
- value_to_return = self[result_key]
431
+ full_path = os.path.join(dir_name, f)
432
+ value_to_return = self._read_from_file(full_path)
428
433
  to_return.append(value_to_return)
429
434
 
430
435
  if len(result_type) == 1:
@@ -451,7 +456,6 @@ class FileDirDict(PersiDict):
451
456
 
452
457
  def random_key(self) -> PersiDictKey | None:
453
458
  # canonicalise extension once
454
- early_exit_cap = 10_000
455
459
  ext = None
456
460
  if self.file_type:
457
461
  ext = self.file_type.lower()
@@ -479,9 +483,6 @@ class FileDirDict(PersiDict):
479
483
  seen += 1
480
484
  if random.random() < 1 / seen: # reservoir k=1
481
485
  winner = ent.path
482
- # early‑exit when cap reached
483
- if early_exit_cap and seen >= early_exit_cap:
484
- return self._build_key_from_full_path(os.path.abspath(winner))
485
486
  except PermissionError:
486
487
  continue
487
488
 
@@ -1,9 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import os
4
+ import tempfile
4
5
  from typing import Any, Optional
5
6
 
6
7
  import boto3
8
+ from botocore.exceptions import ClientError
9
+
7
10
  import parameterizable
8
11
  from parameterizable.dict_sorter import sort_dict_by_keys
9
12
 
@@ -60,16 +63,19 @@ class S3Dict(PersiDict):
60
63
  check types of values in the dictionary. If not specified,
61
64
  no type checking will be performed and all types will be allowed.
62
65
 
63
- file_type is extension, which will be used for all files in the dictionary.
66
+ file_type is an extension, which will be used for all files in the dictionary.
64
67
  If file_type has one of two values: "lz4" or "json", it defines
65
68
  which file format will be used by FileDirDict to store values.
66
69
  For all other values of file_type, the file format will always be plain
67
- text. "lz4" or "json" allow to store arbitrary Python objects,
70
+ text. "lz4" or "json" allow storing arbitrary Python objects,
68
71
  while all other file_type-s only work with str objects.
69
72
  """
70
73
 
71
74
  super().__init__(immutable_items = immutable_items, digest_len = 0)
72
75
  self.file_type = file_type
76
+ if self.file_type == "__etag__":
77
+ raise ValueError(
78
+ "file_type cannot be 'etag' as it is a reserved extension for caching.")
73
79
 
74
80
  self.local_cache = FileDirDict(
75
81
  base_dir= base_dir
@@ -152,26 +158,89 @@ class S3Dict(PersiDict):
152
158
  return False
153
159
 
154
160
 
161
+ def _write_etag_file(self, file_name: str, etag: str):
162
+ """Atomically write the ETag to its cache file."""
163
+ if not etag:
164
+ return
165
+ etag_file_name = file_name + ".__etag__"
166
+ dir_name = os.path.dirname(etag_file_name)
167
+ # Write to a temporary file and then rename for atomicity
168
+ fd, temp_path = tempfile.mkstemp(dir=dir_name)
169
+ try:
170
+ with os.fdopen(fd, "w") as f:
171
+ f.write(etag)
172
+ f.flush()
173
+ os.fsync(f.fileno())
174
+ os.replace(temp_path, etag_file_name)
175
+ try:
176
+ if os.name == 'posix':
177
+ dir_fd = os.open(dir_name, os.O_RDONLY)
178
+ try:
179
+ os.fsync(dir_fd)
180
+ finally:
181
+ os.close(dir_fd)
182
+ except OSError:
183
+ pass
184
+ except:
185
+ os.remove(temp_path)
186
+ raise
187
+
188
+
155
189
  def __getitem__(self, key:PersiDictKey) -> Any:
156
190
  """X.__getitem__(y) is an equivalent to X[y]. """
157
191
 
158
192
  key = SafeStrTuple(key)
159
193
  file_name = self.local_cache._build_full_path(key, create_subdirs=True)
160
194
 
161
- if self.immutable_items:
195
+ if self.immutable_items and os.path.exists(file_name):
196
+ return self.local_cache._read_from_file(file_name)
197
+
198
+ obj_name = self._build_full_objectname(key)
199
+
200
+
201
+ try:
202
+ head = self.s3_client.head_object(
203
+ Bucket=self.bucket_name, Key=obj_name)
204
+ s3_etag = head.get("ETag")
205
+ except ClientError as e:
206
+ if e.response['Error']['Code'] == '404':
207
+ raise KeyError(f"Key {key} not found in S3 bucket {self.bucket_name}")
208
+ else:
209
+ # Re-raise other client errors (e.g., permissions, throttling)
210
+ raise
211
+
212
+ etag_file_name = file_name + ".__etag__"
213
+ if not self.immutable_items and os.path.exists(file_name) and os.path.exists(etag_file_name):
214
+ with open(etag_file_name, "r") as f:
215
+ cached_etag = f.read()
216
+ if cached_etag == s3_etag:
217
+ return self.local_cache._read_from_file(file_name)
218
+
219
+ dir_name = os.path.dirname(file_name)
220
+ fd, temp_path = tempfile.mkstemp(dir=dir_name, prefix=".__tmp__")
221
+
222
+ try:
223
+ with os.fdopen(fd, 'wb') as f:
224
+ self.s3_client.download_fileobj(self.bucket_name, obj_name, f)
225
+ f.flush()
226
+ os.fsync(f.fileno())
227
+ os.replace(temp_path, file_name)
162
228
  try:
163
- result = self.local_cache._read_from_file(file_name)
164
- return result
165
- except:
229
+ if os.name == 'posix':
230
+ dir_fd = os.open(dir_name, os.O_RDONLY)
231
+ try:
232
+ os.fsync(dir_fd)
233
+ finally:
234
+ os.close(dir_fd)
235
+ except OSError:
166
236
  pass
237
+ except:
238
+ os.remove(temp_path) # Clean up temp file on failure
239
+ raise
167
240
 
168
- obj_name = self._build_full_objectname(key)
169
- self.s3_client.download_file(self.bucket_name, obj_name, file_name)
170
- result = self.local_cache._read_from_file(file_name)
171
- if not self.immutable_items:
172
- os.remove(file_name)
241
+ self._write_etag_file(file_name, s3_etag)
173
242
 
174
- return result
243
+ return self.local_cache._read_from_file(file_name)
175
244
 
176
245
 
177
246
  def __setitem__(self, key:PersiDictKey, value:Any):
@@ -196,28 +265,27 @@ class S3Dict(PersiDict):
196
265
  + f"but it is {type(value)} instead." )
197
266
 
198
267
  key = SafeStrTuple(key)
199
- file_name = self.local_cache._build_full_path(key, create_subdirs=True)
200
- obj_name = self._build_full_objectname(key)
201
268
 
202
- if self.immutable_items:
203
- key_is_present = False
204
- if os.path.exists(file_name):
205
- key_is_present = True
206
- else:
207
- try:
208
- self.s3_client.head_object(
209
- Bucket=self.bucket_name, Key=obj_name)
210
- key_is_present = True
211
- except:
212
- key_is_present = False
269
+ if self.immutable_items and key in self:
270
+ raise KeyError("Can't modify an immutable item")
213
271
 
214
- if key_is_present:
215
- raise KeyError("Can't modify an immutable item")
272
+ file_name = self.local_cache._build_full_path(key, create_subdirs=True)
273
+ obj_name = self._build_full_objectname(key)
216
274
 
217
275
  self.local_cache._save_to_file(file_name, value)
218
276
  self.s3_client.upload_file(file_name, self.bucket_name, obj_name)
219
- if not self.immutable_items:
220
- os.remove(file_name)
277
+
278
+ try:
279
+ head = self.s3_client.head_object(
280
+ Bucket=self.bucket_name, Key=obj_name)
281
+ s3_etag = head.get("ETag")
282
+ self._write_etag_file(file_name, s3_etag)
283
+ except ClientError:
284
+ # If we can't get ETag, we should remove any existing etag file
285
+ # to force a re-download on the next __getitem__ call.
286
+ etag_file_name = file_name + ".__etag__"
287
+ if os.path.exists(etag_file_name):
288
+ os.remove(etag_file_name)
221
289
 
222
290
 
223
291
  def __delitem__(self, key:PersiDictKey):
@@ -231,10 +299,17 @@ class S3Dict(PersiDict):
231
299
  file_name = self.local_cache._build_full_path(key)
232
300
  if os.path.isfile(file_name):
233
301
  os.remove(file_name)
234
-
302
+ etag_file_name = file_name + ".__etag__"
303
+ if os.path.isfile(etag_file_name):
304
+ os.remove(etag_file_name)
235
305
 
236
306
  def __len__(self) -> int:
237
- """Return len(self). """
307
+ """Return len(self).
308
+
309
+ WARNING: This operation can be very slow and costly on large S3 buckets
310
+ as it needs to iterate over all objects in the dictionary's prefix.
311
+ Avoid using it in performance-sensitive code.
312
+ """
238
313
 
239
314
  num_files = 0
240
315
  suffix = "." + self.file_type
@@ -1,12 +1,11 @@
1
1
  import string
2
- from copy import deepcopy
3
2
 
4
3
  SAFE_CHARS_SET = set(string.ascii_letters + string.digits + "()_-~.=")
5
4
  SAFE_STRING_MAX_LENGTH = 254
6
5
 
7
6
  def get_safe_chars() -> set[str]:
8
7
  """Return a set of allowed characters."""
9
- return deepcopy(SAFE_CHARS_SET)
8
+ return SAFE_CHARS_SET.copy()
10
9
 
11
10
  def replace_unsafe_chars(a_str:str, replace_with:str) -> str :
12
11
  """ Replace unsafe (special) characters with allowed (safe) ones."""
@@ -43,7 +43,7 @@ class SafeStrTuple(Sequence, Hashable):
43
43
  elif isinstance(a, str):
44
44
  assert len(a) > 0
45
45
  assert len(a) < SAFE_STRING_MAX_LENGTH
46
- assert len(set(a) - SAFE_CHARS_SET) == 0
46
+ assert all(c in SAFE_CHARS_SET for c in a)
47
47
  candidate_strings.append(a)
48
48
  elif _is_sequence_not_mapping(a):
49
49
  if len(a) > 0:
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import time
4
+ from functools import cache
4
5
 
5
6
  from deepdiff import DeepDiff
6
7
  from parameterizable import register_parameterizable_class, sort_dict_by_keys
@@ -115,17 +116,19 @@ class WriteOnceDict(PersiDict):
115
116
  """
116
117
  check_needed = False
117
118
 
118
- try: # extra protections to better handle concurrent writes
119
- if key in self._wrapped_dict:
120
- check_needed = True
121
- else:
122
- self._wrapped_dict[key] = value
123
- except:
124
- time.sleep(random.random()/random.randint(1,5))
125
- if key in self._wrapped_dict:
126
- check_needed = True
127
- else:
128
- self._wrapped_dict[key] = value
119
+ n_retries = 8
120
+ for i in range(n_retries):
121
+ try: # extra protections to better handle concurrent writes
122
+ if key in self._wrapped_dict:
123
+ check_needed = True
124
+ else:
125
+ self._wrapped_dict[key] = value
126
+ break
127
+ except Exception as e:
128
+ if i < n_retries - 1:
129
+ time.sleep(random.uniform(0.01, 0.1) * (2 ** i))
130
+ else:
131
+ raise e
129
132
 
130
133
  if not key in self._wrapped_dict:
131
134
  raise KeyError(
File without changes