persidict 0.32.8__py3-none-any.whl → 0.34.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of persidict might be problematic. Click here for more details.

persidict/file_dir_dict.py CHANGED
@@ -11,6 +11,7 @@ from __future__ import annotations
11
11
 
12
12
  import os
13
13
  import random
14
+ import tempfile
14
15
  import time
15
16
  from typing import Any, Optional
16
17
 
@@ -79,6 +80,10 @@ class FileDirDict(PersiDict):
79
80
 
80
81
  assert file_type == replace_unsafe_chars(file_type, "")
81
82
  self.file_type = file_type
83
+ if self.file_type == "__etag__":
84
+ raise ValueError(
85
+ "file_type cannot be 'etag' as it is a reserved"
86
+ " extension for S3 caching.")
82
87
 
83
88
  if (base_class_for_values is None or
84
89
  not issubclass(base_class_for_values,str)):
@@ -90,13 +95,7 @@ class FileDirDict(PersiDict):
90
95
  if os.path.isfile(base_dir):
91
96
  raise ValueError(f"{base_dir} is a file, not a directory.")
92
97
 
93
- try: # extra protection to better handle concurrent access
94
- if not os.path.isdir(base_dir):
95
- os.mkdir(base_dir)
96
- except:
97
- time.sleep(random.random()/random.randint(1, 3))
98
- if not os.path.isdir(base_dir):
99
- os.mkdir(base_dir)
98
+ os.makedirs(base_dir, exist_ok=True)
100
99
  assert os.path.isdir(base_dir)
101
100
 
102
101
  # self.base_dir_param = _base_dir
@@ -137,7 +136,12 @@ class FileDirDict(PersiDict):
137
136
 
138
137
 
139
138
  def __len__(self) -> int:
140
- """ Get the number of key-value pairs in the dictionary."""
139
+ """ Get the number of key-value pairs in the dictionary.
140
+
141
+ WARNING: This operation can be slow on large dictionaries as it
142
+ needs to recursively walk the entire base directory.
143
+ Avoid using it in performance-sensitive code.
144
+ """
141
145
 
142
146
  suffix = "." + self.file_type
143
147
  return sum(1 for _, _, files in os.walk(self._base_dir)
@@ -150,6 +154,9 @@ class FileDirDict(PersiDict):
150
154
  if self.immutable_items:
151
155
  raise KeyError("Can't clear a dict that contains immutable items")
152
156
 
157
+ # we can't use shutil.rmtree() because
158
+ # there may be overlapping dictionaries
159
+ # with different file_type-s
153
160
  for subdir_info in os.walk(self._base_dir, topdown=False):
154
161
  (subdir_name, _, files) = subdir_info
155
162
  suffix = "." + self.file_type
@@ -172,17 +179,8 @@ class FileDirDict(PersiDict):
172
179
  dir_names = key[:-1] if is_file_path else key
173
180
 
174
181
  if create_subdirs:
175
- current_dir = dir_names[0]
176
- for dir_name in dir_names[1:]:
177
- new_dir = os.path.join(current_dir, dir_name)
178
- try: # extra protection to better handle concurrent access
179
- if not os.path.isdir(new_dir):
180
- os.mkdir(new_dir)
181
- except:
182
- time.sleep(random.random()/random.randint(1, 3))
183
- if not os.path.isdir(new_dir):
184
- os.mkdir(new_dir)
185
- current_dir = new_dir
182
+ dir_path = os.path.join(*dir_names)
183
+ os.makedirs(dir_path, exist_ok=True)
186
184
 
187
185
  if is_file_path:
188
186
  file_name = key[-1] + "." + self.file_type
@@ -269,25 +267,50 @@ class FileDirDict(PersiDict):
269
267
  for i in range(n_retries):
270
268
  try:
271
269
  return self._read_from_file_impl(file_name)
272
- except:
273
- time.sleep(random.random()/random.randint(1, 10))
274
-
275
- return self._read_from_file_impl(file_name)
270
+ except Exception as e:
271
+ if i < n_retries - 1:
272
+ time.sleep(random.uniform(0.01, 0.1) * (2 ** i))
273
+ else:
274
+ raise e
276
275
 
277
276
 
278
277
  def _save_to_file_impl(self, file_name:str, value:Any) -> None:
279
278
  """Save a value to a file. """
280
279
 
281
- if self.file_type == "pkl":
282
- with open(file_name, 'wb') as f:
283
- joblib.dump(value, f, compress='lz4')
284
- elif self.file_type == "json":
285
- with open(file_name, 'w') as f:
286
- f.write(jsonpickle.dumps(value, indent=4))
287
- else:
288
- with open(file_name, 'w') as f:
289
- f.write(value)
280
+ dir_name = os.path.dirname(file_name)
281
+ # Use a temporary file and atomic rename to prevent data corruption
282
+ fd, temp_path = tempfile.mkstemp(dir=dir_name, prefix=".__tmp__")
283
+
284
+ try:
285
+ if self.file_type == "pkl":
286
+ with open(fd, 'wb') as f:
287
+ joblib.dump(value, f, compress='lz4')
288
+ f.flush()
289
+ os.fsync(f.fileno())
290
+ elif self.file_type == "json":
291
+ with open(fd, 'w') as f:
292
+ f.write(jsonpickle.dumps(value, indent=4))
293
+ f.flush()
294
+ os.fsync(f.fileno())
295
+ else:
296
+ with open(fd, 'w') as f:
297
+ f.write(value)
298
+ f.flush()
299
+ os.fsync(f.fileno())
300
+ os.replace(temp_path, file_name)
301
+ try:
302
+ if os.name == 'posix':
303
+ dir_fd = os.open(dir_name, os.O_RDONLY)
304
+ try:
305
+ os.fsync(dir_fd)
306
+ finally:
307
+ os.close(dir_fd)
308
+ except OSError:
309
+ pass
290
310
 
311
+ except:
312
+ os.remove(temp_path)
313
+ raise
291
314
 
292
315
  def _save_to_file(self, file_name:str, value:Any) -> None:
293
316
  """Save a value to a file. """
@@ -297,16 +320,17 @@ class FileDirDict(PersiDict):
297
320
  raise ValueError("When base_class_for_values is not str,"
298
321
  + " file_type must be pkl or json.")
299
322
 
300
- n_retries = 3
323
+ n_retries = 8
301
324
  # extra protections to better handle concurrent writes
302
325
  for i in range(n_retries):
303
- try: # extra protections to better handle concurrent writes
326
+ try:
304
327
  self._save_to_file_impl(file_name, value)
305
328
  return
306
- except:
307
- time.sleep(random.random()/random.randint(1, 5))
308
-
309
- self._save_to_file_impl(file_name, value)
329
+ except Exception as e:
330
+ if i < n_retries - 1:
331
+ time.sleep(random.uniform(0.01, 0.1) * (2 ** i))
332
+ else:
333
+ raise e
310
334
 
311
335
 
312
336
  def __contains__(self, key:PersiDictKey) -> bool:
@@ -381,16 +405,9 @@ class FileDirDict(PersiDict):
381
405
 
382
406
  def splitter(dir_path: str):
383
407
  """Transform a dirname into a PersiDictKey key"""
384
- splitted_str = []
385
408
  if dir_path == ".":
386
- return splitted_str
387
- while True:
388
- head, tail = os.path.split(dir_path)
389
- splitted_str = [tail] + splitted_str
390
- dir_path = head
391
- if len(head) == 0:
392
- break
393
- return tuple(splitted_str)
409
+ return []
410
+ return dir_path.split(os.sep)
394
411
 
395
412
  def step():
396
413
  suffix = "." + self.file_type
@@ -439,7 +456,6 @@ class FileDirDict(PersiDict):
439
456
 
440
457
  def random_key(self) -> PersiDictKey | None:
441
458
  # canonicalise extension once
442
- early_exit_cap = 10_000
443
459
  ext = None
444
460
  if self.file_type:
445
461
  ext = self.file_type.lower()
@@ -467,9 +483,6 @@ class FileDirDict(PersiDict):
467
483
  seen += 1
468
484
  if random.random() < 1 / seen: # reservoir k=1
469
485
  winner = ent.path
470
- # early‑exit when cap reached
471
- if early_exit_cap and seen >= early_exit_cap:
472
- return self._build_key_from_full_path(os.path.abspath(winner))
473
486
  except PermissionError:
474
487
  continue
475
488
 
persidict/s3_dict.py CHANGED
@@ -1,9 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import os
4
+ import tempfile
4
5
  from typing import Any, Optional
5
6
 
6
7
  import boto3
8
+ from botocore.exceptions import ClientError
9
+
7
10
  import parameterizable
8
11
  from parameterizable.dict_sorter import sort_dict_by_keys
9
12
 
@@ -60,16 +63,19 @@ class S3Dict(PersiDict):
60
63
  check types of values in the dictionary. If not specified,
61
64
  no type checking will be performed and all types will be allowed.
62
65
 
63
- file_type is extension, which will be used for all files in the dictionary.
66
+ file_type is an extension, which will be used for all files in the dictionary.
64
67
  If file_type has one of two values: "lz4" or "json", it defines
65
68
  which file format will be used by FileDirDict to store values.
66
69
  For all other values of file_type, the file format will always be plain
67
- text. "lz4" or "json" allow to store arbitrary Python objects,
70
+ text. "lz4" or "json" allow storing arbitrary Python objects,
68
71
  while all other file_type-s only work with str objects.
69
72
  """
70
73
 
71
74
  super().__init__(immutable_items = immutable_items, digest_len = 0)
72
75
  self.file_type = file_type
76
+ if self.file_type == "__etag__":
77
+ raise ValueError(
78
+ "file_type cannot be 'etag' as it is a reserved extension for caching.")
73
79
 
74
80
  self.local_cache = FileDirDict(
75
81
  base_dir= base_dir
@@ -152,26 +158,89 @@ class S3Dict(PersiDict):
152
158
  return False
153
159
 
154
160
 
161
+ def _write_etag_file(self, file_name: str, etag: str):
162
+ """Atomically write the ETag to its cache file."""
163
+ if not etag:
164
+ return
165
+ etag_file_name = file_name + ".__etag__"
166
+ dir_name = os.path.dirname(etag_file_name)
167
+ # Write to a temporary file and then rename for atomicity
168
+ fd, temp_path = tempfile.mkstemp(dir=dir_name)
169
+ try:
170
+ with os.fdopen(fd, "w") as f:
171
+ f.write(etag)
172
+ f.flush()
173
+ os.fsync(f.fileno())
174
+ os.replace(temp_path, etag_file_name)
175
+ try:
176
+ if os.name == 'posix':
177
+ dir_fd = os.open(dir_name, os.O_RDONLY)
178
+ try:
179
+ os.fsync(dir_fd)
180
+ finally:
181
+ os.close(dir_fd)
182
+ except OSError:
183
+ pass
184
+ except:
185
+ os.remove(temp_path)
186
+ raise
187
+
188
+
155
189
  def __getitem__(self, key:PersiDictKey) -> Any:
156
190
  """X.__getitem__(y) is an equivalent to X[y]. """
157
191
 
158
192
  key = SafeStrTuple(key)
159
193
  file_name = self.local_cache._build_full_path(key, create_subdirs=True)
160
194
 
161
- if self.immutable_items:
195
+ if self.immutable_items and os.path.exists(file_name):
196
+ return self.local_cache._read_from_file(file_name)
197
+
198
+ obj_name = self._build_full_objectname(key)
199
+
200
+
201
+ try:
202
+ head = self.s3_client.head_object(
203
+ Bucket=self.bucket_name, Key=obj_name)
204
+ s3_etag = head.get("ETag")
205
+ except ClientError as e:
206
+ if e.response['Error']['Code'] == '404':
207
+ raise KeyError(f"Key {key} not found in S3 bucket {self.bucket_name}")
208
+ else:
209
+ # Re-raise other client errors (e.g., permissions, throttling)
210
+ raise
211
+
212
+ etag_file_name = file_name + ".__etag__"
213
+ if not self.immutable_items and os.path.exists(file_name) and os.path.exists(etag_file_name):
214
+ with open(etag_file_name, "r") as f:
215
+ cached_etag = f.read()
216
+ if cached_etag == s3_etag:
217
+ return self.local_cache._read_from_file(file_name)
218
+
219
+ dir_name = os.path.dirname(file_name)
220
+ fd, temp_path = tempfile.mkstemp(dir=dir_name, prefix=".__tmp__")
221
+
222
+ try:
223
+ with os.fdopen(fd, 'wb') as f:
224
+ self.s3_client.download_fileobj(self.bucket_name, obj_name, f)
225
+ f.flush()
226
+ os.fsync(f.fileno())
227
+ os.replace(temp_path, file_name)
162
228
  try:
163
- result = self.local_cache._read_from_file(file_name)
164
- return result
165
- except:
229
+ if os.name == 'posix':
230
+ dir_fd = os.open(dir_name, os.O_RDONLY)
231
+ try:
232
+ os.fsync(dir_fd)
233
+ finally:
234
+ os.close(dir_fd)
235
+ except OSError:
166
236
  pass
237
+ except:
238
+ os.remove(temp_path) # Clean up temp file on failure
239
+ raise
167
240
 
168
- obj_name = self._build_full_objectname(key)
169
- self.s3_client.download_file(self.bucket_name, obj_name, file_name)
170
- result = self.local_cache._read_from_file(file_name)
171
- if not self.immutable_items:
172
- os.remove(file_name)
241
+ self._write_etag_file(file_name, s3_etag)
173
242
 
174
- return result
243
+ return self.local_cache._read_from_file(file_name)
175
244
 
176
245
 
177
246
  def __setitem__(self, key:PersiDictKey, value:Any):
@@ -196,28 +265,27 @@ class S3Dict(PersiDict):
196
265
  + f"but it is {type(value)} instead." )
197
266
 
198
267
  key = SafeStrTuple(key)
199
- file_name = self.local_cache._build_full_path(key, create_subdirs=True)
200
- obj_name = self._build_full_objectname(key)
201
268
 
202
- if self.immutable_items:
203
- key_is_present = False
204
- if os.path.exists(file_name):
205
- key_is_present = True
206
- else:
207
- try:
208
- self.s3_client.head_object(
209
- Bucket=self.bucket_name, Key=obj_name)
210
- key_is_present = True
211
- except:
212
- key_is_present = False
269
+ if self.immutable_items and key in self:
270
+ raise KeyError("Can't modify an immutable item")
213
271
 
214
- if key_is_present:
215
- raise KeyError("Can't modify an immutable item")
272
+ file_name = self.local_cache._build_full_path(key, create_subdirs=True)
273
+ obj_name = self._build_full_objectname(key)
216
274
 
217
275
  self.local_cache._save_to_file(file_name, value)
218
276
  self.s3_client.upload_file(file_name, self.bucket_name, obj_name)
219
- if not self.immutable_items:
220
- os.remove(file_name)
277
+
278
+ try:
279
+ head = self.s3_client.head_object(
280
+ Bucket=self.bucket_name, Key=obj_name)
281
+ s3_etag = head.get("ETag")
282
+ self._write_etag_file(file_name, s3_etag)
283
+ except ClientError:
284
+ # If we can't get ETag, we should remove any existing etag file
285
+ # to force a re-download on the next __getitem__ call.
286
+ etag_file_name = file_name + ".__etag__"
287
+ if os.path.exists(etag_file_name):
288
+ os.remove(etag_file_name)
221
289
 
222
290
 
223
291
  def __delitem__(self, key:PersiDictKey):
@@ -231,10 +299,17 @@ class S3Dict(PersiDict):
231
299
  file_name = self.local_cache._build_full_path(key)
232
300
  if os.path.isfile(file_name):
233
301
  os.remove(file_name)
234
-
302
+ etag_file_name = file_name + ".__etag__"
303
+ if os.path.isfile(etag_file_name):
304
+ os.remove(etag_file_name)
235
305
 
236
306
  def __len__(self) -> int:
237
- """Return len(self). """
307
+ """Return len(self).
308
+
309
+ WARNING: This operation can be very slow and costly on large S3 buckets
310
+ as it needs to iterate over all objects in the dictionary's prefix.
311
+ Avoid using it in performance-sensitive code.
312
+ """
238
313
 
239
314
  num_files = 0
240
315
  suffix = "." + self.file_type
persidict/safe_str_tuple.py CHANGED
@@ -43,7 +43,7 @@ class SafeStrTuple(Sequence, Hashable):
43
43
  elif isinstance(a, str):
44
44
  assert len(a) > 0
45
45
  assert len(a) < SAFE_STRING_MAX_LENGTH
46
- assert len(set(a) - SAFE_CHARS_SET) == 0
46
+ assert all(c in SAFE_CHARS_SET for c in a)
47
47
  candidate_strings.append(a)
48
48
  elif _is_sequence_not_mapping(a):
49
49
  if len(a) > 0:
persidict/write_once_dict.py CHANGED
@@ -116,17 +116,19 @@ class WriteOnceDict(PersiDict):
116
116
  """
117
117
  check_needed = False
118
118
 
119
- try: # extra protections to better handle concurrent writes
120
- if key in self._wrapped_dict:
121
- check_needed = True
122
- else:
123
- self._wrapped_dict[key] = value
124
- except:
125
- time.sleep(random.random()/random.randint(1,5))
126
- if key in self._wrapped_dict:
127
- check_needed = True
128
- else:
129
- self._wrapped_dict[key] = value
119
+ n_retries = 8
120
+ for i in range(n_retries):
121
+ try: # extra protections to better handle concurrent writes
122
+ if key in self._wrapped_dict:
123
+ check_needed = True
124
+ else:
125
+ self._wrapped_dict[key] = value
126
+ break
127
+ except Exception as e:
128
+ if i < n_retries - 1:
129
+ time.sleep(random.uniform(0.01, 0.1) * (2 ** i))
130
+ else:
131
+ raise e
130
132
 
131
133
  if not key in self._wrapped_dict:
132
134
  raise KeyError(
persidict-0.32.8.dist-info/METADATA → persidict-0.34.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: persidict
3
- Version: 0.32.8
3
+ Version: 0.34.1
4
4
  Summary: Simple persistent key-value store for Python. Values are stored as files on a disk or as S3 objects on AWS cloud.
5
5
  Keywords: persistence,dicts,distributed,parallel
6
6
  Author: Vlad (Volodymyr) Pavlov
@@ -21,7 +21,6 @@ Requires-Dist: joblib
21
21
  Requires-Dist: numpy
22
22
  Requires-Dist: pandas
23
23
  Requires-Dist: jsonpickle
24
- Requires-Dist: joblib
25
24
  Requires-Dist: deepdiff
26
25
  Requires-Dist: boto3 ; extra == 'aws'
27
26
  Requires-Dist: boto3 ; extra == 'dev'
persidict-0.32.8.dist-info/RECORD → persidict-0.34.1.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
1
1
  persidict/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
2
2
  persidict/__init__.py,sha256=CDOSJGgCnyRTkGUTzaeg3Cqsxwx0-0EFieOtldXwAls,1380
3
- persidict/file_dir_dict.py,sha256=SijMlIbte9kNbEyryjK4-cXakQl35gCAo1HaO1WcaU4,17589
3
+ persidict/file_dir_dict.py,sha256=Dr4gdIC5uqykRgNca1pI_M_jEd_9FGjys0BvzAJR0JU,17804
4
4
  persidict/jokers.py,sha256=kX4bE-jKWTM2ki7JOmm_2uJS8zm8u6InZ_V12xo2ImI,1436
5
5
  persidict/overlapping_multi_dict.py,sha256=a-lUbmY15_HrDq6jSIt8F8tJboqbeYiuRQeW4elf_oU,2663
6
6
  persidict/persi_dict.py,sha256=SF6aWs6kCeeW-bZ9HJwx0sPX7Xav_aURqeSZ-j5quv0,14266
7
- persidict/s3_dict.py,sha256=awEIu7ehh4FgKircC10lcHp86xvUctXA_6jxUmC3Hy8,12480
7
+ persidict/s3_dict.py,sha256=fzU3GKKNor6WRoIpP_-7d8ckO4eyhBOOo5kbpsmdxQA,15434
8
8
  persidict/safe_chars.py,sha256=HjK1MwROYy_U9ui-rhg1i3nGkj52K4OFWD-wCCcnJ7Y,536
9
- persidict/safe_str_tuple.py,sha256=cTk5BL3r-yE62EKf7VngTaUpZAdsATJPIjiCGqQzkyU,3717
9
+ persidict/safe_str_tuple.py,sha256=xyIzxlCKmvnNHkFFKWtcBREefxZ0-HLxoH_epYDt8qg,3719
10
10
  persidict/safe_str_tuple_signing.py,sha256=5uCjAVZRqOou-KpDZw-Exboc3-3vuayJMqrrt8aZ0ck,3742
11
- persidict/write_once_dict.py,sha256=NBzaw38zxWVbvCj8OR4T-7w6K41qNrr0gpyr23CRcNQ,6424
12
- persidict-0.32.8.dist-info/WHEEL,sha256=Jb20R3Ili4n9P1fcwuLup21eQ5r9WXhs4_qy7VTrgPI,79
13
- persidict-0.32.8.dist-info/METADATA,sha256=Bp5vJrqVsfXCNle3grOIArHx-l931DmhJ9NUCSs-uMY,9334
14
- persidict-0.32.8.dist-info/RECORD,,
11
+ persidict/write_once_dict.py,sha256=-XHQhTEdvPHTKqLXK4WWW0k5cFitkzalVJC1n4BbKGo,6496
12
+ persidict-0.34.1.dist-info/WHEEL,sha256=Jb20R3Ili4n9P1fcwuLup21eQ5r9WXhs4_qy7VTrgPI,79
13
+ persidict-0.34.1.dist-info/METADATA,sha256=a6o3joH0aBP1HEWHYxJgKn8KkktLA79TLenbkuttITM,9312
14
+ persidict-0.34.1.dist-info/RECORD,,