persidict 0.31.0__tar.gz → 0.32.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of persidict might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: persidict
3
- Version: 0.31.0
3
+ Version: 0.32.0
4
4
  Summary: Simple persistent key-value store for Python. Values are stored as files on a disk or as S3 objects on AWS cloud.
5
5
  Keywords: persistence,dicts,distributed,parallel
6
6
  Author: Vlad (Volodymyr) Pavlov
@@ -20,7 +20,8 @@ Requires-Dist: joblib
20
20
  Requires-Dist: numpy
21
21
  Requires-Dist: pandas
22
22
  Requires-Dist: jsonpickle
23
- Requires-Dist: parameterizable
23
+ Requires-Dist: joblib
24
+ Requires-Dist: deepdiff
24
25
  Requires-Dist: boto3 ; extra == 'aws'
25
26
  Requires-Dist: boto3 ; extra == 'dev'
26
27
  Requires-Dist: moto ; extra == 'dev'
@@ -37,7 +38,7 @@ Simple persistent dictionaries for Python.
37
38
 
38
39
  ## What Is It?
39
40
 
40
- `persidict` offers a simple persistent key-value store for Python.
41
+ `persidict` offers a very simple persistent key-value store for Python.
41
42
  It saves the content of the dictionary in a folder on a disk
42
43
  or in an S3 bucket on AWS. Each value is stored as a separate file / S3 object.
43
44
  Only text strings or sequences of strings are allowed as keys.
@@ -4,7 +4,7 @@ Simple persistent dictionaries for Python.
4
4
 
5
5
  ## What Is It?
6
6
 
7
- `persidict` offers a simple persistent key-value store for Python.
7
+ `persidict` offers a very simple persistent key-value store for Python.
8
8
  It saves the content of the dictionary in a folder on a disk
9
9
  or in an S3 bucket on AWS. Each value is stored as a separate file / S3 object.
10
10
  Only text strings or sequences of strings are allowed as keys.
@@ -4,7 +4,7 @@ build-backend = "uv_build"
4
4
 
5
5
  [project]
6
6
  name = "persidict"
7
- version = "0.31.0"
7
+ version = "0.32.0"
8
8
  description = "Simple persistent key-value store for Python. Values are stored as files on a disk or as S3 objects on AWS cloud."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -30,7 +30,8 @@ dependencies = [
30
30
  "numpy",
31
31
  "pandas",
32
32
  "jsonpickle",
33
- "parameterizable"
33
+ "joblib",
34
+ "deepdiff"
34
35
  ]
35
36
 
36
37
  [project.urls]
@@ -45,4 +46,4 @@ dev = [
45
46
 
46
47
  aws = [
47
48
  "boto3"
48
- ]
49
+ ]
@@ -19,6 +19,7 @@ import jsonpickle
19
19
  import jsonpickle.ext.numpy as jsonpickle_numpy
20
20
  import jsonpickle.ext.pandas as jsonpickle_pandas
21
21
  import parameterizable
22
+ from parameterizable import sort_dict_by_keys
22
23
 
23
24
  from .jokers import KEEP_CURRENT, DELETE_CURRENT, Joker
24
25
  from .safe_chars import replace_unsafe_chars
@@ -124,7 +125,7 @@ class FileDirDict(PersiDict):
124
125
  base_dir=self.base_dir
125
126
  , file_type=self.file_type)
126
127
  params.update(additional_params)
127
- sorted_params = dict(sorted(params.items()))
128
+ sorted_params = sort_dict_by_keys(params)
128
129
  return sorted_params
129
130
 
130
131
 
@@ -151,11 +152,20 @@ class FileDirDict(PersiDict):
151
152
 
152
153
  num_files = 0
153
154
  suffix = "." + self.file_type
154
- for subdir_info in os.walk(self._base_dir):
155
- files = subdir_info[2]
156
- files = [f_name for f_name in files
157
- if f_name.endswith(suffix)]
158
- num_files += len(files)
155
+ stack = [self._base_dir]
156
+
157
+ while stack:
158
+ path = stack.pop()
159
+ try:
160
+ with os.scandir(path) as it:
161
+ for entry in it:
162
+ if entry.is_dir(follow_symlinks=False):
163
+ stack.append(entry.path)
164
+ elif entry.is_file(follow_symlinks=False) and entry.name.endswith(suffix):
165
+ num_files += 1
166
+ except PermissionError:
167
+ continue
168
+
159
169
  return num_files
160
170
 
161
171
 
@@ -202,7 +212,39 @@ class FileDirDict(PersiDict):
202
212
  file_name = key[-1] + "." + self.file_type
203
213
  return os.path.join(*dir_names, file_name)
204
214
  else:
205
- return os.path.join(*dir_names)
215
+ return str(os.path.join(*dir_names))
216
+
217
+
218
+ def _build_key_from_full_path(self, full_path:str)->SafeStrTuple:
219
+ """Convert a filesystem path back into a key."""
220
+
221
+ # Ensure we're working with absolute paths
222
+ full_path = os.path.abspath(full_path)
223
+
224
+ # Remove the base directory from the path
225
+ if not full_path.startswith(self._base_dir):
226
+ raise ValueError(f"Path {full_path} is not within base directory {self._base_dir}")
227
+
228
+ # Get the relative path
229
+ rel_path = full_path[len(self._base_dir):].lstrip(os.sep)
230
+
231
+ if not rel_path:
232
+ return SafeStrTuple()
233
+
234
+ # Split the path into components
235
+ path_components = rel_path.split(os.sep)
236
+
237
+ # If it's a file path, remove the file extension from the last component
238
+ if os.path.isfile(full_path) and path_components[-1].endswith("." + self.file_type):
239
+ path_components[-1] = path_components[-1][:-len("." + self.file_type)]
240
+
241
+ # Create a SafeStrTuple from the path components
242
+ key = SafeStrTuple(*path_components)
243
+
244
+ # Unsign the key
245
+ key = unsign_safe_str_tuple(key, self.digest_len)
246
+
247
+ return key
206
248
 
207
249
 
208
250
  def get_subdict(self, key:PersiDictKey) -> FileDirDict:
@@ -403,4 +445,46 @@ class FileDirDict(PersiDict):
403
445
  return os.path.getmtime(filename)
404
446
 
405
447
 
406
- parameterizable.register_parameterizable_class(FileDirDict)
448
+ def random_key(self) -> PersiDictKey | None:
449
+ # canonicalise extension once
450
+ early_exit_cap = 10_000
451
+ ext = None
452
+ if self.file_type:
453
+ ext = self.file_type.lower()
454
+ if not ext.startswith("."):
455
+ ext = "." + ext
456
+
457
+ stack = [self._base_dir]
458
+ winner: Optional[str] = None
459
+ seen = 0
460
+
461
+ while stack:
462
+ path = stack.pop()
463
+ try:
464
+ with os.scandir(path) as it:
465
+ for ent in it:
466
+ if ent.is_dir(follow_symlinks=False):
467
+ stack.append(ent.path)
468
+ continue
469
+
470
+ # cheap name test before stat()
471
+ if ext and not ent.name.lower().endswith(ext):
472
+ continue
473
+
474
+ if ent.is_file(follow_symlinks=False):
475
+ seen += 1
476
+ if random.random() < 1 / seen: # reservoir k=1
477
+ winner = ent.path
478
+ # early‑exit when cap reached
479
+ if early_exit_cap and seen >= early_exit_cap:
480
+ return self._build_key_from_full_path(os.path.abspath(winner))
481
+ except PermissionError:
482
+ continue
483
+
484
+ if winner is None:
485
+ return None
486
+ else:
487
+ return self._build_key_from_full_path(os.path.abspath(winner))
488
+
489
+
490
+ parameterizable.register_parameterizable_class(FileDirDict)
@@ -22,7 +22,7 @@ from __future__ import annotations
22
22
 
23
23
  from abc import abstractmethod
24
24
  import random
25
- from parameterizable import ParameterizableClass
25
+ from parameterizable import ParameterizableClass, sort_dict_by_keys
26
26
  from typing import Any, Sequence, Optional
27
27
  from collections.abc import MutableMapping
28
28
 
@@ -107,7 +107,7 @@ class PersiDict(MutableMapping, ParameterizableClass):
107
107
  , digest_len=self.digest_len
108
108
  , base_class_for_values=self.base_class_for_values
109
109
  )
110
- sorted_params = dict(sorted(params.items()))
110
+ sorted_params = sort_dict_by_keys(params)
111
111
  return sorted_params
112
112
 
113
113
 
@@ -302,15 +302,33 @@ class PersiDict(MutableMapping, ParameterizableClass):
302
302
  return result_subdicts
303
303
 
304
304
 
305
- def random_keys(self, max_n:int):
306
- """Return a list of random keys from the dictionary.
305
+ def random_key(self) -> PersiDictKey | None:
306
+ """Return a random key from the dictionary.
307
+
308
+ Returns a single random key if the dictionary is not empty.
309
+ Returns None if the dictionary is empty.
307
310
 
308
311
  This method is absent in the original Python dict API.
312
+
313
+ Implementation uses reservoir sampling to select a uniformly random key
314
+ in streaming time, without loading all keys into memory or using len().
309
315
  """
310
- all_keys = list(self.keys())
311
- if max_n > len(all_keys):
312
- max_n = len(all_keys)
313
- result = random.sample(all_keys, max_n)
316
+ iterator = iter(self.keys())
317
+ try:
318
+ # Get the first key
319
+ result = next(iterator)
320
+ except StopIteration:
321
+ # Dictionary is empty
322
+ return None
323
+
324
+ # Reservoir sampling algorithm
325
+ i = 2
326
+ for key in iterator:
327
+ # Select current key with probability 1/i
328
+ if random.random() < 1/i:
329
+ result = key
330
+ i += 1
331
+
314
332
  return result
315
333
 
316
334
 
@@ -372,4 +390,4 @@ class PersiDict(MutableMapping, ParameterizableClass):
372
390
 
373
391
  This method is absent in the original Python dict API.
374
392
  """
375
- return [self[k] for k in self.newest_keys(max_n)]
393
+ return [self[k] for k in self.newest_keys(max_n)]
@@ -5,6 +5,7 @@ from typing import Any, Optional
5
5
 
6
6
  import boto3
7
7
  import parameterizable
8
+ from parameterizable.dict_sorter import sort_dict_by_keys
8
9
 
9
10
  from .safe_str_tuple import SafeStrTuple
10
11
  from .safe_str_tuple_signing import sign_safe_str_tuple, unsign_safe_str_tuple
@@ -119,7 +120,7 @@ class S3Dict(PersiDict):
119
120
  params["region"] = self.region
120
121
  params["bucket_name"] = self.bucket_name
121
122
  params["root_prefix"] = self.root_prefix
122
- sorted_params = dict(sorted(params.items()))
123
+ sorted_params = sort_dict_by_keys(params)
123
124
  return sorted_params
124
125
 
125
126
 
@@ -3,7 +3,8 @@ from __future__ import annotations
3
3
  import time
4
4
 
5
5
  from deepdiff import DeepDiff
6
- from parameterizable import register_parameterizable_class
6
+ from parameterizable import register_parameterizable_class, sort_dict_by_keys
7
+
7
8
  from .jokers import KEEP_CURRENT, KeepCurrentFlag
8
9
  from .persi_dict import PersiDict
9
10
  from .file_dir_dict import FileDirDict
@@ -103,7 +104,7 @@ class WriteOnceDict(PersiDict):
103
104
  params = dict(
104
105
  wrapped_dict = self._wrapped_dict,
105
106
  p_consistency_checks = self.p_consistency_checks)
106
- sorted_params = dict(sorted(params.items()))
107
+ sorted_params = sort_dict_by_keys(params)
107
108
  return sorted_params
108
109
 
109
110
  def __setitem__(self, key, value):
Binary file