persidict 0.34.2__py3-none-any.whl → 0.34.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of persidict might be problematic. Click here for more details.

persidict/s3_dict.py CHANGED
@@ -48,27 +48,32 @@ class S3Dict(PersiDict):
48
48
  , digest_len:int = 8
49
49
  , base_class_for_values:Optional[type] = None
50
50
  ,*args ,**kwargs):
51
- """A constructor defines location of the store and object format to use.
52
-
53
- bucket_name and region define an S3 location of the storage
54
- that will contain all the objects in the S3_Dict.
55
- If the bucket does not exist, it will be created.
56
-
57
- root_prefix is a common S3 prefix for all objectnames in a dictionary.
58
-
59
- _base_dir is a local directory that will be used to store tmp files.
60
-
61
- base_class_for_values constraints the type of values that can be
62
- stored in the dictionary. If specified, it will be used to
63
- check types of values in the dictionary. If not specified,
64
- no type checking will be performed and all types will be allowed.
65
-
66
- file_type is an extension, which will be used for all files in the dictionary.
67
- If file_type has one of two values: "lz4" or "json", it defines
68
- which file format will be used by FileDirDict to store values.
69
- For all other values of file_type, the file format will always be plain
70
- text. "lz4" or "json" allow storing arbitrary Python objects,
71
- while all other file_type-s only work with str objects.
51
+ """Initialize an S3-backed persistent dictionary.
52
+
53
+ Args:
54
+ bucket_name (str): Name of the S3 bucket to use. The bucket will be
55
+ created if it does not already exist.
56
+ region (str | None): AWS region of the bucket. If None, the default
57
+ client region is used.
58
+ root_prefix (str): Common S3 key prefix under which all objects are
59
+ stored. A trailing slash is added if missing.
60
+ base_dir (str): Local directory used for temporary files and a
61
+ small on-disk cache.
62
+ file_type (str): Extension/format for stored values. "pkl" or
63
+ "json" store arbitrary Python objects; other values imply plain
64
+ text and only allow str values.
65
+ immutable_items (bool): If True, disallow changing existing items.
66
+ digest_len (int): Number of base32 MD5 characters appended to key
67
+ elements to avoid case-insensitive collisions. Use 0 to disable.
68
+ base_class_for_values (type | None): Optional base class that all
69
+ values must inherit from. If provided and not str, file_type
70
+ must be "pkl" or "json".
71
+ *args: Ignored; reserved for compatibility.
72
+ **kwargs: Ignored; reserved for compatibility.
73
+
74
+ Raises:
75
+ ValueError: If file_type is "__etag__" (reserved) or configuration
76
+ is inconsistent with base_class_for_values.
72
77
  """
73
78
 
74
79
  super().__init__(immutable_items = immutable_items, digest_len = 0)
@@ -107,6 +112,11 @@ class S3Dict(PersiDict):
107
112
 
108
113
  This method is needed to support Parameterizable API.
109
114
  The method is absent in the original dict API.
115
+
116
+ Returns:
117
+ dict: A mapping of parameter names to their configured values,
118
+ including region, bucket_name, and root_prefix combined with
119
+ parameters from the local cache.
110
120
  """
111
121
  params = self.local_cache.get_params()
112
122
  params["region"] = self.region
@@ -118,9 +128,12 @@ class S3Dict(PersiDict):
118
128
 
119
129
  @property
120
130
  def base_url(self):
121
- """Return dictionary's URl.
131
+ """Return the S3 URL prefix of this dictionary.
122
132
 
123
133
  This property is absent in the original dict API.
134
+
135
+ Returns:
136
+ str: The base S3 URL in the form "s3://<bucket>/<root_prefix>".
124
137
  """
125
138
  return f"s3://{self.bucket_name}/{self.root_prefix}"
126
139
 
@@ -130,12 +143,22 @@ class S3Dict(PersiDict):
130
143
  """Return dictionary's base directory in the local filesystem.
131
144
 
132
145
  This property is absent in the original dict API.
146
+
147
+ Returns:
148
+ str: Path to the local on-disk cache directory used by S3Dict.
133
149
  """
134
150
  return self.local_cache.base_dir
135
151
 
136
152
 
137
153
  def _build_full_objectname(self, key:PersiDictKey) -> str:
138
- """ Convert PersiDictKey into an S3 objectname. """
154
+ """Convert a key into a full S3 object key (object name).
155
+
156
+ Args:
157
+ key (PersiDictKey): Key (string or sequence of strings) or SafeStrTuple.
158
+
159
+ Returns:
160
+ str: The full S3 key under root_prefix with file_type suffix applied.
161
+ """
139
162
  key = SafeStrTuple(key)
140
163
  key = sign_safe_str_tuple(key, self.digest_len)
141
164
  objectname = self.root_prefix + "/".join(key)+ "." + self.file_type
@@ -143,7 +166,14 @@ class S3Dict(PersiDict):
143
166
 
144
167
 
145
168
  def __contains__(self, key:PersiDictKey) -> bool:
146
- """True if the dictionary has the specified key, else False. """
169
+ """Return True if the specified key exists in S3.
170
+
171
+ Args:
172
+ key (PersiDictKey): Key (string or sequence of strings) or SafeStrTuple.
173
+
174
+ Returns:
175
+ bool: True if the object exists (or is cached when immutable), else False.
176
+ """
147
177
  key = SafeStrTuple(key)
148
178
  if self.immutable_items:
149
179
  file_name = self.local_cache._build_full_path(
@@ -159,7 +189,12 @@ class S3Dict(PersiDict):
159
189
 
160
190
 
161
191
  def _write_etag_file(self, file_name: str, etag: str):
162
- """Atomically write the ETag to its cache file."""
192
+ """Atomically write the ETag to its cache file.
193
+
194
+ Args:
195
+ file_name (str): Path to the cached data file (without the ETag suffix).
196
+ etag (str): The S3 ETag value to persist alongside the cached file.
197
+ """
163
198
  if not etag:
164
199
  return
165
200
  etag_file_name = file_name + ".__etag__"
@@ -187,7 +222,18 @@ class S3Dict(PersiDict):
187
222
 
188
223
 
189
224
  def __getitem__(self, key:PersiDictKey) -> Any:
190
- """X.__getitem__(y) is an equivalent to X[y]. """
225
+ """Retrieve the value stored for a key from S3 or local cache.
226
+
227
+ If immutable_items is True and a local cached file exists, that cache is
228
+ returned. Otherwise, the object is fetched from S3, with conditional
229
+ requests used when possible.
230
+
231
+ Args:
232
+ key (PersiDictKey): Key (string or sequence of strings) or SafeStrTuple.
233
+
234
+ Returns:
235
+ Any: The stored value.
236
+ """
191
237
 
192
238
  key = SafeStrTuple(key)
193
239
  file_name = self.local_cache._build_full_path(key, create_subdirs=True)
@@ -257,7 +303,23 @@ class S3Dict(PersiDict):
257
303
 
258
304
 
259
305
  def __setitem__(self, key:PersiDictKey, value:Any):
260
- """Set self[key] to value. """
306
+ """Store a value for a key in S3 and update the local cache.
307
+
308
+ Interprets joker values KEEP_CURRENT and DELETE_CURRENT accordingly.
309
+ Validates a value type if base_class_for_values is set, then writes to the
310
+ local cache and uploads to S3. If possible, caches the S3 ETag locally to
311
+ enable conditional GETs later.
312
+
313
+ Args:
314
+ key (PersiDictKey): Key (string or sequence of strings) or SafeStrTuple.
315
+ value (Any): Value to store, or a joker command.
316
+
317
+ Raises:
318
+ KeyError: If attempting to modify an existing item when
319
+ immutable_items is True.
320
+ TypeError: If value is a PersiDict or does not match
321
+ base_class_for_values when it is set.
322
+ """
261
323
 
262
324
  if value is KEEP_CURRENT:
263
325
  return
@@ -302,7 +364,14 @@ class S3Dict(PersiDict):
302
364
 
303
365
 
304
366
  def __delitem__(self, key:PersiDictKey):
305
- """Delete self[key]. """
367
+ """Delete the stored value for a key from S3 and local cache.
368
+
369
+ Args:
370
+ key (PersiDictKey): Key (string or sequence of strings) or SafeStrTuple.
371
+
372
+ Raises:
373
+ KeyError: If immutable_items is True.
374
+ """
306
375
 
307
376
  key = SafeStrTuple(key)
308
377
  if self.immutable_items:
@@ -322,6 +391,9 @@ class S3Dict(PersiDict):
322
391
  WARNING: This operation can be very slow and costly on large S3 buckets
323
392
  as it needs to iterate over all objects in the dictionary's prefix.
324
393
  Avoid using it in performance-sensitive code.
394
+
395
+ Returns:
396
+ int: Number of stored items under this dictionary's root_prefix.
325
397
  """
326
398
 
327
399
  num_files = 0
@@ -344,7 +416,24 @@ class S3Dict(PersiDict):
344
416
 
345
417
 
346
418
  def _generic_iter(self, result_type: str):
347
- """Underlying implementation for .items()/.keys()/.values() iterators"""
419
+ """Underlying implementation for .items()/.keys()/.values() iterators.
420
+
421
+ Iterates over S3 objects under the configured root_prefix and yields
422
+ keys, values, and/or timestamps according to the requested result_type.
423
+ Keys are mapped to SafeStrTuple by removing the file extension and
424
+ unsigning based on digest_len.
425
+
426
+ Args:
427
+ result_type (set[str]): Any non-empty subset of {"keys", "values",
428
+ "timestamps"} specifying which fields to yield.
429
+
430
+ Returns:
431
+ Iterator: A generator yielding:
432
+ - SafeStrTuple if result_type == {"keys"}
433
+ - Any if result_type == {"values"}
434
+ - tuple[SafeStrTuple, Any] if result_type == {"keys", "values"}
435
+ - tuple[..., float] including POSIX timestamp if "timestamps" is requested.
436
+ """
348
437
 
349
438
  assert isinstance(result_type, set)
350
439
  assert 1 <= len(result_type) <= 3
@@ -356,11 +445,20 @@ class S3Dict(PersiDict):
356
445
  prefix_len = len(self.root_prefix)
357
446
 
358
447
  def splitter(full_name: str) -> SafeStrTuple:
448
+ """Convert an S3 object key into a SafeStrTuple without the suffix.
449
+
450
+ Args:
451
+ full_name (str): Full S3 object key (including root_prefix).
452
+
453
+ Returns:
454
+ SafeStrTuple: The parsed key parts, still signed.
455
+ """
359
456
  assert full_name.startswith(self.root_prefix)
360
457
  result = full_name[prefix_len:-ext_len].split(sep="/")
361
458
  return SafeStrTuple(result)
362
459
 
363
460
  def step():
461
+ """Generator that pages through S3 and yields entries based on result_type."""
364
462
  paginator = self.s3_client.get_paginator("list_objects_v2")
365
463
  page_iterator = paginator.paginate(
366
464
  Bucket=self.bucket_name, Prefix = self.root_prefix)
@@ -400,9 +498,16 @@ class S3Dict(PersiDict):
400
498
  def get_subdict(self, key:PersiDictKey) -> S3Dict:
401
499
  """Get a subdictionary containing items with the same prefix key.
402
500
 
403
- For non-existing prefix key, an empty sub-dictionary is returned.
404
-
501
+ For a non-existing prefix key, an empty sub-dictionary is returned.
405
502
  This method is absent in the original dict API.
503
+
504
+ Args:
505
+ key (PersiDictKey): A common prefix (string or sequence of strings)
506
+ used to scope items stored under this dictionary.
507
+
508
+ Returns:
509
+ S3Dict: A new S3Dict instance rooted at the given prefix, sharing
510
+ the same bucket, region, serialization, and immutability settings.
406
511
  """
407
512
 
408
513
  key = SafeStrTuple(key)
@@ -430,11 +535,18 @@ class S3Dict(PersiDict):
430
535
 
431
536
 
432
537
  def timestamp(self,key:PersiDictKey) -> float:
433
- """Get last modification time (in seconds, Unix epoch time).
538
+ """Get last modification time (Unix epoch seconds) for a key.
434
539
 
435
540
  This method is absent in the original dict API.
541
+
542
+ Args:
543
+ key (PersiDictKey): Key (string or sequence of strings) or SafeStrTuple.
544
+
545
+ Returns:
546
+ float: POSIX timestamp (seconds since the Unix epoch) of the last
547
+ modification time as reported by S3 for the object.
436
548
  """
437
- #TODO: check work with timezones
549
+ # TODO: check work with timezones
438
550
  key = SafeStrTuple(key)
439
551
  obj_name = self._build_full_objectname(key)
440
552
  response = self.s3_client.head_object(Bucket=self.bucket_name, Key=obj_name)
persidict/safe_chars.py CHANGED
@@ -4,11 +4,30 @@ SAFE_CHARS_SET = set(string.ascii_letters + string.digits + "()_-~.=")
4
4
  SAFE_STRING_MAX_LENGTH = 254
5
5
 
6
6
  def get_safe_chars() -> set[str]:
7
- """Return a set of allowed characters."""
7
+ """Get the set of allowed characters.
8
+
9
+ Returns:
10
+ set[str]: A copy of the set of characters considered safe for
11
+ building file names and URL components. Includes ASCII letters,
12
+ digits, and the characters ()_-~.= .
13
+ """
8
14
  return SAFE_CHARS_SET.copy()
9
15
 
10
- def replace_unsafe_chars(a_str:str, replace_with:str) -> str :
11
- """ Replace unsafe (special) characters with allowed (safe) ones."""
16
+ def replace_unsafe_chars(a_str: str, replace_with: str) -> str:
17
+ """Replace unsafe characters in a string.
18
+
19
+ Replaces any character not present in the safe-character set with a
20
+ replacement substring.
21
+
22
+ Args:
23
+ a_str (str): Input string that may contain unsafe characters.
24
+ replace_with (str): The substring to use for every unsafe character
25
+ encountered in a_str.
26
+
27
+ Returns:
28
+ str: The transformed string where all unsafe characters are replaced
29
+ by the provided replacement substring.
30
+ """
12
31
  safe_chars = get_safe_chars()
13
32
  result_list = [(c if c in safe_chars else replace_with) for c in a_str]
14
33
  result_str = "".join(result_list)
persidict/safe_str_tuple.py CHANGED
@@ -1,4 +1,9 @@
1
- """SafeStrTuple: an immutable flat tuple of non-emtpy URL/filename-safe strings.
1
+ """Utilities for strict, flat tuples of URL/filename-safe strings.
2
+
3
+ This module defines SafeStrTuple, an immutable, hashable, flat tuple of non-empty
4
+ strings restricted to a predefined safe character set and bounded length. It is
5
+ useful for constructing keys and paths that must be portable and safe for URLs
6
+ and filesystems.
2
7
  """
3
8
  from __future__ import annotations
4
9
  from collections.abc import Sequence, Mapping, Hashable
@@ -6,33 +11,69 @@ from typing import Any
6
11
  from .safe_chars import SAFE_CHARS_SET, SAFE_STRING_MAX_LENGTH
7
12
 
8
13
 
9
- def _is_sequence_not_mapping(obj:Any) -> bool:
10
- """Check if obj is a sequence (e.g. list) but not a mapping (e.g. dict)."""
14
+ def _is_sequence_not_mapping(obj: Any) -> bool:
15
+ """Return True if the object looks like a sequence but not a mapping.
16
+
17
+ This function prefers ABC checks but falls back to duck-typing to handle
18
+ some custom/typed collections.
19
+
20
+ Args:
21
+ obj: Object to inspect.
22
+
23
+ Returns:
24
+ bool: True if obj is a sequence (e.g., list, tuple) and not a mapping
25
+ (e.g., dict); otherwise False.
26
+ """
11
27
  if isinstance(obj, Sequence) and not isinstance(obj, Mapping):
12
28
  return True
13
29
  elif hasattr(obj, "keys") and callable(obj.keys):
14
30
  return False
15
- elif (hasattr(obj, "__getitem__") and callable(obj.__getitem__)
16
- and hasattr(obj, "__len__") and callable(obj.__len__)
17
- and hasattr(obj, "__iter__") and callable(obj.__iter__)):
31
+ elif (
32
+ hasattr(obj, "__getitem__")
33
+ and callable(obj.__getitem__)
34
+ and hasattr(obj, "__len__")
35
+ and callable(obj.__len__)
36
+ and hasattr(obj, "__iter__")
37
+ and callable(obj.__iter__)
38
+ ):
18
39
  return True
19
40
  else:
20
41
  return False
21
42
 
43
+
22
44
  class SafeStrTuple(Sequence, Hashable):
23
- """An immutable sequence of non-emtpy URL/filename-safe strings.
45
+ """An immutable sequence of non-empty URL/filename-safe strings.
46
+
47
+ The sequence is flat (no nested structures) and hashable, making it suitable
48
+ for use as a dictionary key. All strings are validated to contain only
49
+ characters from SAFE_CHARS_SET and to have length less than
50
+ SAFE_STRING_MAX_LENGTH.
24
51
  """
25
52
 
26
53
  strings: tuple[str, ...]
27
54
 
28
55
  def __init__(self, *args, **kwargs):
29
- """Create a SafeStrTuple from a sequence/tree of strings.
30
-
31
- The constructor accepts a sequence (list, tuple, etc.) of objects,
32
- each of which can be a string or a nested sequence of
33
- objects with similar structure. The input tree of strings is flattened.
34
- Each string must be non-empty and contain
35
- only URL/filename-safe characters.
56
+ """Initialize from strings or nested sequences of strings.
57
+
58
+ The constructor accepts one or more arguments which may be:
59
+ - a SafeStrTuple
60
+ - a single string
61
+ - a sequence (list/tuple/etc.) containing any of the above recursively
62
+
63
+ The input is flattened left-to-right into a single tuple of validated
64
+ strings. Empty strings and strings with characters outside
65
+ SAFE_CHARS_SET are rejected. Strings must also be shorter than
66
+ SAFE_STRING_MAX_LENGTH.
67
+
68
+ Args:
69
+ *args: One or more inputs (strings, sequences, or SafeStrTuple) that
70
+ will be flattened into a tuple of safe strings.
71
+ **kwargs: Not supported.
72
+
73
+ Raises:
74
+ AssertionError: If kwargs are provided; if no args are provided; if
75
+ any string is empty, too long, or contains disallowed chars; or
76
+ if an argument has an invalid type.
36
77
  """
37
78
  assert len(kwargs) == 0
38
79
  assert len(args) > 0
@@ -54,28 +95,60 @@ class SafeStrTuple(Sequence, Hashable):
54
95
 
55
96
  @property
56
97
  def str_chain(self) -> tuple[str, ...]:
57
- """for backward compatibility"""
98
+ """Alias for strings for backward compatibility.
99
+
100
+ Returns:
101
+ tuple[str, ...]: The underlying tuple of strings.
102
+ """
58
103
  return self.strings
59
104
 
60
- def __getitem__(self, key:int)-> str:
61
- """Return a string at position key."""
105
+ def __getitem__(self, key: int) -> str:
106
+ """Return the string at the given index.
107
+
108
+ Args:
109
+ key: Zero-based index.
110
+
111
+ Returns:
112
+ str: The string at the specified position.
113
+ """
62
114
  return self.strings[key]
63
115
 
64
116
  def __len__(self) -> int:
65
- """Return the number of strings in the tuple."""
117
+ """Return the number of strings in the tuple.
118
+
119
+ Returns:
120
+ int: The number of elements.
121
+ """
66
122
  return len(self.strings)
67
123
 
68
124
  def __hash__(self):
69
- """Return a hash of the tuple."""
125
+ """Compute the hash of the underlying tuple.
126
+
127
+ Returns:
128
+ int: A hash value suitable for dict/set usage.
129
+ """
70
130
  return hash(self.strings)
71
131
 
72
132
  def __repr__(self) -> str:
73
- """Return repr(self)."""
74
- return f"{type(self).__name__}({self.strings})"
133
+ """Return a developer-friendly representation.
75
134
 
135
+ Returns:
136
+ str: A representation including the class name and contents.
137
+ """
138
+ return f"{type(self).__name__}({self.strings})"
76
139
 
77
140
  def __eq__(self, other) -> bool:
78
- """Return self == other."""
141
+ """Compare two SafeStrTuple-compatible objects for equality.
142
+
143
+ If other is not a SafeStrTuple, it will be coerced using the same
144
+ validation rules.
145
+
146
+ Args:
147
+ other: Another SafeStrTuple or compatible input.
148
+
149
+ Returns:
150
+ bool: True if both contain the same sequence of strings.
151
+ """
79
152
  if isinstance(other, SafeStrTuple):
80
153
  if type(self).__eq__ != type(other).__eq__:
81
154
  return other.__eq__(self)
@@ -84,25 +157,53 @@ class SafeStrTuple(Sequence, Hashable):
84
157
 
85
158
  return self.strings == other.strings
86
159
 
87
-
88
160
  def __add__(self, other) -> SafeStrTuple:
89
- """Return self + other."""
161
+ """Concatenate with another SafeStrTuple-compatible object.
162
+
163
+ Args:
164
+ other: Another SafeStrTuple or compatible input.
165
+
166
+ Returns:
167
+ SafeStrTuple: A new instance containing elements of self then other.
168
+ """
90
169
  other = SafeStrTuple(other)
91
170
  return SafeStrTuple(*(self.strings + other.strings))
92
171
 
93
172
  def __radd__(self, other) -> SafeStrTuple:
94
- """Return other + self."""
173
+ """Concatenate with another object in reversed order (other + self).
174
+
175
+ Args:
176
+ other: Another SafeStrTuple or compatible input.
177
+
178
+ Returns:
179
+ SafeStrTuple: A new instance containing elements of other then self.
180
+ """
95
181
  other = SafeStrTuple(other)
96
182
  return SafeStrTuple(*(other.strings + self.strings))
97
183
 
98
184
  def __iter__(self):
99
- """Return iter(self)."""
185
+ """Return an iterator over the strings.
186
+
187
+ Returns:
188
+ Iterator[str]: An iterator over the internal tuple.
189
+ """
100
190
  return iter(self.strings)
101
191
 
102
192
  def __contains__(self, item) -> bool:
103
- """Return item in self."""
193
+ """Check membership.
194
+
195
+ Args:
196
+ item: String to check for presence.
197
+
198
+ Returns:
199
+ bool: True if item is present.
200
+ """
104
201
  return item in self.strings
105
202
 
106
203
  def __reversed__(self) -> SafeStrTuple:
107
- """Return a reversed SafeStrTuple."""
204
+ """Return a reversed SafeStrTuple.
205
+
206
+ Returns:
207
+ SafeStrTuple: A new instance with elements in reverse order.
208
+ """
108
209
  return SafeStrTuple(*reversed(self.strings))