persidict 0.105.0__py3-none-any.whl → 0.105.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of persidict might be problematic. Click here for more details.
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: persidict
|
|
3
|
-
Version: 0.105.0
|
|
3
|
+
Version: 0.105.1
|
|
4
4
|
Summary: Simple persistent key-value store for Python. Values are stored as files on a disk or as S3 objects on AWS cloud.
|
|
5
5
|
Keywords: persistence,dicts,distributed,parallel
|
|
6
6
|
Author: Vlad (Volodymyr) Pavlov
|
|
@@ -9,12 +9,11 @@ persidict/local_dict.py,sha256=HOJU_InRLLag7tG6j2D3Pa2MoyRy23SjJyYOKyD6b2A,21683
|
|
|
9
9
|
persidict/overlapping_multi_dict.py,sha256=vh8SeZjhCN1pIZTcU0i4N6GlD9eZcSsdeZYEenk0xjc,6490
|
|
10
10
|
persidict/persi_dict.py,sha256=vUxu0mlveVLoLhppiPdZAJ_6Qfijw8QaTQsmXHKQSQc,29138
|
|
11
11
|
persidict/s3_dict_file_dir_cached.py,sha256=IqTSD3Hw4I98B5pLTA5V8GcmmWDuCa7eKeMaz9uvlc8,8630
|
|
12
|
-
persidict/s3_dict_legacy.py,sha256=G1gXRHQjnv5K8QOUnxtqQaDRWte2orKyBcjxpfXuEF8,22603
|
|
13
12
|
persidict/safe_chars.py,sha256=gKYXA4RDuOVy_vhGXn8y0BFNHyuOvLsFyaJQXL013Go,2129
|
|
14
13
|
persidict/safe_str_tuple.py,sha256=BSRMNgLfZoAwS7ZkiVA3fV2sfj2i_NO1tjstVP-XGOU,8047
|
|
15
14
|
persidict/safe_str_tuple_signing.py,sha256=mpOfx_xyprc0_c60XPB_EihI3vR1gOn6T03iCx1HwwQ,7494
|
|
16
15
|
persidict/singletons.py,sha256=3X60V9S73-0oHoCRapbBT9eQ8HuOf_hZHAgZW_vB5oo,7236
|
|
17
16
|
persidict/write_once_dict.py,sha256=aoYc_tAWCLb2i2jjXgkQPxYb6t7H1KdUdc_dwv56uXE,12667
|
|
18
|
-
persidict-0.105.
|
|
19
|
-
persidict-0.105.
|
|
20
|
-
persidict-0.105.0.dist-info/RECORD,,
|
|
17
|
+
persidict-0.105.1.dist-info/WHEEL,sha256=M6du7VZflc4UPsGphmOXHANdgk8zessdJG0DBUuoA-U,78
|
|
18
|
+
persidict-0.105.1.dist-info/METADATA,sha256=_7QMhfVhS_hgIk3OfsJtcc1JAbSXLogQev5erXcGsxw,13715
|
|
19
|
+
persidict-0.105.1.dist-info/RECORD,,
|
persidict/s3_dict_legacy.py
DELETED
|
@@ -1,565 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any, Optional
|
|
4
|
-
|
|
5
|
-
import boto3
|
|
6
|
-
import joblib
|
|
7
|
-
import jsonpickle
|
|
8
|
-
from botocore.exceptions import ClientError
|
|
9
|
-
|
|
10
|
-
import parameterizable
|
|
11
|
-
from parameterizable.dict_sorter import sort_dict_by_keys
|
|
12
|
-
|
|
13
|
-
from .safe_str_tuple import SafeStrTuple, NonEmptySafeStrTuple
|
|
14
|
-
from .safe_str_tuple_signing import sign_safe_str_tuple, unsign_safe_str_tuple
|
|
15
|
-
from .persi_dict import PersiDict, NonEmptyPersiDictKey
|
|
16
|
-
from .singletons import Joker, EXECUTION_IS_COMPLETE, ETagHasNotChangedFlag
|
|
17
|
-
from .file_dir_dict import FileDirDict, PersiDictKey
|
|
18
|
-
from .overlapping_multi_dict import OverlappingMultiDict
|
|
19
|
-
|
|
20
|
-
S3DICT_DEFAULT_BASE_DIR = "__s3_dict__"
|
|
21
|
-
|
|
22
|
-
class S3Dict_Legacy(PersiDict):
|
|
23
|
-
"""A persistent dictionary that stores key-value pairs as S3 objects with local caching.
|
|
24
|
-
|
|
25
|
-
Each key-value pair is stored as a separate S3 object in the specified bucket.
|
|
26
|
-
S3Dict_Legacy provides intelligent local caching to minimize S3 API calls and improve
|
|
27
|
-
performance by using conditional requests with ETags to detect changes.
|
|
28
|
-
|
|
29
|
-
A key can be either a string (object name without file extension) or a sequence
|
|
30
|
-
of strings representing a hierarchical path (folder structure ending with an
|
|
31
|
-
object name). Values can be instances of any Python type and are serialized
|
|
32
|
-
to S3 objects.
|
|
33
|
-
|
|
34
|
-
S3Dict_Legacy supports multiple serialization formats:
|
|
35
|
-
- Binary storage using pickle ('pkl' format)
|
|
36
|
-
- Human-readable text using jsonpickle ('json' format)
|
|
37
|
-
- Plain text for string values (other formats)
|
|
38
|
-
|
|
39
|
-
Key Features:
|
|
40
|
-
- Local file-based caching for improved read performance
|
|
41
|
-
- ETag-based conditional requests to minimize unnecessary downloads
|
|
42
|
-
- Automatic cache invalidation when S3 objects change
|
|
43
|
-
- Seamless fallback to S3 when cached data is stale
|
|
44
|
-
|
|
45
|
-
Note:
|
|
46
|
-
Unlike native Python dictionaries, insertion order is not preserved.
|
|
47
|
-
Operations may incur S3 API costs and network latency, though caching
|
|
48
|
-
significantly reduces this overhead for repeated access patterns.
|
|
49
|
-
"""
|
|
50
|
-
region: str
|
|
51
|
-
bucket_name: str
|
|
52
|
-
root_prefix: str
|
|
53
|
-
_base_dir: str
|
|
54
|
-
|
|
55
|
-
def __init__(self, bucket_name: str = "my_bucket",
|
|
56
|
-
region: str = None,
|
|
57
|
-
root_prefix: str = "",
|
|
58
|
-
base_dir: str = S3DICT_DEFAULT_BASE_DIR,
|
|
59
|
-
serialization_format: str = "pkl",
|
|
60
|
-
digest_len: int = 8,
|
|
61
|
-
append_only: bool = False,
|
|
62
|
-
base_class_for_values: Optional[type] = None,
|
|
63
|
-
*args, **kwargs):
|
|
64
|
-
"""Initialize an S3-backed persistent dictionary.
|
|
65
|
-
|
|
66
|
-
Args:
|
|
67
|
-
bucket_name: Name of the S3 bucket to use. The bucket will be
|
|
68
|
-
created automatically if it does not exist and permissions allow.
|
|
69
|
-
region: AWS region for the bucket. If None, uses the default
|
|
70
|
-
client region from AWS configuration.
|
|
71
|
-
root_prefix: Common S3 key prefix under which all objects are
|
|
72
|
-
stored. A trailing slash is automatically added if missing.
|
|
73
|
-
base_dir: Local directory path used for temporary files and
|
|
74
|
-
local caching of S3 objects.
|
|
75
|
-
serialization_format: File extension/format for stored values. Supported formats:
|
|
76
|
-
'pkl' (pickle), 'json' (jsonpickle), or custom text formats.
|
|
77
|
-
append_only: If True, prevents modification of existing items
|
|
78
|
-
after they are initially stored.
|
|
79
|
-
digest_len: Number of base32 MD5 hash characters appended to key
|
|
80
|
-
elements to prevent case-insensitive filename collisions.
|
|
81
|
-
Set to 0 to disable collision prevention.
|
|
82
|
-
base_class_for_values: Optional base class that all stored values
|
|
83
|
-
must inherit from. When specified (and not str), serialization_format
|
|
84
|
-
must be 'pkl' or 'json' for proper serialization.
|
|
85
|
-
*args: Additional positional arguments (ignored, reserved for compatibility).
|
|
86
|
-
**kwargs: Additional keyword arguments (ignored, reserved for compatibility).
|
|
87
|
-
|
|
88
|
-
Note:
|
|
89
|
-
The S3 bucket will be created if it doesn't exist and AWS permissions
|
|
90
|
-
allow. Network connectivity and valid AWS credentials are required.
|
|
91
|
-
"""
|
|
92
|
-
|
|
93
|
-
super().__init__(append_only=append_only,
|
|
94
|
-
base_class_for_values=base_class_for_values,
|
|
95
|
-
serialization_format=serialization_format)
|
|
96
|
-
individual_subdicts_params = {self.serialization_format: {}}
|
|
97
|
-
|
|
98
|
-
if not append_only:
|
|
99
|
-
self.etag_serialization_format = f"{self.serialization_format}_etag"
|
|
100
|
-
individual_subdicts_params[self.etag_serialization_format] = {
|
|
101
|
-
"base_class_for_values": str}
|
|
102
|
-
|
|
103
|
-
self.local_cache = OverlappingMultiDict(
|
|
104
|
-
dict_type=FileDirDict,
|
|
105
|
-
shared_subdicts_params={
|
|
106
|
-
"base_dir": base_dir,
|
|
107
|
-
"append_only": append_only,
|
|
108
|
-
"base_class_for_values": base_class_for_values,
|
|
109
|
-
"digest_len": digest_len
|
|
110
|
-
},
|
|
111
|
-
**individual_subdicts_params)
|
|
112
|
-
|
|
113
|
-
self.main_cache = getattr(self.local_cache, self.serialization_format)
|
|
114
|
-
if not self.append_only:
|
|
115
|
-
self.etag_cache = getattr(self.local_cache, self.etag_serialization_format)
|
|
116
|
-
|
|
117
|
-
self.region = region
|
|
118
|
-
if region is None:
|
|
119
|
-
self.s3_client = boto3.client('s3')
|
|
120
|
-
else:
|
|
121
|
-
self.s3_client = boto3.client('s3', region_name=region)
|
|
122
|
-
|
|
123
|
-
try:
|
|
124
|
-
self.s3_client.head_bucket(Bucket=bucket_name)
|
|
125
|
-
except ClientError as e:
|
|
126
|
-
error_code = e.response['Error']['Code']
|
|
127
|
-
if error_code == '404' or error_code == 'NotFound':
|
|
128
|
-
# Bucket does not exist, attempt to create it
|
|
129
|
-
try:
|
|
130
|
-
self.s3_client.create_bucket(Bucket=bucket_name)
|
|
131
|
-
except ClientError as create_e:
|
|
132
|
-
create_error_code = create_e.response['Error']['Code']
|
|
133
|
-
# Handle race condition where bucket was created by another process
|
|
134
|
-
# or the bucket name is already taken by another AWS account
|
|
135
|
-
if ( create_error_code == 'BucketAlreadyOwnedByYou'
|
|
136
|
-
or create_error_code == 'BucketAlreadyExists'):
|
|
137
|
-
pass
|
|
138
|
-
else:
|
|
139
|
-
raise create_e # Re-raise other unexpected creation errors
|
|
140
|
-
elif error_code == '403' or error_code == 'Forbidden':
|
|
141
|
-
# Bucket exists but access is forbidden - likely a cross-account
|
|
142
|
-
# bucket with policy granting limited access. Operations may still
|
|
143
|
-
# work if the policy allows the required S3 permissions.
|
|
144
|
-
pass
|
|
145
|
-
else:
|
|
146
|
-
raise e # Re-raise other unexpected head_bucket errors
|
|
147
|
-
|
|
148
|
-
self.bucket_name = bucket_name
|
|
149
|
-
|
|
150
|
-
self.root_prefix = root_prefix
|
|
151
|
-
if len(self.root_prefix) and self.root_prefix[-1] != "/":
|
|
152
|
-
self.root_prefix += "/"
|
|
153
|
-
|
|
154
|
-
@property
|
|
155
|
-
def digest_len(self) -> int:
|
|
156
|
-
return self.main_cache.digest_len
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def get_params(self):
|
|
160
|
-
"""Return configuration parameters as a dictionary.
|
|
161
|
-
|
|
162
|
-
This method supports the Parameterizable API and is not part of
|
|
163
|
-
the standard Python dictionary interface.
|
|
164
|
-
|
|
165
|
-
Returns:
|
|
166
|
-
dict: A mapping of parameter names to their configured values,
|
|
167
|
-
including S3-specific parameters (region, bucket_name, root_prefix)
|
|
168
|
-
combined with parameters from the local cache, sorted by key names.
|
|
169
|
-
"""
|
|
170
|
-
params = self.main_cache.get_params()
|
|
171
|
-
params["region"] = self.region
|
|
172
|
-
params["bucket_name"] = self.bucket_name
|
|
173
|
-
params["root_prefix"] = self.root_prefix
|
|
174
|
-
sorted_params = sort_dict_by_keys(params)
|
|
175
|
-
return sorted_params
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
@property
|
|
179
|
-
def base_url(self):
|
|
180
|
-
"""Return the S3 URL prefix of this dictionary.
|
|
181
|
-
|
|
182
|
-
This property is not part of the standard Python dictionary interface.
|
|
183
|
-
|
|
184
|
-
Returns:
|
|
185
|
-
str: The base S3 URL in the format "s3://<bucket>/<root_prefix>".
|
|
186
|
-
"""
|
|
187
|
-
return f"s3://{self.bucket_name}/{self.root_prefix}"
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
@property
|
|
191
|
-
def base_dir(self) -> str:
|
|
192
|
-
"""Return the dictionary's base directory in the local filesystem.
|
|
193
|
-
|
|
194
|
-
This property is not part of the standard Python dictionary interface.
|
|
195
|
-
|
|
196
|
-
Returns:
|
|
197
|
-
str: Path to the local cache directory used for temporary files
|
|
198
|
-
and caching S3 objects.
|
|
199
|
-
"""
|
|
200
|
-
return self.main_cache.base_dir
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
def _build_full_objectname(self, key: NonEmptyPersiDictKey) -> str:
|
|
204
|
-
"""Convert a key into a full S3 object key.
|
|
205
|
-
|
|
206
|
-
Args:
|
|
207
|
-
key: Dictionary key (string or sequence of strings) or SafeStrTuple.
|
|
208
|
-
|
|
209
|
-
Returns:
|
|
210
|
-
str: The complete S3 object key including root_prefix and serialization_format
|
|
211
|
-
extension, with digest-based collision prevention applied if enabled.
|
|
212
|
-
"""
|
|
213
|
-
key = NonEmptySafeStrTuple(key)
|
|
214
|
-
key = sign_safe_str_tuple(key, 0)
|
|
215
|
-
objectname = self.root_prefix + "/".join(key) + "." + self.serialization_format
|
|
216
|
-
return objectname
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
def __contains__(self, key: NonEmptyPersiDictKey) -> bool:
|
|
220
|
-
"""Check if the specified key exists in the dictionary.
|
|
221
|
-
|
|
222
|
-
For immutable dictionaries, checks the local cache first. Otherwise,
|
|
223
|
-
performs a HEAD request to S3 to verify object existence.
|
|
224
|
-
|
|
225
|
-
Args:
|
|
226
|
-
key: Dictionary key (string or sequence of strings) or SafeStrTuple.
|
|
227
|
-
|
|
228
|
-
Returns:
|
|
229
|
-
bool: True if the key exists in S3 (or local cache for immutable
|
|
230
|
-
items), False otherwise.
|
|
231
|
-
"""
|
|
232
|
-
key = NonEmptySafeStrTuple(key)
|
|
233
|
-
if self.append_only and key in self.main_cache:
|
|
234
|
-
return True
|
|
235
|
-
try:
|
|
236
|
-
obj_name = self._build_full_objectname(key)
|
|
237
|
-
self.s3_client.head_object(Bucket=self.bucket_name, Key=obj_name)
|
|
238
|
-
return True
|
|
239
|
-
except ClientError as e:
|
|
240
|
-
if e.response['ResponseMetadata']['HTTPStatusCode'] == 404:
|
|
241
|
-
self.main_cache.discard(key)
|
|
242
|
-
self.etag_cache.discard(key)
|
|
243
|
-
return False
|
|
244
|
-
else:
|
|
245
|
-
raise
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
def __getitem__(self, key: NonEmptyPersiDictKey) -> Any:
|
|
249
|
-
"""Retrieve the value stored for a key.
|
|
250
|
-
|
|
251
|
-
For immutable dictionaries with cached values, returns the cached copy.
|
|
252
|
-
Otherwise, fetches from S3 using conditional requests (ETags) when
|
|
253
|
-
available to minimize unnecessary downloads.
|
|
254
|
-
|
|
255
|
-
Args:
|
|
256
|
-
key: Dictionary key (string or sequence of strings) or SafeStrTuple.
|
|
257
|
-
|
|
258
|
-
Returns:
|
|
259
|
-
Any: The deserialized value stored for the key.
|
|
260
|
-
|
|
261
|
-
Raises:
|
|
262
|
-
KeyError: If the key does not exist in S3.
|
|
263
|
-
"""
|
|
264
|
-
|
|
265
|
-
key = NonEmptySafeStrTuple(key)
|
|
266
|
-
|
|
267
|
-
if self.append_only and key in self.main_cache:
|
|
268
|
-
return self.main_cache[key]
|
|
269
|
-
|
|
270
|
-
obj_name = self._build_full_objectname(key)
|
|
271
|
-
|
|
272
|
-
cached_etag = None
|
|
273
|
-
if not self.append_only and key in self.main_cache and key in self.etag_cache:
|
|
274
|
-
cached_etag = self.etag_cache[key]
|
|
275
|
-
|
|
276
|
-
try:
|
|
277
|
-
get_kwargs = {'Bucket': self.bucket_name, 'Key': obj_name}
|
|
278
|
-
if cached_etag:
|
|
279
|
-
get_kwargs['IfNoneMatch'] = cached_etag
|
|
280
|
-
|
|
281
|
-
response = self.s3_client.get_object(**get_kwargs)
|
|
282
|
-
|
|
283
|
-
# 200 OK: object was downloaded, either because it's new or changed.
|
|
284
|
-
body = response['Body']
|
|
285
|
-
|
|
286
|
-
# Deserialize and cache the S3 object content
|
|
287
|
-
if self.serialization_format == 'json':
|
|
288
|
-
deserialized_value = jsonpickle.loads(body.read().decode('utf-8'))
|
|
289
|
-
elif self.serialization_format == 'pkl':
|
|
290
|
-
deserialized_value = joblib.load(body)
|
|
291
|
-
else:
|
|
292
|
-
deserialized_value = body.read().decode('utf-8')
|
|
293
|
-
|
|
294
|
-
self.main_cache[key] = deserialized_value
|
|
295
|
-
|
|
296
|
-
if not self.append_only:
|
|
297
|
-
# Cache the S3 ETag for future conditional requests
|
|
298
|
-
s3_etag = response.get("ETag")
|
|
299
|
-
self.etag_cache[key] = s3_etag
|
|
300
|
-
|
|
301
|
-
except ClientError as e:
|
|
302
|
-
if e.response['ResponseMetadata']['HTTPStatusCode'] == 304:
|
|
303
|
-
# HTTP 304 Not Modified: cached version is current, no download needed
|
|
304
|
-
pass
|
|
305
|
-
elif e.response.get("Error", {}).get("Code") == 'NoSuchKey':
|
|
306
|
-
raise KeyError(f"Key {key} not found in S3 bucket {self.bucket_name}")
|
|
307
|
-
else:
|
|
308
|
-
# Re-raise other client errors (permissions, throttling, etc.)
|
|
309
|
-
raise
|
|
310
|
-
|
|
311
|
-
return self.main_cache[key]
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
def __setitem__(self, key: NonEmptyPersiDictKey, value: Any):
|
|
315
|
-
"""Store a value for a key in both S3 and local cache.
|
|
316
|
-
|
|
317
|
-
Handles special joker values (KEEP_CURRENT, DELETE_CURRENT) for
|
|
318
|
-
conditional operations. Validates value types against base_class_for_values
|
|
319
|
-
if specified, then stores locally and uploads to S3. Attempts to cache
|
|
320
|
-
the S3 ETag for efficient future retrievals.
|
|
321
|
-
|
|
322
|
-
Args:
|
|
323
|
-
key: Dictionary key (string or sequence of strings) or NonEmptyPersiDictKey.
|
|
324
|
-
value: Value to store, or a joker command (KEEP_CURRENT or
|
|
325
|
-
DELETE_CURRENT).
|
|
326
|
-
|
|
327
|
-
Raises:
|
|
328
|
-
KeyError: If attempting to modify an existing item when
|
|
329
|
-
append_only is True.
|
|
330
|
-
TypeError: If value is a PersiDict instance or does not match
|
|
331
|
-
the required base_class_for_values when specified.
|
|
332
|
-
"""
|
|
333
|
-
|
|
334
|
-
key = NonEmptySafeStrTuple(key)
|
|
335
|
-
if self._process_setitem_args(key, value) is EXECUTION_IS_COMPLETE:
|
|
336
|
-
return None
|
|
337
|
-
|
|
338
|
-
obj_name = self._build_full_objectname(key)
|
|
339
|
-
|
|
340
|
-
# Store in local cache first
|
|
341
|
-
self.main_cache[key] = value
|
|
342
|
-
|
|
343
|
-
# Upload the serialized file from local cache to S3
|
|
344
|
-
file_path = self.main_cache._build_full_path(key)
|
|
345
|
-
self.s3_client.upload_file(file_path, self.bucket_name, obj_name)
|
|
346
|
-
|
|
347
|
-
if self.append_only:
|
|
348
|
-
# For immutable items, the local cache is authoritative; no need to
|
|
349
|
-
# verify ETag from S3 as the item cannot change after initial upload
|
|
350
|
-
return
|
|
351
|
-
|
|
352
|
-
try:
|
|
353
|
-
# Cache the S3 ETag for efficient conditional requests on future reads
|
|
354
|
-
head = self.s3_client.head_object(
|
|
355
|
-
Bucket=self.bucket_name, Key=obj_name)
|
|
356
|
-
self.etag_cache[key] = head.get("ETag")
|
|
357
|
-
except ClientError:
|
|
358
|
-
# Remove stale ETag on failure to force fresh downloads later
|
|
359
|
-
self.etag_cache.discard(key)
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
def __delitem__(self, key: NonEmptyPersiDictKey):
|
|
363
|
-
"""Delete the stored value for a key from both S3 and local cache.
|
|
364
|
-
|
|
365
|
-
Args:
|
|
366
|
-
key: Dictionary key (string or sequence of strings)
|
|
367
|
-
or NonEmptyPersiDictKey.
|
|
368
|
-
|
|
369
|
-
Raises:
|
|
370
|
-
KeyError: If append_only is True, or if the key does not exist.
|
|
371
|
-
"""
|
|
372
|
-
key = NonEmptySafeStrTuple(key)
|
|
373
|
-
self._process_delitem_args(key)
|
|
374
|
-
obj_name = self._build_full_objectname(key)
|
|
375
|
-
self.s3_client.delete_object(Bucket=self.bucket_name, Key=obj_name)
|
|
376
|
-
self.etag_cache.discard(key)
|
|
377
|
-
self.main_cache.discard(key)
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
def __len__(self) -> int:
|
|
381
|
-
"""Return the number of key-value pairs in the dictionary.
|
|
382
|
-
|
|
383
|
-
Warning:
|
|
384
|
-
This operation can be very slow and expensive on large S3 buckets
|
|
385
|
-
as it must paginate through all objects under the dictionary's prefix.
|
|
386
|
-
Avoid using in performance-critical code.
|
|
387
|
-
|
|
388
|
-
Returns:
|
|
389
|
-
int: Number of stored items under this dictionary's root_prefix.
|
|
390
|
-
"""
|
|
391
|
-
|
|
392
|
-
num_files = 0
|
|
393
|
-
suffix = "." + self.serialization_format
|
|
394
|
-
|
|
395
|
-
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
396
|
-
page_iterator = paginator.paginate(
|
|
397
|
-
Bucket=self.bucket_name, Prefix=self.root_prefix)
|
|
398
|
-
|
|
399
|
-
for page in page_iterator:
|
|
400
|
-
contents = page.get("Contents")
|
|
401
|
-
if not contents:
|
|
402
|
-
continue
|
|
403
|
-
for key in contents:
|
|
404
|
-
obj_name = key["Key"]
|
|
405
|
-
if obj_name.endswith(suffix):
|
|
406
|
-
num_files += 1
|
|
407
|
-
|
|
408
|
-
return num_files
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
def _generic_iter(self, result_type: set[str]):
|
|
412
|
-
"""Underlying implementation for items(), keys(), and values() iterators.
|
|
413
|
-
|
|
414
|
-
Paginates through S3 objects under the configured root_prefix and yields
|
|
415
|
-
keys, values, and/or timestamps according to the requested result_type.
|
|
416
|
-
S3 object keys are converted to SafeStrTuple instances by removing the
|
|
417
|
-
file extension and reversing digest-based signing if enabled.
|
|
418
|
-
|
|
419
|
-
Args:
|
|
420
|
-
result_type: Non-empty subset of {"keys", "values", "timestamps"}
|
|
421
|
-
specifying which fields to yield from each dictionary entry.
|
|
422
|
-
|
|
423
|
-
Returns:
|
|
424
|
-
Iterator: A generator that yields:
|
|
425
|
-
- SafeStrTuple if result_type == {"keys"}
|
|
426
|
-
- Any if result_type == {"values"}
|
|
427
|
-
- tuple[SafeStrTuple, Any] if result_type == {"keys", "values"}
|
|
428
|
-
- tuple including float timestamp if "timestamps" requested
|
|
429
|
-
|
|
430
|
-
Raises:
|
|
431
|
-
ValueError: If result_type is invalid (empty, not a set, or contains
|
|
432
|
-
unsupported field names).
|
|
433
|
-
"""
|
|
434
|
-
|
|
435
|
-
self._process_generic_iter_args(result_type)
|
|
436
|
-
|
|
437
|
-
suffix = "." + self.serialization_format
|
|
438
|
-
ext_len = len(self.serialization_format) + 1
|
|
439
|
-
prefix_len = len(self.root_prefix)
|
|
440
|
-
|
|
441
|
-
def splitter(full_name: str) -> SafeStrTuple:
|
|
442
|
-
"""Convert an S3 object key into a SafeStrTuple without the file extension.
|
|
443
|
-
|
|
444
|
-
Args:
|
|
445
|
-
full_name: Complete S3 object key including root_prefix and extension.
|
|
446
|
-
|
|
447
|
-
Returns:
|
|
448
|
-
SafeStrTuple: The parsed key components with digest signatures intact.
|
|
449
|
-
|
|
450
|
-
Raises:
|
|
451
|
-
ValueError: If the object key does not start with this dictionary's
|
|
452
|
-
root_prefix (indicating it's outside the dictionary's scope).
|
|
453
|
-
"""
|
|
454
|
-
if not full_name.startswith(self.root_prefix):
|
|
455
|
-
raise ValueError(
|
|
456
|
-
f"S3 object key '{full_name}' is outside of root_prefix '{self.root_prefix}'"
|
|
457
|
-
)
|
|
458
|
-
result = full_name[prefix_len:-ext_len].split(sep="/")
|
|
459
|
-
return SafeStrTuple(result)
|
|
460
|
-
|
|
461
|
-
def step():
|
|
462
|
-
"""Generator that paginates through S3 objects and yields requested data.
|
|
463
|
-
|
|
464
|
-
Yields dictionary entries (keys, values, timestamps) according to the
|
|
465
|
-
result_type specification from the parent _generic_iter method.
|
|
466
|
-
"""
|
|
467
|
-
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
468
|
-
page_iterator = paginator.paginate(
|
|
469
|
-
Bucket=self.bucket_name, Prefix=self.root_prefix)
|
|
470
|
-
|
|
471
|
-
for page in page_iterator:
|
|
472
|
-
contents = page.get("Contents")
|
|
473
|
-
if not contents:
|
|
474
|
-
continue
|
|
475
|
-
for key in contents:
|
|
476
|
-
obj_name = key["Key"]
|
|
477
|
-
if not obj_name.endswith(suffix):
|
|
478
|
-
continue
|
|
479
|
-
obj_key = splitter(obj_name)
|
|
480
|
-
|
|
481
|
-
to_return = []
|
|
482
|
-
|
|
483
|
-
if "keys" in result_type:
|
|
484
|
-
key_to_return = unsign_safe_str_tuple(
|
|
485
|
-
obj_key, 0)
|
|
486
|
-
to_return.append(key_to_return)
|
|
487
|
-
|
|
488
|
-
if "values" in result_type:
|
|
489
|
-
value_to_return = self[obj_key]
|
|
490
|
-
to_return.append(value_to_return)
|
|
491
|
-
|
|
492
|
-
if len(result_type) == 1:
|
|
493
|
-
yield to_return[0]
|
|
494
|
-
else:
|
|
495
|
-
if "timestamps" in result_type:
|
|
496
|
-
timestamp_to_return = key["LastModified"].timestamp()
|
|
497
|
-
to_return.append(timestamp_to_return)
|
|
498
|
-
yield tuple(to_return)
|
|
499
|
-
|
|
500
|
-
return step()
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
def get_subdict(self, key: PersiDictKey) -> S3Dict_Legacy:
|
|
504
|
-
"""Create a subdictionary scoped to items with the specified prefix.
|
|
505
|
-
|
|
506
|
-
Returns an empty subdictionary if no items exist under the prefix.
|
|
507
|
-
If the prefix is empty, the entire dictionary is returned.
|
|
508
|
-
This method is not part of the standard Python dictionary interface.
|
|
509
|
-
|
|
510
|
-
Args:
|
|
511
|
-
key (PersiDictKey): A common prefix (string or sequence of strings)
|
|
512
|
-
used to scope items stored under this dictionary.
|
|
513
|
-
|
|
514
|
-
Returns:
|
|
515
|
-
S3Dict_Legacy: A new S3Dict instance with root_prefix extended by the given
|
|
516
|
-
key, sharing the parent's bucket, region, serialization_format, and other
|
|
517
|
-
configuration settings.
|
|
518
|
-
"""
|
|
519
|
-
|
|
520
|
-
key = SafeStrTuple(key)
|
|
521
|
-
if len(key):
|
|
522
|
-
key = sign_safe_str_tuple(key, 0)
|
|
523
|
-
full_root_prefix = self.root_prefix + "/".join(key)
|
|
524
|
-
else:
|
|
525
|
-
full_root_prefix = self.root_prefix
|
|
526
|
-
|
|
527
|
-
new_dir_path = self.main_cache._build_full_path(
|
|
528
|
-
key, create_subdirs=True, is_file_path=False)
|
|
529
|
-
|
|
530
|
-
new_dict = S3Dict_Legacy(
|
|
531
|
-
bucket_name=self.bucket_name,
|
|
532
|
-
region=self.region,
|
|
533
|
-
root_prefix=full_root_prefix,
|
|
534
|
-
base_dir=new_dir_path,
|
|
535
|
-
serialization_format=self.serialization_format,
|
|
536
|
-
append_only=self.append_only,
|
|
537
|
-
base_class_for_values=self.base_class_for_values)
|
|
538
|
-
|
|
539
|
-
return new_dict
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
def timestamp(self, key: NonEmptyPersiDictKey) -> float:
|
|
543
|
-
"""Get the last modification timestamp for a key.
|
|
544
|
-
|
|
545
|
-
This method is not part of the standard Python dictionary interface.
|
|
546
|
-
|
|
547
|
-
Args:
|
|
548
|
-
key: Dictionary key (string or sequence of strings) or SafeStrTuple.
|
|
549
|
-
|
|
550
|
-
Returns:
|
|
551
|
-
float: POSIX timestamp (seconds since Unix epoch) of the last
|
|
552
|
-
modification time as reported by S3. The timestamp is timezone-aware
|
|
553
|
-
and converted to UTC.
|
|
554
|
-
|
|
555
|
-
Raises:
|
|
556
|
-
KeyError: If the key does not exist in S3.
|
|
557
|
-
"""
|
|
558
|
-
key = NonEmptySafeStrTuple(key)
|
|
559
|
-
obj_name = self._build_full_objectname(key)
|
|
560
|
-
response = self.s3_client.head_object(Bucket=self.bucket_name, Key=obj_name)
|
|
561
|
-
return response["LastModified"].timestamp()
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
# parameterizable.register_parameterizable_class(S3Dict_Legacy)
|
|
565
|
-
|
|
File without changes
|