persidict 0.37.2__py3-none-any.whl → 0.103.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of persidict might be problematic. Click here for more details.
- persidict/__init__.py +41 -24
- persidict/basic_s3_dict.py +595 -0
- persidict/cached_appendonly_dict.py +247 -0
- persidict/cached_mutable_dict.py +248 -0
- persidict/empty_dict.py +171 -0
- persidict/file_dir_dict.py +130 -122
- persidict/local_dict.py +502 -0
- persidict/overlapping_multi_dict.py +23 -15
- persidict/persi_dict.py +281 -148
- persidict/s3_dict_file_dir_cached.py +215 -0
- persidict/{s3_dict.py → s3_dict_legacy.py} +111 -90
- persidict/safe_chars.py +13 -0
- persidict/safe_str_tuple.py +28 -6
- persidict/singletons.py +232 -0
- persidict/write_once_dict.py +47 -30
- {persidict-0.37.2.dist-info → persidict-0.103.0.dist-info}/METADATA +35 -25
- persidict-0.103.0.dist-info/RECORD +19 -0
- {persidict-0.37.2.dist-info → persidict-0.103.0.dist-info}/WHEEL +1 -1
- persidict/.DS_Store +0 -0
- persidict/jokers.py +0 -99
- persidict-0.37.2.dist-info/RECORD +0 -14
persidict/__init__.py
CHANGED
|
@@ -1,33 +1,50 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Persistent dictionaries that store key-value pairs on local disks or AWS S3.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
This package provides a unified interface for persistent dictionary-like
|
|
4
|
+
storage with various backends including filesystem and AWS S3.
|
|
4
5
|
|
|
5
|
-
|
|
6
|
-
for all
|
|
6
|
+
Classes:
|
|
7
|
+
PersiDict: Abstract base class defining the unified interface for all
|
|
8
|
+
persistent dictionaries.
|
|
9
|
+
NonEmptySafeStrTuple: A flat tuple of URL/filename-safe strings that
|
|
10
|
+
can be used as a key for PersiDict objects.
|
|
11
|
+
FileDirDict: A dictionary that stores key-value pairs as files on a
|
|
12
|
+
local hard drive. Keys compose filenames, values are stored as
|
|
13
|
+
pickle or JSON objects.
|
|
14
|
+
S3Dict_Legacy: A dictionary that stores key-value pairs as S3 objects on AWS.
|
|
15
|
+
Keys compose object names, values are stored as pickle or JSON S3 objects.
|
|
16
|
+
BasicS3Dict: A basic S3-backed dictionary with direct S3 operations.
|
|
17
|
+
WriteOnceDict: A write-once wrapper that prevents modification of existing
|
|
18
|
+
items after initial storage.
|
|
19
|
+
EmptyDict: Equivalent of null device in OS - accepts all writes but discards
|
|
20
|
+
them, returns nothing on reads. Always appears empty regardless of
|
|
21
|
+
operations performed. Useful for testing, debugging, or as a placeholder.
|
|
22
|
+
OverlappingMultiDict: A dictionary that can handle overlapping key spaces.
|
|
7
23
|
|
|
8
|
-
|
|
9
|
-
|
|
24
|
+
Functions:
|
|
25
|
+
get_safe_chars(): Returns a set of URL/filename-safe characters permitted
|
|
26
|
+
in keys.
|
|
27
|
+
replace_unsafe_chars(): Replaces forbidden characters in a string with
|
|
28
|
+
safe alternatives.
|
|
10
29
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
A key is used to compose a filename, while a value is stored
|
|
14
|
-
as a pickle or a json object in the file.
|
|
30
|
+
Constants:
|
|
31
|
+
KEEP_CURRENT, DELETE_CURRENT: Special joker values for conditional operations.
|
|
15
32
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
as a pickle or a json S3 object.
|
|
20
|
-
|
|
21
|
-
The package also offers two helper functions: get_safe_chars(),
|
|
22
|
-
which returns a set of URL/filename-safe characters permitted in keys,
|
|
23
|
-
and replace_unsafe_chars(), which replaces forbidden characters in a string.
|
|
33
|
+
Note:
|
|
34
|
+
All persistent dictionaries support multiple serialization formats, including
|
|
35
|
+
pickle and JSON, with automatic type handling and collision-safe key encoding.
|
|
24
36
|
"""
|
|
25
|
-
from .safe_chars import
|
|
26
|
-
from .safe_str_tuple import
|
|
37
|
+
from .safe_chars import *
|
|
38
|
+
from .safe_str_tuple import *
|
|
27
39
|
from .persi_dict import PersiDict, PersiDictKey
|
|
28
40
|
from .file_dir_dict import FileDirDict
|
|
29
|
-
from .
|
|
41
|
+
from .s3_dict_file_dir_cached import S3Dict_FileDirCached, S3Dict
|
|
42
|
+
from .basic_s3_dict import BasicS3Dict
|
|
30
43
|
from .write_once_dict import WriteOnceDict
|
|
31
|
-
from .
|
|
32
|
-
from .
|
|
33
|
-
from .
|
|
44
|
+
from .empty_dict import EmptyDict
|
|
45
|
+
from .singletons import Joker, KeepCurrentFlag, DeleteCurrentFlag
|
|
46
|
+
from .singletons import KEEP_CURRENT, DELETE_CURRENT
|
|
47
|
+
from .overlapping_multi_dict import OverlappingMultiDict
|
|
48
|
+
from .cached_appendonly_dict import AppendOnlyDictCached
|
|
49
|
+
from .cached_mutable_dict import MutableDictCached
|
|
50
|
+
from .local_dict import LocalDict
|
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
import io
|
|
5
|
+
|
|
6
|
+
import boto3
|
|
7
|
+
import joblib
|
|
8
|
+
import jsonpickle
|
|
9
|
+
from botocore.exceptions import ClientError
|
|
10
|
+
|
|
11
|
+
import parameterizable
|
|
12
|
+
from parameterizable.dict_sorter import sort_dict_by_keys
|
|
13
|
+
|
|
14
|
+
from .safe_str_tuple import SafeStrTuple, NonEmptySafeStrTuple
|
|
15
|
+
from .safe_str_tuple_signing import sign_safe_str_tuple, unsign_safe_str_tuple
|
|
16
|
+
from .persi_dict import PersiDict, NonEmptyPersiDictKey, PersiDictKey
|
|
17
|
+
from .singletons import (EXECUTION_IS_COMPLETE, ETagHasNotChangedFlag,
|
|
18
|
+
ETAG_HAS_NOT_CHANGED)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def not_found_error(e: ClientError) -> bool:
    """Check whether a ClientError indicates a missing S3 object.

    Treats an HTTP 404 status, as well as the 'NoSuchKey', '404', and
    'NotFound' error codes, as "object does not exist".

    Args:
        e: The ClientError exception to check.

    Returns:
        bool: True if the error indicates a missing object, False otherwise.
    """
    status = e.response['ResponseMetadata']['HTTPStatusCode']
    if status == 404:
        return True
    # Some S3-compatible backends report a non-404 HTTP status but still
    # set a "missing object" error code; check the code as a fallback.
    error_code = e.response['Error']['Code']
    return error_code in ('NoSuchKey', '404', 'NotFound')
|
|
38
|
+
|
|
39
|
+
class BasicS3Dict(PersiDict):
    """A persistent dictionary that stores key-value pairs as S3 objects.

    Each key-value pair is stored as a separate S3 object in the specified
    bucket. A key can be either a string (object name without file extension)
    or a sequence of strings representing a hierarchical path (folder
    structure ending with an object name). Values can be instances of any
    Python type and are serialized to S3 objects.

    Supported serialization formats:
        - 'pkl': binary storage using joblib (pickle-based)
        - 'json': human-readable text using jsonpickle
        - any other value: plain text (non-str values converted via str())

    Note:
        Unlike native Python dictionaries, insertion order is not preserved.
        Operations may incur S3 API costs and network latency.
        All operations are performed directly against S3 without local caching.
    """
    # AWS region passed to the boto3 client (None -> default from AWS config).
    region: Optional[str]
    # Target S3 bucket; created during __init__ if missing and permissions allow.
    bucket_name: str
    # Common key prefix for all objects; normalized to end with "/" when non-empty.
    root_prefix: str

    def __init__(self, bucket_name: str = "my_bucket",
                 region: Optional[str] = None,
                 root_prefix: str = "",
                 serialization_format: str = "pkl",
                 append_only: bool = False,
                 base_class_for_values: Optional[type] = None,
                 *args, **kwargs):
        """Initialize a basic S3-backed persistent dictionary.

        Args:
            bucket_name: Name of the S3 bucket to use. The bucket will be
                created automatically if it does not exist and permissions allow.
            region: AWS region for the bucket. If None, uses the default
                client region from AWS configuration.
            root_prefix: Common S3 key prefix under which all objects are
                stored. A trailing slash is automatically added if missing.
            serialization_format: File extension/format for stored values.
                Supported formats: 'pkl' (pickle), 'json' (jsonpickle),
                or custom text formats.
            append_only: If True, prevents modification of existing items
                after they are initially stored.
            base_class_for_values: Optional base class that all stored values
                must inherit from. When specified (and not str),
                serialization_format must be 'pkl' or 'json'.
            *args: Additional positional arguments (ignored, reserved for
                compatibility).
            **kwargs: Additional keyword arguments (ignored, reserved for
                compatibility).

        Note:
            Network connectivity and valid AWS credentials are required.
        """
        super().__init__(append_only=append_only,
                         base_class_for_values=base_class_for_values,
                         serialization_format=serialization_format)

        self.region = region
        if region is None:
            self.s3_client = boto3.client('s3')
        else:
            self.s3_client = boto3.client('s3', region_name=region)

        try:
            self.s3_client.head_bucket(Bucket=bucket_name)
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if not_found_error(e):
                # Bucket does not exist, attempt to create it.
                try:
                    effective_region = self.s3_client.meta.region_name
                    # us-east-1 is the only region where a LocationConstraint
                    # must NOT be supplied to create_bucket.
                    if effective_region and effective_region != 'us-east-1':
                        self.s3_client.create_bucket(
                            Bucket=bucket_name,
                            CreateBucketConfiguration={
                                'LocationConstraint': effective_region})
                    else:
                        self.s3_client.create_bucket(Bucket=bucket_name)
                except ClientError as create_e:
                    # Handle race condition where the bucket was created by
                    # another process or its name is already taken by another
                    # AWS account.
                    create_error_code = create_e.response['Error']['Code']
                    if create_error_code not in ('BucketAlreadyOwnedByYou',
                                                 'BucketAlreadyExists'):
                        raise  # Re-raise other unexpected creation errors
            elif error_code in ('403', 'Forbidden'):
                # Bucket exists but access is forbidden - likely a
                # cross-account bucket with a policy granting limited access.
                # Operations may still work if the policy allows the required
                # S3 permissions.
                pass
            else:
                raise  # Re-raise other unexpected head_bucket errors

        self.bucket_name = bucket_name

        self.root_prefix = root_prefix
        if len(self.root_prefix) and self.root_prefix[-1] != "/":
            self.root_prefix += "/"

    def get_params(self):
        """Return configuration parameters as a dictionary.

        This method supports the Parameterizable API and is not part of
        the standard Python dictionary interface.

        Returns:
            dict: A mapping of parameter names to their configured values,
            including S3-specific parameters (region, bucket_name,
            root_prefix), sorted by key names.
        """
        params = {
            "region": self.region,
            "bucket_name": self.bucket_name,
            "root_prefix": self.root_prefix,
            "serialization_format": self.serialization_format,
            "append_only": self.append_only,
            "base_class_for_values": self.base_class_for_values,
        }
        return sort_dict_by_keys(params)

    def etag(self, key: NonEmptyPersiDictKey) -> str | None:
        """Get the S3 ETag for a key.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptySafeStrTuple).

        Returns:
            str|None: The ETag reported by S3 for the stored object.

        Raises:
            KeyError: If the key does not exist in S3.
        """
        key = NonEmptySafeStrTuple(key)
        obj_name = self._build_full_objectname(key)
        try:
            response = self.s3_client.head_object(
                Bucket=self.bucket_name, Key=obj_name)
            return response["ETag"]
        except ClientError as e:
            if not_found_error(e):
                raise KeyError(
                    f"Key {key} not found in S3 bucket {self.bucket_name}")
            raise

    @property
    def base_url(self) -> str | None:
        """Return the S3 URL prefix of this dictionary.

        This property is not part of the standard Python dictionary interface.

        Returns:
            str: The base S3 URL in the format "s3://<bucket>/<root_prefix>".
        """
        return f"s3://{self.bucket_name}/{self.root_prefix}"

    def _build_full_objectname(self, key: NonEmptyPersiDictKey) -> str:
        """Convert a key into a full S3 object key.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptySafeStrTuple).

        Returns:
            str: The complete S3 object key including root_prefix and the
            serialization_format extension, with digest-based signing applied.
        """
        key = NonEmptySafeStrTuple(key)
        key = sign_safe_str_tuple(key, 0)
        return (self.root_prefix + "/".join(key)
                + "." + self.serialization_format)

    def __contains__(self, key: NonEmptyPersiDictKey) -> bool:
        """Check if the specified key exists in the dictionary.

        Performs a HEAD request to S3 to verify the object's existence.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptySafeStrTuple).

        Returns:
            bool: True if the key exists in S3, False otherwise.
        """
        key = NonEmptySafeStrTuple(key)
        try:
            obj_name = self._build_full_objectname(key)
            self.s3_client.head_object(Bucket=self.bucket_name, Key=obj_name)
            return True
        except ClientError as e:
            if not_found_error(e):
                return False
            raise

    def get_item_if_etag_changed(self, key: NonEmptyPersiDictKey,
                                 etag: str | None
                                 ) -> tuple[Any, str | None] | ETagHasNotChangedFlag:
        """Retrieve the value for a key only if its ETag has changed.

        This method is absent in the original dict API.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptySafeStrTuple).
            etag: The ETag value to compare against.

        Returns:
            tuple[Any, str|None] | ETagHasNotChangedFlag: The deserialized
            value and its new ETag if the ETag has changed, or
            ETAG_HAS_NOT_CHANGED if the etag matches the current one.

        Raises:
            KeyError: If the key does not exist in S3.
        """
        key = NonEmptySafeStrTuple(key)
        obj_name = self._build_full_objectname(key)

        try:
            get_kwargs = {'Bucket': self.bucket_name, 'Key': obj_name}
            if etag:
                # Conditional GET: S3 answers 304 when the ETag still matches.
                get_kwargs['IfNoneMatch'] = etag

            response = self.s3_client.get_object(**get_kwargs)

            # 200 OK: object was downloaded, either because it's new or changed.
            body = response['Body']
            s3_etag = response.get("ETag")

            try:
                if self.serialization_format == 'json':
                    deserialized_value = jsonpickle.loads(
                        body.read().decode('utf-8'))
                elif self.serialization_format == 'pkl':
                    with io.BytesIO(body.read()) as buffer:
                        deserialized_value = joblib.load(buffer)
                else:
                    deserialized_value = body.read().decode('utf-8')
            finally:
                body.close()

            return (deserialized_value, s3_etag)

        except ClientError as e:
            if e.response['ResponseMetadata']['HTTPStatusCode'] == 304:
                # HTTP 304 Not Modified: the version is current, no download needed
                return ETAG_HAS_NOT_CHANGED
            elif not_found_error(e):
                raise KeyError(
                    f"Key {key} not found in S3 bucket {self.bucket_name}")
            else:
                raise

    def __getitem__(self, key: NonEmptyPersiDictKey) -> Any:
        """Retrieve the value stored for a key directly from S3.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptySafeStrTuple).

        Returns:
            Any: The deserialized value stored for the key.

        Raises:
            KeyError: If the key does not exist in S3.
        """
        return self.get_item_if_etag_changed(key, None)[0]

    def set_item_get_etag(self, key: NonEmptyPersiDictKey,
                          value: Any) -> str | None:
        """Store a value for a key directly in S3 and return the new ETag.

        Handles special joker values (KEEP_CURRENT, DELETE_CURRENT) for
        conditional operations. Validates value types against
        base_class_for_values if specified, then serializes and uploads
        directly to S3.

        This method is absent in the original dict API.

        Args:
            key: Dictionary key (string or sequence of strings)
                or NonEmptySafeStrTuple.
            value: Value to store, or a joker command (KEEP_CURRENT or
                DELETE_CURRENT).

        Returns:
            str|None: The ETag of the newly stored object, or None if a joker
            command was processed without uploading a new object.

        Raises:
            KeyError: If attempting to modify an existing item when
                append_only is True.
            TypeError: If value is a PersiDict instance or does not match
                the required base_class_for_values when specified.
        """
        key = NonEmptySafeStrTuple(key)
        # _process_setitem_args resolves joker commands; nothing to upload
        # when it reports the operation is already complete.
        if self._process_setitem_args(key, value) is EXECUTION_IS_COMPLETE:
            return None

        obj_name = self._build_full_objectname(key)

        # Serialize the value directly to S3
        if self.serialization_format == 'json':
            serialized_data = jsonpickle.dumps(value, indent=4).encode('utf-8')
            content_type = 'application/json'
        elif self.serialization_format == 'pkl':
            with io.BytesIO() as buffer:
                joblib.dump(value, buffer)
                serialized_data = buffer.getvalue()
            content_type = 'application/octet-stream'
        else:
            # Plain-text fallback: non-str values are stringified.
            if isinstance(value, str):
                serialized_data = value.encode('utf-8')
            else:
                serialized_data = str(value).encode('utf-8')
            content_type = 'text/plain'

        response = self.s3_client.put_object(
            Bucket=self.bucket_name,
            Key=obj_name,
            Body=serialized_data,
            ContentType=content_type
        )
        return response.get("ETag")

    def __setitem__(self, key: NonEmptyPersiDictKey, value: Any):
        """Store a value for a key directly in S3.

        Handles special joker values (KEEP_CURRENT, DELETE_CURRENT) for
        conditional operations. Validates value types against
        base_class_for_values if specified, then serializes and uploads
        directly to S3.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptyPersiDictKey).
            value: Value to store, or a joker command (KEEP_CURRENT or
                DELETE_CURRENT).

        Raises:
            KeyError: If attempting to modify an existing item when
                append_only is True.
            TypeError: If value is a PersiDict instance or does not match
                the required base_class_for_values when specified.
        """
        self.set_item_get_etag(key, value)

    def __delitem__(self, key: NonEmptyPersiDictKey):
        """Delete the stored value for a key from S3.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptyPersiDictKey).

        Raises:
            KeyError: If append_only is True, or if the key does not exist.
        """
        key = NonEmptySafeStrTuple(key)
        self._process_delitem_args(key)
        obj_name = self._build_full_objectname(key)
        try:
            self.s3_client.delete_object(Bucket=self.bucket_name, Key=obj_name)
        except ClientError as e:
            # delete_object is idempotent in spirit: a missing object is fine.
            if not not_found_error(e):
                raise

    def __len__(self) -> int:
        """Return the number of key-value pairs in the dictionary.

        Warning:
            This operation can be very slow and expensive on large S3 buckets
            as it must paginate through all objects under the dictionary's
            prefix. Avoid using in performance-critical code.

        Returns:
            int: Number of stored items under this dictionary's root_prefix.
        """
        num_files = 0
        suffix = "." + self.serialization_format

        paginator = self.s3_client.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(
            Bucket=self.bucket_name, Prefix=self.root_prefix)

        for page in page_iterator:
            contents = page.get("Contents")
            if not contents:
                continue
            # Only objects with the configured extension belong to this dict.
            num_files += sum(
                1 for obj_info in contents
                if obj_info["Key"].endswith(suffix))

        return num_files

    def _generic_iter(self, result_type: set[str]):
        """Underlying implementation for items(), keys(), and values() iterators.

        Paginates through S3 objects under the configured root_prefix and
        yields keys, values, and/or timestamps according to the requested
        result_type. S3 object keys are converted to SafeStrTuple instances
        by removing the file extension and reversing digest-based signing.

        Args:
            result_type: Non-empty subset of {"keys", "values", "timestamps"}
                specifying which fields to yield from each dictionary entry.

        Returns:
            Iterator: A generator that yields:
                - SafeStrTuple if result_type == {"keys"}
                - Any if result_type == {"values"}
                - tuple[SafeStrTuple, Any] if result_type == {"keys", "values"}
                - tuple including float timestamp if "timestamps" requested

        Raises:
            ValueError: If result_type is invalid (empty, not a set, or
                contains unsupported field names).
        """
        self._process_generic_iter_args(result_type)

        suffix = "." + self.serialization_format
        ext_len = len(self.serialization_format) + 1
        prefix_len = len(self.root_prefix)

        def splitter(full_name: str) -> SafeStrTuple:
            """Convert an S3 object key into a SafeStrTuple (extension removed).

            Raises:
                ValueError: If the object key does not start with this
                    dictionary's root_prefix.
            """
            if not full_name.startswith(self.root_prefix):
                raise ValueError(
                    f"S3 object key '{full_name}' is outside of root_prefix '{self.root_prefix}'"
                )
            parts = full_name[prefix_len:-ext_len].split(sep="/")
            return SafeStrTuple(parts)

        def step():
            """Paginate through S3 objects and yield the requested fields."""
            paginator = self.s3_client.get_paginator("list_objects_v2")
            page_iterator = paginator.paginate(
                Bucket=self.bucket_name, Prefix=self.root_prefix)

            for page in page_iterator:
                contents = page.get("Contents")
                if not contents:
                    continue
                for obj_info in contents:
                    obj_name = obj_info["Key"]
                    if not obj_name.endswith(suffix):
                        continue

                    unsigned_key = unsign_safe_str_tuple(splitter(obj_name), 0)

                    to_return = []
                    if "keys" in result_type:
                        to_return.append(unsigned_key)

                    if "values" in result_type:
                        # The object can be deleted between listing and fetching.
                        # Skip such races instead of raising to make iteration robust.
                        try:
                            to_return.append(self[unsigned_key])
                        except KeyError:
                            continue

                    # BUGFIX: append timestamps before the single-field check,
                    # so result_type == {"timestamps"} no longer raises
                    # IndexError on an empty to_return list.
                    if "timestamps" in result_type:
                        to_return.append(
                            obj_info["LastModified"].timestamp())

                    if len(result_type) == 1:
                        yield to_return[0]
                    else:
                        yield tuple(to_return)

        return step()

    def get_subdict(self, key: PersiDictKey) -> 'BasicS3Dict':
        """Create a subdictionary scoped to items with the specified prefix.

        Returns an empty subdictionary if no items exist under the prefix.
        If the prefix is empty, the entire dictionary is returned.
        This method is not part of the standard Python dictionary interface.

        Args:
            key: A common prefix (string or sequence of strings or
                SafeStrTuple) used to scope items stored under this
                dictionary.

        Returns:
            BasicS3Dict: A new BasicS3Dict instance with root_prefix
            extended by the given key, sharing the parent's bucket,
            region, serialization_format, and other configuration settings.
        """
        key = SafeStrTuple(key)
        if len(key):
            key = sign_safe_str_tuple(key, 0)
            full_root_prefix = self.root_prefix + "/".join(key)
        else:
            full_root_prefix = self.root_prefix

        return BasicS3Dict(
            bucket_name=self.bucket_name,
            region=self.region,
            root_prefix=full_root_prefix,
            serialization_format=self.serialization_format,
            append_only=self.append_only,
            base_class_for_values=self.base_class_for_values)

    def timestamp(self, key: NonEmptyPersiDictKey) -> float:
        """Get the last modification timestamp for a key.

        This method is not part of the standard Python dictionary interface.

        Args:
            key: Dictionary key (string or sequence of strings
                or NonEmptySafeStrTuple).

        Returns:
            float: POSIX timestamp (seconds since Unix epoch) of the last
            modification time as reported by S3.

        Raises:
            KeyError: If the key does not exist in S3.
        """
        key = NonEmptySafeStrTuple(key)
        obj_name = self._build_full_objectname(key)
        try:
            response = self.s3_client.head_object(
                Bucket=self.bucket_name, Key=obj_name)
            return response["LastModified"].timestamp()
        except ClientError as e:
            if not_found_error(e):
                raise KeyError(
                    f"Key {key} not found in S3 bucket {self.bucket_name}")
            raise
|
594
|
+
|
|
595
|
+
# Register BasicS3Dict with the Parameterizable framework so instances can be
# reconstructed from the parameters returned by get_params().
parameterizable.register_parameterizable_class(BasicS3Dict)
|