persidict 0.38.0__py3-none-any.whl → 0.103.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of persidict might be problematic. Click here for more details.
- persidict/__init__.py +41 -24
- persidict/basic_s3_dict.py +595 -0
- persidict/cached_appendonly_dict.py +247 -0
- persidict/cached_mutable_dict.py +248 -0
- persidict/empty_dict.py +171 -0
- persidict/file_dir_dict.py +130 -122
- persidict/local_dict.py +502 -0
- persidict/overlapping_multi_dict.py +23 -15
- persidict/persi_dict.py +281 -148
- persidict/s3_dict_file_dir_cached.py +215 -0
- persidict/{s3_dict.py → s3_dict_legacy.py} +111 -90
- persidict/safe_chars.py +13 -0
- persidict/safe_str_tuple.py +28 -6
- persidict/singletons.py +232 -0
- persidict/write_once_dict.py +47 -30
- {persidict-0.38.0.dist-info → persidict-0.103.0.dist-info}/METADATA +34 -24
- persidict-0.103.0.dist-info/RECORD +19 -0
- {persidict-0.38.0.dist-info → persidict-0.103.0.dist-info}/WHEEL +1 -1
- persidict/.DS_Store +0 -0
- persidict/jokers.py +0 -99
- persidict-0.38.0.dist-info/RECORD +0 -14
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""S3Dict_FileDirCached implementation that mimics S3Dict_Legacy but uses BasicS3Dict, FileDirDict, and cached classes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
import parameterizable
|
|
8
|
+
from parameterizable import sort_dict_by_keys
|
|
9
|
+
|
|
10
|
+
from .basic_s3_dict import BasicS3Dict
|
|
11
|
+
from .file_dir_dict import FileDirDict, FILEDIRDICT_DEFAULT_BASE_DIR
|
|
12
|
+
from .cached_appendonly_dict import AppendOnlyDictCached
|
|
13
|
+
from .cached_mutable_dict import MutableDictCached
|
|
14
|
+
from .persi_dict import PersiDict, NonEmptyPersiDictKey, PersiDictKey
|
|
15
|
+
from .safe_str_tuple import NonEmptySafeStrTuple
|
|
16
|
+
from .overlapping_multi_dict import OverlappingMultiDict
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Default base directory for S3Dict_FileDirCached local cache
|
|
20
|
+
# Default value of the ``base_dir`` parameter of S3Dict_FileDirCached.__init__;
# that value is handed to the FileDirDict-based local cache as its base_dir.
S3DICT_NEW_DEFAULT_BASE_DIR = "__s3_dict__"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class S3Dict_FileDirCached(PersiDict):
    """S3-backed persistent dictionary with a local file-based cache.

    The class is meant to mimic the interface of S3Dict_Legacy, but it is
    assembled from three collaborators created in ``__init__``:

    - ``_main_dict``: a ``BasicS3Dict`` used as the main S3 storage;
    - ``local_cache``: an ``OverlappingMultiDict`` of ``FileDirDict`` objects
      rooted at ``base_dir``; it always contains a sub-dictionary for the
      values and, in mutable mode only, a second one for ETag strings;
    - ``_cached_dict``: ``AppendOnlyDictCached`` when ``append_only`` is True,
      otherwise ``MutableDictCached`` (which also receives the ETag cache).

    Membership tests, item access, length, iteration and timestamps are
    delegated to ``_cached_dict``.

    Note:
        ``etag_serialization_format`` and ``_etag_cache`` are only set when
        the dictionary is created with ``append_only=False``.
    """

    def __init__(self, bucket_name: str = "my_bucket",
                 region: Optional[str] = None,
                 root_prefix: str = "",
                 base_dir: str = S3DICT_NEW_DEFAULT_BASE_DIR,
                 serialization_format: str = "pkl",
                 digest_len: int = 8,
                 append_only: bool = False,
                 base_class_for_values: Optional[type] = None,
                 *args, **kwargs):
        """Initialize an S3-backed persistent dictionary with local caching.

        Args:
            bucket_name: Name of the S3 bucket to use.
            region: AWS region for the bucket; passed through to BasicS3Dict
                unchanged (may be None).
            root_prefix: Common S3 key prefix under which all objects are stored.
            base_dir: Local directory path for caching.
            serialization_format: File extension/format for stored values.
                Also used as the name of the value sub-dictionary inside
                ``local_cache``.
            digest_len: Number of base32 MD5 hash characters for collision
                prevention; forwarded to the local FileDirDict caches only.
            append_only: If True, prevents modification/deletion of existing
                items and selects the append-only cached wrapper.
            base_class_for_values: Optional base class that all stored values
                must inherit from.
            *args: Additional positional arguments (accepted but not used here).
            **kwargs: Additional keyword arguments (accepted but not used here).
        """
        super().__init__(append_only=append_only,
                         base_class_for_values=base_class_for_values,
                         serialization_format=serialization_format)

        # Main S3 storage.
        self._main_dict = BasicS3Dict(
            bucket_name=bucket_name,
            region=region,
            root_prefix=root_prefix,
            serialization_format=serialization_format,
            append_only=append_only,
            base_class_for_values=base_class_for_values
        )

        # One local sub-dictionary per "format": values always, ETags only
        # when items can change after they were first written.
        individual_subdicts_params = {self.serialization_format: {}}

        if not append_only:
            self.etag_serialization_format = f"{self.serialization_format}_etag"
            # ETags are cached as plain strings.
            individual_subdicts_params[self.etag_serialization_format] = {
                "base_class_for_values": str}

        # Local cache: FileDirDict sub-dictionaries sharing the same settings.
        self.local_cache = OverlappingMultiDict(
            dict_type=FileDirDict,
            shared_subdicts_params={
                "base_dir": base_dir,
                "append_only": append_only,
                "base_class_for_values": base_class_for_values,
                "digest_len": digest_len
            },
            **individual_subdicts_params)

        # The sub-dictionary that caches the values themselves.
        self._data_cache = getattr(self.local_cache, self.serialization_format)

        # Pick the cached wrapper that matches the mutability mode.
        if append_only:
            self._cached_dict = AppendOnlyDictCached(
                main_dict=self._main_dict,
                data_cache=self._data_cache
            )
        else:
            self._etag_cache = getattr(
                self.local_cache, self.etag_serialization_format)
            self._cached_dict = MutableDictCached(
                main_dict=self._main_dict,
                data_cache=self._data_cache,
                etag_cache=self._etag_cache
            )

    @property
    def digest_len(self) -> int:
        """Digest length reported by the local value cache."""
        return self._data_cache.digest_len

    def get_params(self) -> dict:
        """Return configuration parameters as a dictionary.

        Starts from the parameters of the main S3 dictionary, adds
        ``base_dir`` and ``digest_len`` taken from the local value cache,
        and returns the result sorted by key.
        """
        params = self._main_dict.get_params()
        cache_params = self._data_cache.get_params()

        # Only these two settings live exclusively on the cache side.
        params["base_dir"] = cache_params["base_dir"]
        params["digest_len"] = cache_params["digest_len"]

        return sort_dict_by_keys(params)

    @property
    def base_url(self) -> str:
        """Base URL reported by the main S3 dictionary."""
        return self._main_dict.base_url

    @property
    def base_dir(self) -> str:
        """Base directory reported by the local value cache."""
        return self._data_cache.base_dir

    def __contains__(self, key: NonEmptyPersiDictKey) -> bool:
        """Check whether ``key`` exists, as decided by the cached wrapper."""
        return key in self._cached_dict

    def __getitem__(self, key: NonEmptyPersiDictKey) -> Any:
        """Return the value stored for ``key`` via the cached wrapper."""
        return self._cached_dict[key]

    def __setitem__(self, key: NonEmptyPersiDictKey, value: Any) -> None:
        """Store ``value`` for ``key`` via the cached wrapper."""
        self._cached_dict[key] = value

    def __delitem__(self, key: NonEmptyPersiDictKey) -> None:
        """Delete the item stored for ``key`` via the cached wrapper."""
        del self._cached_dict[key]

    def __len__(self) -> int:
        """Return the number of items reported by the cached wrapper."""
        return len(self._cached_dict)

    def _generic_iter(self, result_type: set[str]):
        """Delegate generic iteration to the cached wrapper.

        Args:
            result_type: Passed through unchanged to the wrapper's
                ``_generic_iter``.
        """
        return self._cached_dict._generic_iter(result_type)

    def get_subdict(self, key: PersiDictKey):
        """Get a subdictionary for the given key prefix.

        The result is whatever the main S3 dictionary's ``get_subdict``
        returns.
        """
        # NOTE(review): this bypasses the local cache layers, so the returned
        # object is presumably an uncached BasicS3Dict rather than an
        # S3Dict_FileDirCached -- confirm this is intended.
        return self._main_dict.get_subdict(key)

    def timestamp(self, key: NonEmptyPersiDictKey):
        """Return the last-modification timestamp for ``key``.

        The value comes from the cached wrapper's ``timestamp`` method.
        """
        return self._cached_dict.timestamp(key)

    # Additional methods that might be needed for ETag support
    def get_item_if_etag_changed(self, key: NonEmptyPersiDictKey, etag: Optional[str]):
        """Get item only if ETag has changed (for mutable dicts).

        When the cached wrapper does not provide
        ``get_item_if_etag_changed`` the ``etag`` argument is ignored and
        the item is simply read.
        """
        if not hasattr(self._cached_dict, 'get_item_if_etag_changed'):
            # For append-only dicts, just get the item
            return self._cached_dict[key]
        return self._cached_dict.get_item_if_etag_changed(key, etag)

    def set_item_get_etag(self, key: NonEmptyPersiDictKey, value: Any):
        """Set item and return ETag (for mutable dicts).

        Returns:
            The wrapper's ``set_item_get_etag`` result when that method
            exists; otherwise the item is stored normally and None is
            returned.
        """
        if not hasattr(self._cached_dict, 'set_item_get_etag'):
            # For append-only dicts, just set the item
            self._cached_dict[key] = value
            return None
        return self._cached_dict.set_item_get_etag(key, value)

    def discard(self, key: NonEmptyPersiDictKey) -> bool:
        """Delete an item without raising an exception if it doesn't exist.

        This method fixes the issue where cached dictionaries return multiple
        success counts for a single key deletion: the deletion is attempted
        exactly once through ``__delitem__`` and reported as one boolean.

        Args:
            key: Key to delete.

        Returns:
            bool: True if the deletion completed; False if it raised
            KeyError.
        """
        key = NonEmptySafeStrTuple(key)

        # Guard only the deletion itself; nothing else here can raise KeyError.
        try:
            del self[key]
        except KeyError:
            return False
        return True
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# Expose the class under the shorter name ``S3Dict`` as well.
S3Dict = S3Dict_FileDirCached  # Alias for backward compatibility
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Register the class with the ``parameterizable`` package at import time.
# NOTE(review): presumably this lets instances be rebuilt from the output of
# get_params() -- confirm against the parameterizable documentation.
parameterizable.register_parameterizable_class(S3Dict_FileDirCached)
|
|
@@ -10,47 +10,55 @@ from botocore.exceptions import ClientError
|
|
|
10
10
|
import parameterizable
|
|
11
11
|
from parameterizable.dict_sorter import sort_dict_by_keys
|
|
12
12
|
|
|
13
|
-
from .safe_str_tuple import SafeStrTuple
|
|
13
|
+
from .safe_str_tuple import SafeStrTuple, NonEmptySafeStrTuple
|
|
14
14
|
from .safe_str_tuple_signing import sign_safe_str_tuple, unsign_safe_str_tuple
|
|
15
|
-
from .persi_dict import PersiDict
|
|
16
|
-
from .
|
|
17
|
-
from .file_dir_dict import FileDirDict, PersiDictKey
|
|
15
|
+
from .persi_dict import PersiDict, NonEmptyPersiDictKey
|
|
16
|
+
from .singletons import Joker, EXECUTION_IS_COMPLETE, ETagHasNotChangedFlag
|
|
17
|
+
from .file_dir_dict import FileDirDict, PersiDictKey
|
|
18
18
|
from .overlapping_multi_dict import OverlappingMultiDict
|
|
19
19
|
|
|
20
20
|
S3DICT_DEFAULT_BASE_DIR = "__s3_dict__"
|
|
21
21
|
|
|
22
|
-
class
|
|
23
|
-
"""A persistent dictionary that stores key-value pairs as S3 objects.
|
|
22
|
+
class S3Dict_Legacy(PersiDict):
|
|
23
|
+
"""A persistent dictionary that stores key-value pairs as S3 objects with local caching.
|
|
24
24
|
|
|
25
25
|
Each key-value pair is stored as a separate S3 object in the specified bucket.
|
|
26
|
+
S3Dict_Legacy provides intelligent local caching to minimize S3 API calls and improve
|
|
27
|
+
performance by using conditional requests with ETags to detect changes.
|
|
26
28
|
|
|
27
29
|
A key can be either a string (object name without file extension) or a sequence
|
|
28
30
|
of strings representing a hierarchical path (folder structure ending with an
|
|
29
31
|
object name). Values can be instances of any Python type and are serialized
|
|
30
32
|
to S3 objects.
|
|
31
33
|
|
|
32
|
-
|
|
34
|
+
S3Dict_Legacy supports multiple serialization formats:
|
|
33
35
|
- Binary storage using pickle ('pkl' format)
|
|
34
36
|
- Human-readable text using jsonpickle ('json' format)
|
|
35
37
|
- Plain text for string values (other formats)
|
|
36
38
|
|
|
39
|
+
Key Features:
|
|
40
|
+
- Local file-based caching for improved read performance
|
|
41
|
+
- ETag-based conditional requests to minimize unnecessary downloads
|
|
42
|
+
- Automatic cache invalidation when S3 objects change
|
|
43
|
+
- Seamless fallback to S3 when cached data is stale
|
|
44
|
+
|
|
37
45
|
Note:
|
|
38
46
|
Unlike native Python dictionaries, insertion order is not preserved.
|
|
39
|
-
Operations may incur S3 API costs and network latency
|
|
47
|
+
Operations may incur S3 API costs and network latency, though caching
|
|
48
|
+
significantly reduces this overhead for repeated access patterns.
|
|
40
49
|
"""
|
|
41
50
|
region: str
|
|
42
51
|
bucket_name: str
|
|
43
52
|
root_prefix: str
|
|
44
|
-
file_type: str
|
|
45
53
|
_base_dir: str
|
|
46
54
|
|
|
47
55
|
def __init__(self, bucket_name: str = "my_bucket",
|
|
48
56
|
region: str = None,
|
|
49
57
|
root_prefix: str = "",
|
|
50
58
|
base_dir: str = S3DICT_DEFAULT_BASE_DIR,
|
|
51
|
-
|
|
52
|
-
immutable_items: bool = False,
|
|
59
|
+
serialization_format: str = "pkl",
|
|
53
60
|
digest_len: int = 8,
|
|
61
|
+
append_only: bool = False,
|
|
54
62
|
base_class_for_values: Optional[type] = None,
|
|
55
63
|
*args, **kwargs):
|
|
56
64
|
"""Initialize an S3-backed persistent dictionary.
|
|
@@ -64,15 +72,15 @@ class S3Dict(PersiDict):
|
|
|
64
72
|
stored. A trailing slash is automatically added if missing.
|
|
65
73
|
base_dir: Local directory path used for temporary files and
|
|
66
74
|
local caching of S3 objects.
|
|
67
|
-
|
|
75
|
+
serialization_format: File extension/format for stored values. Supported formats:
|
|
68
76
|
'pkl' (pickle), 'json' (jsonpickle), or custom text formats.
|
|
69
|
-
|
|
77
|
+
append_only: If True, prevents modification of existing items
|
|
70
78
|
after they are initially stored.
|
|
71
79
|
digest_len: Number of base32 MD5 hash characters appended to key
|
|
72
80
|
elements to prevent case-insensitive filename collisions.
|
|
73
81
|
Set to 0 to disable collision prevention.
|
|
74
82
|
base_class_for_values: Optional base class that all stored values
|
|
75
|
-
must inherit from. When specified (and not str),
|
|
83
|
+
must inherit from. When specified (and not str), serialization_format
|
|
76
84
|
must be 'pkl' or 'json' for proper serialization.
|
|
77
85
|
*args: Additional positional arguments (ignored, reserved for compatibility).
|
|
78
86
|
**kwargs: Additional keyword arguments (ignored, reserved for compatibility).
|
|
@@ -82,28 +90,29 @@ class S3Dict(PersiDict):
|
|
|
82
90
|
allow. Network connectivity and valid AWS credentials are required.
|
|
83
91
|
"""
|
|
84
92
|
|
|
85
|
-
super().__init__(
|
|
86
|
-
,
|
|
87
|
-
|
|
88
|
-
self.
|
|
89
|
-
|
|
93
|
+
super().__init__(append_only=append_only,
|
|
94
|
+
base_class_for_values=base_class_for_values,
|
|
95
|
+
serialization_format=serialization_format)
|
|
96
|
+
individual_subdicts_params = {self.serialization_format: {}}
|
|
97
|
+
|
|
98
|
+
if not append_only:
|
|
99
|
+
self.etag_serialization_format = f"{self.serialization_format}_etag"
|
|
100
|
+
individual_subdicts_params[self.etag_serialization_format] = {
|
|
101
|
+
"base_class_for_values": str}
|
|
90
102
|
|
|
91
103
|
self.local_cache = OverlappingMultiDict(
|
|
92
104
|
dict_type=FileDirDict,
|
|
93
105
|
shared_subdicts_params={
|
|
94
106
|
"base_dir": base_dir,
|
|
95
|
-
"
|
|
107
|
+
"append_only": append_only,
|
|
96
108
|
"base_class_for_values": base_class_for_values,
|
|
97
109
|
"digest_len": digest_len
|
|
98
110
|
},
|
|
99
|
-
**
|
|
100
|
-
self.file_type: {},
|
|
101
|
-
self.etag_file_type: {"base_class_for_values": str}
|
|
102
|
-
}
|
|
103
|
-
)
|
|
111
|
+
**individual_subdicts_params)
|
|
104
112
|
|
|
105
|
-
self.main_cache = getattr(self.local_cache, self.
|
|
106
|
-
|
|
113
|
+
self.main_cache = getattr(self.local_cache, self.serialization_format)
|
|
114
|
+
if not self.append_only:
|
|
115
|
+
self.etag_cache = getattr(self.local_cache, self.etag_serialization_format)
|
|
107
116
|
|
|
108
117
|
self.region = region
|
|
109
118
|
if region is None:
|
|
@@ -138,10 +147,14 @@ class S3Dict(PersiDict):
|
|
|
138
147
|
|
|
139
148
|
self.bucket_name = bucket_name
|
|
140
149
|
|
|
141
|
-
self.root_prefix=root_prefix
|
|
150
|
+
self.root_prefix = root_prefix
|
|
142
151
|
if len(self.root_prefix) and self.root_prefix[-1] != "/":
|
|
143
152
|
self.root_prefix += "/"
|
|
144
153
|
|
|
154
|
+
@property
|
|
155
|
+
def digest_len(self) -> int:
|
|
156
|
+
return self.main_cache.digest_len
|
|
157
|
+
|
|
145
158
|
|
|
146
159
|
def get_params(self):
|
|
147
160
|
"""Return configuration parameters as a dictionary.
|
|
@@ -187,23 +200,23 @@ class S3Dict(PersiDict):
|
|
|
187
200
|
return self.main_cache.base_dir
|
|
188
201
|
|
|
189
202
|
|
|
190
|
-
def _build_full_objectname(self, key:
|
|
203
|
+
def _build_full_objectname(self, key: NonEmptyPersiDictKey) -> str:
|
|
191
204
|
"""Convert a key into a full S3 object key.
|
|
192
205
|
|
|
193
206
|
Args:
|
|
194
207
|
key: Dictionary key (string or sequence of strings) or SafeStrTuple.
|
|
195
208
|
|
|
196
209
|
Returns:
|
|
197
|
-
str: The complete S3 object key including root_prefix and
|
|
210
|
+
str: The complete S3 object key including root_prefix and serialization_format
|
|
198
211
|
extension, with digest-based collision prevention applied if enabled.
|
|
199
212
|
"""
|
|
200
|
-
key =
|
|
201
|
-
key = sign_safe_str_tuple(key,
|
|
202
|
-
objectname = self.root_prefix +
|
|
213
|
+
key = NonEmptySafeStrTuple(key)
|
|
214
|
+
key = sign_safe_str_tuple(key, 0)
|
|
215
|
+
objectname = self.root_prefix + "/".join(key) + "." + self.serialization_format
|
|
203
216
|
return objectname
|
|
204
217
|
|
|
205
218
|
|
|
206
|
-
def __contains__(self, key:
|
|
219
|
+
def __contains__(self, key: NonEmptyPersiDictKey) -> bool:
|
|
207
220
|
"""Check if the specified key exists in the dictionary.
|
|
208
221
|
|
|
209
222
|
For immutable dictionaries, checks the local cache first. Otherwise,
|
|
@@ -216,8 +229,8 @@ class S3Dict(PersiDict):
|
|
|
216
229
|
bool: True if the key exists in S3 (or local cache for immutable
|
|
217
230
|
items), False otherwise.
|
|
218
231
|
"""
|
|
219
|
-
key =
|
|
220
|
-
if self.
|
|
232
|
+
key = NonEmptySafeStrTuple(key)
|
|
233
|
+
if self.append_only and key in self.main_cache:
|
|
221
234
|
return True
|
|
222
235
|
try:
|
|
223
236
|
obj_name = self._build_full_objectname(key)
|
|
@@ -225,14 +238,14 @@ class S3Dict(PersiDict):
|
|
|
225
238
|
return True
|
|
226
239
|
except ClientError as e:
|
|
227
240
|
if e.response['ResponseMetadata']['HTTPStatusCode'] == 404:
|
|
228
|
-
self.main_cache.
|
|
229
|
-
self.etag_cache.
|
|
241
|
+
self.main_cache.discard(key)
|
|
242
|
+
self.etag_cache.discard(key)
|
|
230
243
|
return False
|
|
231
244
|
else:
|
|
232
245
|
raise
|
|
233
246
|
|
|
234
247
|
|
|
235
|
-
def __getitem__(self, key:
|
|
248
|
+
def __getitem__(self, key: NonEmptyPersiDictKey) -> Any:
|
|
236
249
|
"""Retrieve the value stored for a key.
|
|
237
250
|
|
|
238
251
|
For immutable dictionaries with cached values, returns the cached copy.
|
|
@@ -249,15 +262,15 @@ class S3Dict(PersiDict):
|
|
|
249
262
|
KeyError: If the key does not exist in S3.
|
|
250
263
|
"""
|
|
251
264
|
|
|
252
|
-
key =
|
|
265
|
+
key = NonEmptySafeStrTuple(key)
|
|
253
266
|
|
|
254
|
-
if self.
|
|
267
|
+
if self.append_only and key in self.main_cache:
|
|
255
268
|
return self.main_cache[key]
|
|
256
269
|
|
|
257
270
|
obj_name = self._build_full_objectname(key)
|
|
258
271
|
|
|
259
272
|
cached_etag = None
|
|
260
|
-
if not self.
|
|
273
|
+
if not self.append_only and key in self.main_cache and key in self.etag_cache:
|
|
261
274
|
cached_etag = self.etag_cache[key]
|
|
262
275
|
|
|
263
276
|
try:
|
|
@@ -268,19 +281,22 @@ class S3Dict(PersiDict):
|
|
|
268
281
|
response = self.s3_client.get_object(**get_kwargs)
|
|
269
282
|
|
|
270
283
|
# 200 OK: object was downloaded, either because it's new or changed.
|
|
271
|
-
s3_etag = response.get("ETag")
|
|
272
284
|
body = response['Body']
|
|
273
285
|
|
|
274
286
|
# Deserialize and cache the S3 object content
|
|
275
|
-
if self.
|
|
287
|
+
if self.serialization_format == 'json':
|
|
276
288
|
deserialized_value = jsonpickle.loads(body.read().decode('utf-8'))
|
|
277
|
-
elif self.
|
|
289
|
+
elif self.serialization_format == 'pkl':
|
|
278
290
|
deserialized_value = joblib.load(body)
|
|
279
291
|
else:
|
|
280
292
|
deserialized_value = body.read().decode('utf-8')
|
|
281
293
|
|
|
282
294
|
self.main_cache[key] = deserialized_value
|
|
283
|
-
|
|
295
|
+
|
|
296
|
+
if not self.append_only:
|
|
297
|
+
# Cache the S3 ETag for future conditional requests
|
|
298
|
+
s3_etag = response.get("ETag")
|
|
299
|
+
self.etag_cache[key] = s3_etag
|
|
284
300
|
|
|
285
301
|
except ClientError as e:
|
|
286
302
|
if e.response['ResponseMetadata']['HTTPStatusCode'] == 304:
|
|
@@ -295,7 +311,7 @@ class S3Dict(PersiDict):
|
|
|
295
311
|
return self.main_cache[key]
|
|
296
312
|
|
|
297
313
|
|
|
298
|
-
def __setitem__(self, key:
|
|
314
|
+
def __setitem__(self, key: NonEmptyPersiDictKey, value: Any):
|
|
299
315
|
"""Store a value for a key in both S3 and local cache.
|
|
300
316
|
|
|
301
317
|
Handles special joker values (KEEP_CURRENT, DELETE_CURRENT) for
|
|
@@ -304,22 +320,20 @@ class S3Dict(PersiDict):
|
|
|
304
320
|
the S3 ETag for efficient future retrievals.
|
|
305
321
|
|
|
306
322
|
Args:
|
|
307
|
-
key: Dictionary key (string or sequence of strings) or
|
|
323
|
+
key: Dictionary key (string or sequence of strings) or NonEmptyPersiDictKey.
|
|
308
324
|
value: Value to store, or a joker command (KEEP_CURRENT or
|
|
309
|
-
DELETE_CURRENT
|
|
325
|
+
DELETE_CURRENT).
|
|
310
326
|
|
|
311
327
|
Raises:
|
|
312
328
|
KeyError: If attempting to modify an existing item when
|
|
313
|
-
|
|
329
|
+
append_only is True.
|
|
314
330
|
TypeError: If value is a PersiDict instance or does not match
|
|
315
331
|
the required base_class_for_values when specified.
|
|
316
332
|
"""
|
|
317
333
|
|
|
318
|
-
key =
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
# Joker values (KEEP_CURRENT, DELETE_CURRENT) are handled by base class
|
|
322
|
-
return
|
|
334
|
+
key = NonEmptySafeStrTuple(key)
|
|
335
|
+
if self._process_setitem_args(key, value) is EXECUTION_IS_COMPLETE:
|
|
336
|
+
return None
|
|
323
337
|
|
|
324
338
|
obj_name = self._build_full_objectname(key)
|
|
325
339
|
|
|
@@ -330,6 +344,11 @@ class S3Dict(PersiDict):
|
|
|
330
344
|
file_path = self.main_cache._build_full_path(key)
|
|
331
345
|
self.s3_client.upload_file(file_path, self.bucket_name, obj_name)
|
|
332
346
|
|
|
347
|
+
if self.append_only:
|
|
348
|
+
# For immutable items, the local cache is authoritative; no need to
|
|
349
|
+
# verify ETag from S3 as the item cannot change after initial upload
|
|
350
|
+
return
|
|
351
|
+
|
|
333
352
|
try:
|
|
334
353
|
# Cache the S3 ETag for efficient conditional requests on future reads
|
|
335
354
|
head = self.s3_client.head_object(
|
|
@@ -337,24 +356,25 @@ class S3Dict(PersiDict):
|
|
|
337
356
|
self.etag_cache[key] = head.get("ETag")
|
|
338
357
|
except ClientError:
|
|
339
358
|
# Remove stale ETag on failure to force fresh downloads later
|
|
340
|
-
self.etag_cache.
|
|
359
|
+
self.etag_cache.discard(key)
|
|
341
360
|
|
|
342
361
|
|
|
343
|
-
def __delitem__(self, key:
|
|
362
|
+
def __delitem__(self, key: NonEmptyPersiDictKey):
|
|
344
363
|
"""Delete the stored value for a key from both S3 and local cache.
|
|
345
364
|
|
|
346
365
|
Args:
|
|
347
|
-
key: Dictionary key (string or sequence of strings)
|
|
366
|
+
key: Dictionary key (string or sequence of strings)
|
|
367
|
+
or NonEmptyPersiDictKey.
|
|
348
368
|
|
|
349
369
|
Raises:
|
|
350
|
-
KeyError: If
|
|
370
|
+
KeyError: If append_only is True, or if the key does not exist.
|
|
351
371
|
"""
|
|
352
|
-
key =
|
|
353
|
-
|
|
372
|
+
key = NonEmptySafeStrTuple(key)
|
|
373
|
+
self._process_delitem_args(key)
|
|
354
374
|
obj_name = self._build_full_objectname(key)
|
|
355
|
-
self.s3_client.delete_object(Bucket
|
|
356
|
-
self.etag_cache.
|
|
357
|
-
self.main_cache.
|
|
375
|
+
self.s3_client.delete_object(Bucket=self.bucket_name, Key=obj_name)
|
|
376
|
+
self.etag_cache.discard(key)
|
|
377
|
+
self.main_cache.discard(key)
|
|
358
378
|
|
|
359
379
|
|
|
360
380
|
def __len__(self) -> int:
|
|
@@ -370,11 +390,11 @@ class S3Dict(PersiDict):
|
|
|
370
390
|
"""
|
|
371
391
|
|
|
372
392
|
num_files = 0
|
|
373
|
-
suffix = "." + self.
|
|
393
|
+
suffix = "." + self.serialization_format
|
|
374
394
|
|
|
375
395
|
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
376
396
|
page_iterator = paginator.paginate(
|
|
377
|
-
Bucket=self.bucket_name, Prefix
|
|
397
|
+
Bucket=self.bucket_name, Prefix=self.root_prefix)
|
|
378
398
|
|
|
379
399
|
for page in page_iterator:
|
|
380
400
|
contents = page.get("Contents")
|
|
@@ -412,10 +432,10 @@ class S3Dict(PersiDict):
|
|
|
412
432
|
unsupported field names).
|
|
413
433
|
"""
|
|
414
434
|
|
|
415
|
-
|
|
435
|
+
self._process_generic_iter_args(result_type)
|
|
416
436
|
|
|
417
|
-
suffix = "." + self.
|
|
418
|
-
ext_len = len(self.
|
|
437
|
+
suffix = "." + self.serialization_format
|
|
438
|
+
ext_len = len(self.serialization_format) + 1
|
|
419
439
|
prefix_len = len(self.root_prefix)
|
|
420
440
|
|
|
421
441
|
def splitter(full_name: str) -> SafeStrTuple:
|
|
@@ -446,7 +466,7 @@ class S3Dict(PersiDict):
|
|
|
446
466
|
"""
|
|
447
467
|
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
448
468
|
page_iterator = paginator.paginate(
|
|
449
|
-
Bucket=self.bucket_name, Prefix
|
|
469
|
+
Bucket=self.bucket_name, Prefix=self.root_prefix)
|
|
450
470
|
|
|
451
471
|
for page in page_iterator:
|
|
452
472
|
contents = page.get("Contents")
|
|
@@ -462,7 +482,7 @@ class S3Dict(PersiDict):
|
|
|
462
482
|
|
|
463
483
|
if "keys" in result_type:
|
|
464
484
|
key_to_return = unsign_safe_str_tuple(
|
|
465
|
-
obj_key,
|
|
485
|
+
obj_key, 0)
|
|
466
486
|
to_return.append(key_to_return)
|
|
467
487
|
|
|
468
488
|
if "values" in result_type:
|
|
@@ -480,10 +500,11 @@ class S3Dict(PersiDict):
|
|
|
480
500
|
return step()
|
|
481
501
|
|
|
482
502
|
|
|
483
|
-
def get_subdict(self, key: PersiDictKey) ->
|
|
503
|
+
def get_subdict(self, key: PersiDictKey) -> S3Dict_Legacy:
|
|
484
504
|
"""Create a subdictionary scoped to items with the specified prefix.
|
|
485
505
|
|
|
486
506
|
Returns an empty subdictionary if no items exist under the prefix.
|
|
507
|
+
If the prefix is empty, the entire dictionary is returned.
|
|
487
508
|
This method is not part of the standard Python dictionary interface.
|
|
488
509
|
|
|
489
510
|
Args:
|
|
@@ -491,35 +512,34 @@ class S3Dict(PersiDict):
|
|
|
491
512
|
used to scope items stored under this dictionary.
|
|
492
513
|
|
|
493
514
|
Returns:
|
|
494
|
-
|
|
495
|
-
key, sharing the parent's bucket, region,
|
|
515
|
+
S3Dict_Legacy: A new S3Dict instance with root_prefix extended by the given
|
|
516
|
+
key, sharing the parent's bucket, region, serialization_format, and other
|
|
496
517
|
configuration settings.
|
|
497
518
|
"""
|
|
498
519
|
|
|
499
520
|
key = SafeStrTuple(key)
|
|
500
521
|
if len(key):
|
|
501
|
-
key = sign_safe_str_tuple(key,
|
|
502
|
-
full_root_prefix = self.root_prefix +
|
|
522
|
+
key = sign_safe_str_tuple(key, 0)
|
|
523
|
+
full_root_prefix = self.root_prefix + "/".join(key)
|
|
503
524
|
else:
|
|
504
525
|
full_root_prefix = self.root_prefix
|
|
505
526
|
|
|
506
527
|
new_dir_path = self.main_cache._build_full_path(
|
|
507
|
-
key, create_subdirs
|
|
508
|
-
|
|
509
|
-
new_dict =
|
|
510
|
-
bucket_name
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
, base_class_for_values = self.base_class_for_values)
|
|
528
|
+
key, create_subdirs=True, is_file_path=False)
|
|
529
|
+
|
|
530
|
+
new_dict = S3Dict_Legacy(
|
|
531
|
+
bucket_name=self.bucket_name,
|
|
532
|
+
region=self.region,
|
|
533
|
+
root_prefix=full_root_prefix,
|
|
534
|
+
base_dir=new_dir_path,
|
|
535
|
+
serialization_format=self.serialization_format,
|
|
536
|
+
append_only=self.append_only,
|
|
537
|
+
base_class_for_values=self.base_class_for_values)
|
|
518
538
|
|
|
519
539
|
return new_dict
|
|
520
540
|
|
|
521
541
|
|
|
522
|
-
def timestamp(self, key:
|
|
542
|
+
def timestamp(self, key: NonEmptyPersiDictKey) -> float:
|
|
523
543
|
"""Get the last modification timestamp for a key.
|
|
524
544
|
|
|
525
545
|
This method is not part of the standard Python dictionary interface.
|
|
@@ -535,10 +555,11 @@ class S3Dict(PersiDict):
|
|
|
535
555
|
Raises:
|
|
536
556
|
KeyError: If the key does not exist in S3.
|
|
537
557
|
"""
|
|
538
|
-
key =
|
|
558
|
+
key = NonEmptySafeStrTuple(key)
|
|
539
559
|
obj_name = self._build_full_objectname(key)
|
|
540
560
|
response = self.s3_client.head_object(Bucket=self.bucket_name, Key=obj_name)
|
|
541
561
|
return response["LastModified"].timestamp()
|
|
542
562
|
|
|
543
563
|
|
|
544
|
-
parameterizable.register_parameterizable_class(
|
|
564
|
+
parameterizable.register_parameterizable_class(S3Dict_Legacy)
|
|
565
|
+
|