nucliadb-utils 6.9.7.post5482__py3-none-any.whl → 6.10.0.post5689__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of nucliadb-utils might be problematic.
- nucliadb_utils/asyncio_utils.py +3 -3
- nucliadb_utils/audit/audit.py +32 -22
- nucliadb_utils/audit/basic.py +22 -23
- nucliadb_utils/audit/stream.py +31 -31
- nucliadb_utils/authentication.py +8 -10
- nucliadb_utils/cache/nats.py +10 -12
- nucliadb_utils/cache/pubsub.py +5 -4
- nucliadb_utils/cache/settings.py +2 -3
- nucliadb_utils/debug.py +2 -2
- nucliadb_utils/encryption/settings.py +1 -2
- nucliadb_utils/fastapi/openapi.py +1 -2
- nucliadb_utils/fastapi/versioning.py +10 -6
- nucliadb_utils/featureflagging.py +7 -4
- nucliadb_utils/grpc.py +3 -3
- nucliadb_utils/helpers.py +1 -1
- nucliadb_utils/nats.py +15 -16
- nucliadb_utils/nuclia_usage/utils/kb_usage_report.py +4 -5
- nucliadb_utils/run.py +1 -1
- nucliadb_utils/settings.py +40 -41
- nucliadb_utils/signals.py +3 -3
- nucliadb_utils/storages/azure.py +18 -18
- nucliadb_utils/storages/gcs.py +22 -21
- nucliadb_utils/storages/local.py +8 -8
- nucliadb_utils/storages/nuclia.py +1 -2
- nucliadb_utils/storages/object_store.py +6 -6
- nucliadb_utils/storages/s3.py +22 -22
- nucliadb_utils/storages/settings.py +7 -8
- nucliadb_utils/storages/storage.py +29 -45
- nucliadb_utils/storages/utils.py +2 -3
- nucliadb_utils/store.py +2 -2
- nucliadb_utils/tests/asyncbenchmark.py +8 -10
- nucliadb_utils/tests/azure.py +2 -1
- nucliadb_utils/tests/fixtures.py +3 -2
- nucliadb_utils/tests/gcs.py +3 -2
- nucliadb_utils/tests/local.py +2 -1
- nucliadb_utils/tests/nats.py +1 -1
- nucliadb_utils/tests/s3.py +2 -1
- nucliadb_utils/transaction.py +16 -18
- nucliadb_utils/utilities.py +22 -24
- {nucliadb_utils-6.9.7.post5482.dist-info → nucliadb_utils-6.10.0.post5689.dist-info}/METADATA +5 -5
- nucliadb_utils-6.10.0.post5689.dist-info/RECORD +59 -0
- nucliadb_utils-6.9.7.post5482.dist-info/RECORD +0 -59
- {nucliadb_utils-6.9.7.post5482.dist-info → nucliadb_utils-6.10.0.post5689.dist-info}/WHEEL +0 -0
- {nucliadb_utils-6.9.7.post5482.dist-info → nucliadb_utils-6.10.0.post5689.dist-info}/top_level.txt +0 -0
nucliadb_utils/storages/gcs.py
CHANGED
@@ -23,10 +23,11 @@ import asyncio
 import base64
 import json
 import socket
+from collections.abc import AsyncGenerator, AsyncIterator
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from datetime import datetime
-from typing import Any,
+from typing import Any, cast
 from urllib.parse import quote_plus
 
 import aiohttp
@@ -153,7 +154,7 @@ class GCSStorageField(StorageField):
         assert data["resource"]["name"] == destination_uri
 
     @storage_ops_observer.wrap({"type": "iter_data"})
-    async def iter_data(self, range:
+    async def iter_data(self, range: Range | None = None) -> AsyncGenerator[bytes]:
         attempt = 1
         while True:
             try:
@@ -170,13 +171,13 @@ class GCSStorageField(StorageField):
                     raise
                 wait_time = 2 ** (attempt - 1)
                 logger.warning(
-                    f"Error downloading from GCP. Retrying ({attempt} of {MAX_TRIES}) after {wait_time} seconds. Error: {ex}"
+                    f"Error downloading from GCP. Retrying ({attempt} of {MAX_TRIES}) after {wait_time} seconds. Error: {ex}"
                 )
                 await asyncio.sleep(wait_time)
                 attempt += 1
 
     @storage_ops_observer.wrap({"type": "inner_iter_data"})
-    async def _inner_iter_data(self, range:
+    async def _inner_iter_data(self, range: Range | None = None):
         """
         Iterate through object data.
         """
@@ -322,7 +323,7 @@ class GCSStorageField(StorageField):
         async with self.storage.session.put(
             self.field.resumable_uri, headers=headers, data=data
         ) as call:
-            text = await call.text()
+            text = await call.text()
             if call.status not in [200, 201, 308]:
                 if call.status == 410:
                     raise ResumableUploadGone(text)
@@ -377,7 +378,7 @@ class GCSStorageField(StorageField):
         max_tries=MAX_TRIES,
     )
     @storage_ops_observer.wrap({"type": "exists"})
-    async def exists(self) ->
+    async def exists(self) -> ObjectMetadata | None:
         """
         Existence can be checked either with a CloudFile data in the field attribute
         or own StorageField key and bucket. Field takes precendece
@@ -425,23 +426,23 @@ class GCSStorageField(StorageField):
 
 class GCSStorage(Storage):
     field_klass = GCSStorageField
-    _session:
+    _session: aiohttp.ClientSession | None = None
     _credentials = None
     _json_credentials = None
     chunk_size = CHUNK_SIZE
 
     def __init__(
         self,
-        account_credentials:
-        bucket:
-        location:
-        project:
-        executor:
-        deadletter_bucket:
-        indexing_bucket:
-        labels:
+        account_credentials: str | None = None,
+        bucket: str | None = None,
+        location: str | None = None,
+        project: str | None = None,
+        executor: ThreadPoolExecutor | None = None,
+        deadletter_bucket: str | None = None,
+        indexing_bucket: str | None = None,
+        labels: dict[str, str] | None = None,
         url: str = "https://www.googleapis.com",
-        scopes:
+        scopes: list[str] | None = None,
         anonymous: bool = False,
     ):
         if anonymous:
@@ -533,7 +534,7 @@ class GCSStorage(Storage):
     @storage_ops_observer.wrap({"type": "delete"})
     async def delete_upload(self, uri: str, bucket_name: str):
         if uri:
-            url = "{}/{}/o/{
+            url = f"{self.object_base_url}/{bucket_name}/o/{quote_plus(uri)}"
             headers = await self.get_access_headers()
             async with self.session.delete(url, headers=headers) as resp:
                 if resp.status in (200, 204, 404):
@@ -569,7 +570,7 @@ class GCSStorage(Storage):
         max_tries=MAX_TRIES,
     )
     @storage_ops_observer.wrap({"type": "create_bucket"})
-    async def create_bucket(self, bucket_name: str, kbid:
+    async def create_bucket(self, bucket_name: str, kbid: str | None = None):
         if await self.check_exists(bucket_name=bucket_name):
             return
 
@@ -671,9 +672,9 @@ class GCSStorage(Storage):
         return deleted, conflict
 
     async def iterate_objects(
-        self, bucket: str, prefix: str, start:
-    ) -> AsyncGenerator[ObjectInfo
-        url = "{}/{}/o"
+        self, bucket: str, prefix: str, start: str | None = None
+    ) -> AsyncGenerator[ObjectInfo]:
+        url = f"{self.object_base_url}/{bucket}/o"
         headers = await self.get_access_headers()
         params = {"prefix": prefix}
         if start:
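
Most of the changes in this file, and in the other storage backends below, migrate annotations from typing.Optional / typing.AsyncGenerator to PEP 604 unions (X | None) and collections.abc generics. A minimal, self-contained sketch of the new style (illustrative code, not taken from the package; the single-parameter AsyncGenerator[bytes] form relies on the send type defaulting to None, as in recent typing versions):

import asyncio
from collections.abc import AsyncGenerator


async def iter_chunks(data: bytes, chunk_size: int | None = None) -> AsyncGenerator[bytes]:
    # Yield the payload in fixed-size pieces; a None chunk_size falls back to a small default.
    size = chunk_size or 4
    for offset in range(0, len(data), size):
        yield data[offset : offset + size]


async def main() -> None:
    async for piece in iter_chunks(b"abcdefghij", 3):
        print(piece)


asyncio.run(main())
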
nucliadb_utils/storages/local.py
CHANGED
@@ -23,8 +23,8 @@ import glob
 import json
 import os
 import shutil
+from collections.abc import AsyncGenerator, AsyncIterator
 from datetime import datetime
-from typing import AsyncGenerator, AsyncIterator, Optional
 
 import aiofiles
 
@@ -38,7 +38,7 @@ class LocalStorageField(StorageField):
     storage: LocalStorage
     _handler = None
 
-    def metadata_key(self, uri:
+    def metadata_key(self, uri: str | None = None):
         if uri is None and self.field is not None:
             return f"{self.field.uri}.metadata"
         elif uri is None and self.key is not None:
@@ -73,7 +73,7 @@ class LocalStorageField(StorageField):
         destination_path = f"{destination_bucket_path}/{destination_uri}"
         shutil.copy(origin_path, destination_path)
 
-    async def iter_data(self, range:
+    async def iter_data(self, range: Range | None = None) -> AsyncGenerator[bytes]:
         range = range or Range()
         key = self.field.uri if self.field else self.key
         if self.field is None:
@@ -191,7 +191,7 @@ class LocalStorageField(StorageField):
         self.field.ClearField("offset")
         self.field.ClearField("upload_uri")
 
-    async def exists(self) ->
+    async def exists(self) -> ObjectMetadata | None:
         file_path = self.storage.get_file_path(self.bucket, self.key)
         metadata_path = self.metadata_key(file_path)
         if os.path.exists(metadata_path):
@@ -218,7 +218,7 @@ class LocalStorage(Storage):
     field_klass = LocalStorageField
     chunk_size = CHUNK_SIZE
 
-    def __init__(self, local_testing_files: str, indexing_bucket:
+    def __init__(self, local_testing_files: str, indexing_bucket: str | None = None):
         self.local_testing_files = local_testing_files.rstrip("/")
         self.bucket_format = "ndb_{kbid}"
         self.source = CloudFile.LOCAL
@@ -281,8 +281,8 @@ class LocalStorage(Storage):
         return deleted
 
     async def iterate_objects(
-        self, bucket: str, prefix: str, start:
-    ) -> AsyncGenerator[ObjectInfo
+        self, bucket: str, prefix: str, start: str | None = None
+    ) -> AsyncGenerator[ObjectInfo]:
         bucket_path = self.get_bucket_path(bucket)
         pathname = f"{self.get_file_path(bucket, prefix)}**/*"
         for key in sorted(glob.glob(pathname, recursive=True)):
@@ -296,7 +296,7 @@ class LocalStorage(Storage):
                 continue
             yield ObjectInfo(name=name)
 
-    async def download(self, bucket: str, key: str, range:
+    async def download(self, bucket: str, key: str, range: Range | None = None):
         key_path = self.get_file_path(bucket, key)
         if not os.path.exists(key_path):
             return
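
LocalStorage.iterate_objects keeps listing files with glob under a bucket directory; only its annotations changed in this release. A hedged, standalone sketch of that kind of prefix listing (the start handling is an assumption about pagination semantics, and none of these names come from the package):

import glob
import os
from collections.abc import Iterator


def iterate_local_objects(bucket_path: str, prefix: str, start: str | None = None) -> Iterator[str]:
    # List every regular file under bucket_path whose relative name begins with the prefix.
    pattern = os.path.join(bucket_path, prefix) + "**/*"
    for path in sorted(glob.glob(pattern, recursive=True)):
        if not os.path.isfile(path):
            continue
        name = os.path.relpath(path, bucket_path)
        if start is not None and name <= start:
            # Assumed semantics: resume the listing after the given key.
            continue
        yield name
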
nucliadb_utils/storages/nuclia.py
CHANGED

@@ -17,7 +17,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Optional
 
 import aiohttp
 
@@ -32,7 +31,7 @@ class NucliaStorage:
         self,
         nuclia_public_url: str,
         nuclia_zone: str,
-        service_account:
+        service_account: str | None = None,
     ):
         self.service_account = service_account
         self.nuclia_public_url = nuclia_public_url.format(zone=nuclia_zone)
nucliadb_utils/storages/object_store.py
CHANGED

@@ -19,7 +19,7 @@
 #
 
 import abc
-from
+from collections.abc import AsyncGenerator, AsyncIterator
 
 from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 
@@ -37,7 +37,7 @@ class ObjectStore(abc.ABC, metaclass=abc.ABCMeta):
     async def finalize(self) -> None: ...
 
     @abc.abstractmethod
-    async def bucket_create(self, bucket: str, labels:
+    async def bucket_create(self, bucket: str, labels: dict[str, str] | None = None) -> bool:
         """
         Create a new bucket in the object storage. Labels the bucket with the given labels if provided.
         Returns True if the bucket was created, False if it already existed.
@@ -93,7 +93,7 @@ class ObjectStore(abc.ABC, metaclass=abc.ABCMeta):
         self,
         bucket: str,
         key: str,
-        data:
+        data: bytes | AsyncGenerator[bytes, None],
         metadata: ObjectMetadata,
     ) -> None: ...
 
@@ -114,14 +114,14 @@ class ObjectStore(abc.ABC, metaclass=abc.ABCMeta):
 
     @abc.abstractmethod
     async def download_stream(
-        self, bucket: str, key: str, range:
+        self, bucket: str, key: str, range: Range | None = None
     ) -> AsyncGenerator[bytes, None]:
         raise NotImplementedError()
         yield b""
 
     @abc.abstractmethod
     async def iterate(
-        self, bucket: str, prefix: str, start:
+        self, bucket: str, prefix: str, start: str | None = None
     ) -> AsyncGenerator[ObjectInfo, None]:
         raise NotImplementedError()
         yield ObjectInfo(name="")
@@ -132,7 +132,7 @@ class ObjectStore(abc.ABC, metaclass=abc.ABCMeta):
     @abc.abstractmethod
     async def upload_multipart_start(
         self, bucket: str, key: str, metadata: ObjectMetadata
-    ) ->
+    ) -> str | None:
         """
         Start a multipart upload. May return the url for the resumable upload.
         """
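
The ObjectStore ABC above now spells its abstract signatures with built-in generics. As a rough illustration of the shape of that interface, here is a hedged in-memory sketch loosely modeled on the signatures visible in this diff; it does not subclass the package's real ABC, which has more methods than shown here:

import asyncio
from collections.abc import AsyncGenerator


class InMemoryObjectStore:
    def __init__(self) -> None:
        self.buckets: dict[str, dict[str, bytes]] = {}

    async def bucket_create(self, bucket: str, labels: dict[str, str] | None = None) -> bool:
        # True if the bucket did not exist before this call.
        created = bucket not in self.buckets
        self.buckets.setdefault(bucket, {})
        return created

    async def upload(self, bucket: str, key: str, data: bytes) -> None:
        self.buckets[bucket][key] = data

    async def download_stream(self, bucket: str, key: str, chunk_size: int = 4) -> AsyncGenerator[bytes, None]:
        blob = self.buckets[bucket][key]
        for i in range(0, len(blob), chunk_size):
            yield blob[i : i + chunk_size]

    async def iterate(self, bucket: str, prefix: str) -> AsyncGenerator[str, None]:
        for name in sorted(self.buckets[bucket]):
            if name.startswith(prefix):
                yield name


async def _demo() -> None:
    store = InMemoryObjectStore()
    await store.bucket_create("kb")
    await store.upload("kb", "resources/1", b"hello world")
    async for chunk in store.download_stream("kb", "resources/1"):
        print(chunk)


asyncio.run(_demo())
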
nucliadb_utils/storages/s3.py
CHANGED
@@ -20,9 +20,9 @@
 from __future__ import annotations
 
 import base64
+from collections.abc import AsyncGenerator, AsyncIterator
 from contextlib import AsyncExitStack
 from datetime import datetime
-from typing import AsyncGenerator, AsyncIterator, Optional
 
 import aiobotocore # type: ignore
 import aiohttp
@@ -55,12 +55,12 @@ RETRIABLE_EXCEPTIONS = (
 POLICY_DELETE = {
     "Rules": [
         {
-            "Expiration": {"Days":
+            "Expiration": {"Days": 7},
             "ID": "FullDelete",
             "Filter": {"Prefix": ""},
             "Status": "Enabled",
-            "NoncurrentVersionExpiration": {"NoncurrentDays":
-            "AbortIncompleteMultipartUpload": {"DaysAfterInitiation":
+            "NoncurrentVersionExpiration": {"NoncurrentDays": 7},
+            "AbortIncompleteMultipartUpload": {"DaysAfterInitiation": 7},
         },
         {
            "Expiration": {"ExpiredObjectDeleteMarker": True},
@@ -86,7 +86,7 @@ class S3StorageField(StorageField):
         self,
         uri,
         bucket,
-        range:
+        range: Range | None = None,
     ):
         range = range or Range()
         if range.any():
@@ -103,7 +103,7 @@ class S3StorageField(StorageField):
             raise
 
     @s3_ops_observer.wrap({"type": "iter_data"})
-    async def iter_data(self, range:
+    async def iter_data(self, range: Range | None = None) -> AsyncGenerator[bytes]:
         # Suports field and key based iter
         uri = self.field.uri if self.field else self.key
         if self.field is None:
@@ -277,7 +277,7 @@ class S3StorageField(StorageField):
         )
 
     @s3_ops_observer.wrap({"type": "exists"})
-    async def exists(self) ->
+    async def exists(self) -> ObjectMetadata | None:
         """
         Existence can be checked either with a CloudFile data in the field attribute
         or own StorageField key and bucket. Field takes precendece
@@ -348,18 +348,18 @@ class S3Storage(Storage):
 
     def __init__(
         self,
-        aws_client_id:
-        aws_client_secret:
-        deadletter_bucket:
-        indexing_bucket:
-        endpoint_url:
+        aws_client_id: str | None = None,
+        aws_client_secret: str | None = None,
+        deadletter_bucket: str | None = None,
+        indexing_bucket: str | None = None,
+        endpoint_url: str | None = None,
         verify_ssl: bool = True,
         use_ssl: bool = True,
-        region_name:
-        kms_key_id:
+        region_name: str | None = None,
+        kms_key_id: str | None = None,
         max_pool_connections: int = 30,
-        bucket:
-        bucket_tags:
+        bucket: str | None = None,
+        bucket_tags: dict[str, str] | None = None,
     ):
         self.source = CloudFile.S3
         self.deadletter_bucket = deadletter_bucket
@@ -394,7 +394,7 @@ class S3Storage(Storage):
             self._session = get_session()
         return self._session
 
-    async def initialize(self:
+    async def initialize(self: S3Storage) -> None:
         session = AioSession()
         self._s3aioclient: AioBaseClient = await self._exit_stack.enter_async_context(
             session.create_client("s3", **self.opts)
@@ -425,8 +425,8 @@ class S3Storage(Storage):
             raise AttributeError("No valid uri")
 
     async def iterate_objects(
-        self, bucket: str, prefix: str = "/", start:
-    ) -> AsyncGenerator[ObjectInfo
+        self, bucket: str, prefix: str = "/", start: str | None = None
+    ) -> AsyncGenerator[ObjectInfo]:
         paginator = self._s3aioclient.get_paginator("list_objects")
         async for result in paginator.paginate(
             Bucket=bucket, Prefix=prefix, PaginationConfig={"StartingToken": start}
@@ -531,9 +531,9 @@ async def bucket_exists(client: AioSession, bucket_name: str) -> bool:
 async def create_bucket(
     client: AioSession,
     bucket_name: str,
-    bucket_tags:
-    region_name:
-    kms_key_id:
+    bucket_tags: dict[str, str] | None = None,
+    region_name: str | None = None,
+    kms_key_id: str | None = None,
 ):
     bucket_creation_options = {}
     if region_name is not None:
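
The POLICY_DELETE lifecycle rules in this file now expire current versions, noncurrent versions, and incomplete multipart uploads after 7 days. For reference, a hedged sketch of applying a lifecycle configuration of that shape with boto3 (the bucket name is a placeholder, and this is not code from the package, which talks to S3 through aiobotocore):

import boto3

LIFECYCLE = {
    "Rules": [
        {
            "ID": "FullDelete",
            "Filter": {"Prefix": ""},
            "Status": "Enabled",
            "Expiration": {"Days": 7},
            "NoncurrentVersionExpiration": {"NoncurrentDays": 7},
            "AbortIncompleteMultipartUpload": {"DaysAfterInitiation": 7},
        },
    ]
}

s3 = boto3.client("s3")
s3.put_bucket_lifecycle_configuration(
    Bucket="example-bucket",  # placeholder bucket name
    LifecycleConfiguration=LIFECYCLE,
)
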
nucliadb_utils/storages/settings.py
CHANGED

@@ -18,23 +18,22 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 import os
-from typing import Dict, Optional
 
 from pydantic_settings import BaseSettings
 
 
 class Settings(BaseSettings):
-    gcs_deadletter_bucket:
-    gcs_indexing_bucket:
+    gcs_deadletter_bucket: str | None = None
+    gcs_indexing_bucket: str | None = None
 
     gcs_threads: int = 3
-    gcs_labels:
+    gcs_labels: dict[str, str] = {}
 
-    s3_deadletter_bucket:
-    s3_indexing_bucket:
+    s3_deadletter_bucket: str | None = None
+    s3_indexing_bucket: str | None = None
 
-    azure_deadletter_bucket:
-    azure_indexing_bucket:
+    azure_deadletter_bucket: str | None = None
+    azure_indexing_bucket: str | None = None
 
     local_testing_files: str = os.path.dirname(__file__)
 
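
These fields are pydantic BaseSettings attributes, so by default they are filled from environment variables matched case-insensitively against the field names, with complex types such as dict parsed from JSON. A hedged, standalone sketch using a subset of the fields above:

import os

from pydantic_settings import BaseSettings


class StorageSettings(BaseSettings):
    gcs_indexing_bucket: str | None = None
    gcs_threads: int = 3
    gcs_labels: dict[str, str] = {}


os.environ["GCS_INDEXING_BUCKET"] = "indexing"
os.environ["GCS_THREADS"] = "5"
os.environ["GCS_LABELS"] = '{"env": "dev"}'  # complex fields are parsed from JSON

settings = StorageSettings()
print(settings.gcs_indexing_bucket, settings.gcs_threads, settings.gcs_labels)
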
nucliadb_utils/storages/storage.py
CHANGED

@@ -24,15 +24,10 @@ import asyncio
 import base64
 import hashlib
 import uuid
+from collections.abc import AsyncGenerator, AsyncIterator
 from io import BytesIO
 from typing import (
     Any,
-    AsyncGenerator,
-    AsyncIterator,
-    List,
-    Optional,
-    Type,
-    Union,
     cast,
 )
 
@@ -65,14 +60,14 @@ class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     storage: Storage
     bucket: str
     key: str
-    field:
+    field: CloudFile | None = None
 
     def __init__(
         self,
         storage: Storage,
         bucket: str,
         fullkey: str,
-        field:
+        field: CloudFile | None = None,
     ):
         self.storage = storage
         self.bucket = bucket
@@ -83,7 +78,7 @@ class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     async def upload(self, iterator: AsyncIterator, origin: CloudFile) -> CloudFile: ...
 
     @abc.abstractmethod
-    async def iter_data(self, range:
+    async def iter_data(self, range: Range | None = None) -> AsyncGenerator[bytes]:
         raise NotImplementedError()
         yield b""
 
@@ -95,7 +90,7 @@ class StorageField(abc.ABC, metaclass=abc.ABCMeta):
         return deleted
 
     @abc.abstractmethod
-    async def exists(self) ->
+    async def exists(self) -> ObjectMetadata | None: ...
 
     @abc.abstractmethod
     async def copy(
@@ -130,10 +125,9 @@ class StorageField(abc.ABC, metaclass=abc.ABCMeta):
 
 class Storage(abc.ABC, metaclass=abc.ABCMeta):
     source: int
-    field_klass:
-    deadletter_bucket:
-    indexing_bucket:
-    cached_buckets: List[str] = []
+    field_klass: type
+    deadletter_bucket: str | None = None
+    indexing_bucket: str | None = None
     chunk_size = CHUNK_SIZE
 
     async def delete_resource(self, kbid: str, uuid: str):
@@ -166,7 +160,7 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         await self.upload_object(self.deadletter_bucket, key, message.SerializeToString())
 
     def get_indexing_storage_key(
-        self, *, kb: str, logical_shard: str, resource_uid: str, txid:
+        self, *, kb: str, logical_shard: str, resource_uid: str, txid: int | str
     ):
         return INDEXING_KEY.format(kb=kb, shard=logical_shard, resource=resource_uid, txid=txid)
 
@@ -174,7 +168,7 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         self,
         message: BrainResource,
         txid: int,
-        partition:
+        partition: str | None,
         kb: str,
         logical_shard: str,
     ) -> str:
@@ -197,7 +191,7 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         self,
         message: BrainResource,
         reindex_id: str,
-        partition:
+        partition: str | None,
         kb: str,
         logical_shard: str,
     ) -> str:
@@ -328,7 +322,7 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         kbid: str,
         uuid: str,
         field: str,
-        old_field:
+        old_field: CloudFile | None = None,
     ) -> StorageField:
         # Its a file field value
         bucket = self.get_bucket_name(kbid)
@@ -360,7 +354,7 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         payload: bytes,
         filename: str,
         content_type: str,
-        md5:
+        md5: str | None = None,
     ):
         decoded_payload = base64.b64decode(payload)
         cf = CloudFile()
@@ -370,7 +364,7 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         cf.source = self.source # type: ignore
 
         if md5 is None:
-            md5hash = hashlib.md5(decoded_payload).digest()
+            md5hash = hashlib.md5(decoded_payload, usedforsecurity=False).digest()
             cf.md5 = md5hash.decode()
         else:
             cf.md5 = md5
@@ -436,20 +430,19 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         self,
         bucket: str,
         key: str,
-        range:
-    ):
+        range: Range | None = None,
+    ) -> AsyncGenerator[bytes]:
         destination: StorageField = self.field_klass(storage=self, bucket=bucket, fullkey=key)
         try:
             async for data in destination.iter_data(range=range):
                 yield data
         except KeyError:
-
+            pass
 
     async def downloadbytes(self, bucket: str, key: str) -> BytesIO:
         result = BytesIO()
         async for data in self.download(bucket, key):
-
-            result.write(data)
+            result.write(data)
 
         result.seek(0)
         return result
@@ -461,29 +454,24 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         result.seek(0)
         return result
 
-    async def downloadbytescf_iterator(
-        self, cf: CloudFile
-    ) -> AsyncGenerator[bytes, None]: # pragma: no cover
+    async def downloadbytescf_iterator(self, cf: CloudFile) -> AsyncGenerator[bytes]: # pragma: no cover
         # this is covered by other tests
         if cf.source == self.source:
            async for data in self.download(cf.bucket_name, cf.uri):
-
-                yield data
+                yield data
         elif cf.source == CloudFile.FLAPS:
             flaps_storage = await get_nuclia_storage()
             async for data in flaps_storage.download(cf):
-
-                yield data
+                yield data
         elif cf.source == CloudFile.LOCAL:
             local_storage = get_local_storage()
             async for data in local_storage.download(cf.bucket_name, cf.uri):
-
-                yield data
+                yield data
 
     async def upload_pb(self, sf: StorageField, payload: Any):
         await self.upload_object(sf.bucket, sf.key, payload.SerializeToString())
 
-    async def download_pb(self, sf: StorageField, PBKlass:
+    async def download_pb(self, sf: StorageField, PBKlass: type):
         payload = await self.downloadbytes(sf.bucket, sf.key)
 
         if payload.getbuffer().nbytes == 0:
@@ -517,8 +505,8 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
 
     @abc.abstractmethod
     async def iterate_objects(
-        self, bucket: str, prefix: str, start:
-    ) -> AsyncGenerator[ObjectInfo
+        self, bucket: str, prefix: str, start: str | None = None
+    ) -> AsyncGenerator[ObjectInfo]:
         raise NotImplementedError()
         yield ObjectInfo(name="")
 
@@ -577,9 +565,7 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         ...
 
 
-async def iter_and_add_size(
-    stream: AsyncGenerator[bytes, None], cf: CloudFile
-) -> AsyncGenerator[bytes, None]:
+async def iter_and_add_size(stream: AsyncGenerator[bytes], cf: CloudFile) -> AsyncGenerator[bytes]:
     # This is needed because some storage types like GCS or S3 require
     # the size of the file at least at the request done for the last chunk.
     total_size = 0
@@ -590,9 +576,7 @@ async def iter_and_add_size(
         yield chunk
 
 
-async def iter_in_chunk_size(
-    iterator: AsyncGenerator[bytes, None], chunk_size: int
-) -> AsyncGenerator[bytes, None]:
+async def iter_in_chunk_size(iterator: AsyncGenerator[bytes], chunk_size: int) -> AsyncGenerator[bytes]:
     # This is needed to make sure bytes uploaded to the blob storage complies with a particular chunk size.
     buffer = b""
     async for chunk in iterator:
@@ -606,8 +590,8 @@ async def iter_in_chunk_size(
 
 
 async def iterate_storage_compatible(
-    iterator: AsyncGenerator[bytes
-) -> AsyncGenerator[bytes
+    iterator: AsyncGenerator[bytes], storage: Storage, cf: CloudFile
+) -> AsyncGenerator[bytes]:
     """
     Makes sure to add the size to the cloudfile and split the data in
     chunks that are compatible with the storage type of choice
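
Among the changes above, hashlib.md5 now passes usedforsecurity=False, which lets the checksum be computed on FIPS-restricted OpenSSL builds where MD5 is otherwise rejected, and the module-level chunking helpers keep their behavior under the new annotations. As an illustration of what a helper with the iter_in_chunk_size signature does, here is a minimal, hedged sketch; it is not necessarily the implementation shipped in the package:

import asyncio
from collections.abc import AsyncGenerator


async def iter_in_chunk_size(iterator: AsyncGenerator[bytes], chunk_size: int) -> AsyncGenerator[bytes]:
    # Re-chunk an async byte stream so every yielded piece (except possibly the last)
    # has exactly chunk_size bytes.
    buffer = b""
    async for chunk in iterator:
        buffer += chunk
        while len(buffer) >= chunk_size:
            yield buffer[:chunk_size]
            buffer = buffer[chunk_size:]
    if buffer:
        yield buffer


async def _demo() -> None:
    async def source() -> AsyncGenerator[bytes]:
        for piece in (b"abcde", b"fgh", b"ijklmnop"):
            yield piece

    async for chunk in iter_in_chunk_size(source(), 4):
        print(chunk)


asyncio.run(_demo())
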
nucliadb_utils/storages/utils.py
CHANGED
@@ -19,7 +19,6 @@
 #
 
 from dataclasses import dataclass
-from typing import Optional
 
 from pydantic import BaseModel
 
@@ -41,8 +40,8 @@ class Range:
     The start and end values are 0-based.
     """
 
-    start:
-    end:
+    start: int | None = None
+    end: int | None = None
 
     def any(self) -> bool:
         return self.start is not None or self.end is not None
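
For context, a hedged example of the Range dataclass defined above; the class body is copied from this diff and the usage lines are illustrative:

from dataclasses import dataclass


@dataclass
class Range:
    start: int | None = None
    end: int | None = None

    def any(self) -> bool:
        return self.start is not None or self.end is not None


whole = Range()            # no bounds set: request the full object
tail = Range(start=1024)   # only a start offset
head = Range(end=1023)     # only an end offset
print(whole.any(), tail.any(), head.any())  # False True True
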
nucliadb_utils/store.py
CHANGED
@@ -17,6 +17,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-from typing import Any
+from typing import Any
 
-MAIN:
+MAIN: dict[str, Any] = {}
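
MAIN is now annotated as dict[str, Any]. A module-level dict like this is typically used as a registry of shared singletons; a hedged sketch of that pattern (the helper names are illustrative, not taken from the package):

from typing import Any

MAIN: dict[str, Any] = {}


def register(name: str, utility: Any) -> None:
    # Store a shared object under a well-known name.
    MAIN[name] = utility


def lookup(name: str) -> Any | None:
    # Return the registered object, or None if nothing was registered.
    return MAIN.get(name)


register("storage", object())
print(lookup("storage") is not None)  # True
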