nucliadb-utils 4.0.3.post590__py3-none-any.whl → 4.0.3.post592__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
--- a/nucliadb_utils/settings.py
+++ b/nucliadb_utils/settings.py
@@ -52,6 +52,7 @@ class FileBackendConfig(Enum):
     GCS = "gcs"
     S3 = "s3"
     LOCAL = "local"
+    AZURE = "azure"
     NOT_SET = "notset"  # setting not provided
 
     @classmethod
@@ -113,8 +114,10 @@ class StorageSettings(BaseSettings):
         description="Number of days that uploaded files are kept in Nulia's processing engine",
     )
 
-    driver_pg_url: Optional[str] = None  # match same env var for k/v storage
-    driver_pg_connection_pool_max_size: int = 20  # match same env var for k/v storage
+    azure_connection_string: Optional[str] = Field(
+        default=None,
+        description="Azure Storage connection string: https://docs.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string",  # noqa
+    )
 
 
 storage_settings = StorageSettings()
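For reference, a minimal sketch of how the new backend option and connection-string setting are picked up from the environment (assuming the standard pydantic `BaseSettings` field-name-to-env-var mapping; the connection string below is a placeholder):

```python
import os

# Placeholder values, for illustration only.
os.environ["FILE_BACKEND"] = "azure"
os.environ["AZURE_CONNECTION_STRING"] = (
    "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>"
)

from nucliadb_utils.settings import StorageSettings

settings = StorageSettings()
print(settings.file_backend)             # FileBackendConfig.AZURE
print(settings.azure_connection_string)  # the connection string set above
```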
--- /dev/null
+++ b/nucliadb_utils/storages/azure.py
@@ -0,0 +1,415 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+from typing import AsyncGenerator, AsyncIterator, Optional, Union
+
+from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
+from azure.storage.blob import BlobProperties, BlobType, ContentSettings
+from azure.storage.blob.aio import BlobServiceClient
+
+from nucliadb_protos.resources_pb2 import CloudFile
+from nucliadb_utils.storages.exceptions import ObjectNotFoundError
+from nucliadb_utils.storages.object_store import ObjectStore
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
+
+logger = logging.getLogger(__name__)
+
+
+class AzureStorageField(StorageField):
+    storage: AzureStorage
+
+    async def move(
+        self,
+        origin_uri: str,
+        destination_uri: str,
+        origin_bucket_name: str,
+        destination_bucket_name: str,
+    ):
+        await self.storage.object_store.move(
+            origin_bucket_name, origin_uri, destination_bucket_name, destination_uri
+        )
+
+    async def copy(
+        self,
+        origin_uri: str,
+        destination_uri: str,
+        origin_bucket_name: str,
+        destination_bucket_name: str,
+    ):
+        await self.storage.object_store.copy(
+            origin_bucket_name, origin_uri, destination_bucket_name, destination_uri
+        )
+
+    async def iter_data(self, range: Optional[Range] = None) -> AsyncGenerator[bytes, None]:
+        if self.field is not None:
+            bucket = self.field.bucket_name
+            key = self.field.uri
+        else:
+            bucket = self.bucket
+            key = self.key
+        async for chunk in self.storage.object_store.download_stream(bucket, key, range):
+            yield chunk
+
+    async def start(self, cf: CloudFile) -> CloudFile:
+        """Init an upload.
+
+        cf: New file to upload
+        """
+        if self.field is not None and self.field.upload_uri != "":
+            # If there is a temporal url, delete it
+            await self.storage.delete_upload(self.field.upload_uri, self.field.bucket_name)
+        if self.field is not None and self.field.uri != "":
+            field: CloudFile = CloudFile(
+                filename=cf.filename,
+                size=cf.size,
+                content_type=cf.content_type,
+                bucket_name=self.bucket,
+                md5=cf.md5,
+                source=CloudFile.AZURE,
+                old_uri=self.field.uri,
+                old_bucket=self.field.bucket_name,
+            )
+            upload_uri = f"{self.key}-{datetime.now().isoformat()}"
+        else:
+            field = CloudFile(
+                filename=cf.filename,
+                size=cf.size,
+                md5=cf.md5,
+                content_type=cf.content_type,
+                bucket_name=self.bucket,
+                source=CloudFile.AZURE,
+            )
+            upload_uri = self.key
+        await self.storage.object_store.upload_multipart_start(
+            self.bucket,
+            upload_uri,
+            ObjectMetadata(
+                filename=cf.filename,
+                size=cf.size,
+                content_type=cf.content_type,
+            ),
+        )
+        field.offset = 0
+        field.upload_uri = upload_uri
+        return field
+
+    async def append(self, cf: CloudFile, iterable: AsyncIterator) -> int:
+        if self.field is None:
+            raise AttributeError()
+        return await self.storage.object_store.upload_multipart_append(
+            self.field.bucket_name, self.field.upload_uri, iterable
+        )
+
+    async def finish(self):
+        self.field.uri = self.key
+        self.field.ClearField("resumable_uri")
+        self.field.ClearField("offset")
+        self.field.ClearField("upload_uri")
+        self.field.ClearField("parts")
+
+    async def exists(self) -> Optional[ObjectMetadata]:
+        key = None
+        bucket = None
+        if self.field is not None and self.field.uri != "":
+            key = self.field.uri
+            bucket = self.field.bucket_name
+        elif self.key != "":
+            key = self.key
+            bucket = self.bucket
+        else:
+            return None
+        return await self.storage.object_store.get_metadata(bucket, key)
+
+    async def upload(self, iterator: AsyncIterator, origin: CloudFile) -> CloudFile:
+        self.field = await self.start(origin)
+        if self.field is None:
+            raise AttributeError()
+        await self.append(origin, iterator)
+        await self.finish()
+        return self.field
+
+    def __repr__(self):
+        return f"{self.storage.source}: {self.bucket}/{self.key}"
+
+
+class AzureStorage(Storage):
+    field_klass = AzureStorageField
+    object_store: ObjectStore
+    source = CloudFile.AZURE
+
+    def __init__(
+        self,
+        connection_string: str,
+        deadletter_bucket: str = "deadletter",
+        indexing_bucket: str = "indexing",
+    ):
+        self.object_store = AzureObjectStore(connection_string)
+        self.deadletter_bucket = deadletter_bucket
+        self.indexing_bucket = indexing_bucket
+
+    async def initialize(self, service_name: Optional[str] = None):
+        await self.object_store.initialize()
+        for bucket in [
+            self.deadletter_bucket,
+            self.indexing_bucket,
+        ]:
+            if bucket is None or bucket == "":
+                continue
+            try:
+                await self.object_store.bucket_create(bucket)
+            except Exception:
+                logger.exception(f"Could not create bucket {bucket}", exc_info=True)
+
+    async def finalize(self):
+        await self.object_store.finalize()
+
+    async def delete_upload(self, uri: str, bucket_name: str):
+        await self.object_store.delete(bucket_name, uri)
+
+    async def create_bucket(self, bucket_name: str, kbid: Optional[str] = None):
+        if await self.object_store.bucket_exists(bucket_name):
+            return
+        await self.object_store.bucket_create(bucket_name)
+
+    def get_bucket_name(self, kbid: str):
+        return f"nucliadb-{kbid}"
+
+    async def create_kb(self, kbid: str) -> bool:
+        bucket_name = self.get_bucket_name(kbid)
+        return await self.object_store.bucket_create(bucket_name)
+
+    async def schedule_delete_kb(self, kbid: str) -> bool:
+        bucket_name = self.get_bucket_name(kbid)
+        deleted, _ = await self.object_store.bucket_delete(bucket_name)
+        return deleted
+
+    async def delete_kb(self, kbid: str) -> tuple[bool, bool]:
+        bucket_name = self.get_bucket_name(kbid)
+        return await self.object_store.bucket_delete(bucket_name)
+
+    async def iterate_objects(self, bucket: str, prefix: str) -> AsyncGenerator[ObjectInfo, None]:
+        async for obj in self.object_store.iterate(bucket, prefix):
+            yield obj
+
+
+class AzureObjectStore(ObjectStore):
+    def __init__(self, connection_string: str):
+        self.connection_string = connection_string
+        self._service_client: Optional[BlobServiceClient] = None
+
+    @property
+    def service_client(self) -> BlobServiceClient:
+        if self._service_client is None:
+            raise AttributeError("Service client not initialized")
+        return self._service_client
+
+    async def initialize(self):
+        self._service_client = BlobServiceClient.from_connection_string(self.connection_string)
+
+    async def finalize(self):
+        try:
+            if self._service_client is not None:
+                await self._service_client.close()
+        except Exception:
+            logger.warning("Error closing Azure client", exc_info=True)
+        self._service_client = None
+
+    async def bucket_create(self, bucket: str, labels: dict[str, str] | None = None) -> bool:
+        container_client = self.service_client.get_container_client(bucket)
+        try:
+            await container_client.create_container()
+            return True
+        except ResourceExistsError:
+            return False
+
+    async def bucket_delete(self, bucket: str) -> tuple[bool, bool]:
+        container_client = self.service_client.get_container_client(bucket)
+        # There's never a conflict on Azure
+        conflict = False
+        deleted = False
+        try:
+            await container_client.delete_container()
+            deleted = True
+        except ResourceNotFoundError:
+            deleted = False
+        return deleted, conflict
+
+    async def bucket_exists(self, bucket: str) -> bool:
+        container_client = self.service_client.get_container_client(bucket)
+        try:
+            await container_client.get_container_properties()
+            return True
+        except ResourceNotFoundError:
+            return False
+
+    async def bucket_schedule_delete(self, bucket: str) -> None:
+        # In Azure, there is no option to schedule for deletion
+        await self.bucket_delete(bucket)
+
+    async def move(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None:
+        await self.copy(origin_bucket, origin_key, destination_bucket, destination_key)
+        await self.delete(origin_bucket, origin_key)
+
+    async def copy(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None:
+        origin_blob_client = self.service_client.get_blob_client(origin_bucket, origin_key)
+        origin_url = origin_blob_client.url
+        destination_blob_client = self.service_client.get_blob_client(
+            destination_bucket, destination_key
+        )
+        result = await destination_blob_client.start_copy_from_url(origin_url, requires_sync=True)
+        assert result["copy_status"] == "success"
+
+    async def delete(self, bucket: str, key: str) -> None:
+        container_client = self.service_client.get_container_client(bucket)
+        try:
+            await container_client.delete_blob(key, delete_snapshots="include")
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+
+    async def upload(
+        self,
+        bucket: str,
+        key: str,
+        data: Union[bytes, AsyncGenerator[bytes, None]],
+        metadata: ObjectMetadata,
+    ) -> None:
+        container_client = self.service_client.get_container_client(bucket)
+        length: Optional[int] = None
+        if isinstance(data, bytes):
+            length = len(data)
+            metadata.size = length
+        else:
+            length = metadata.size or None
+        custom_metadata = {key: str(value) for key, value in metadata.model_dump().items()}
+        await container_client.upload_blob(
+            name=key,
+            data=data,
+            length=length,
+            blob_type=BlobType.BLOCKBLOB,
+            metadata=custom_metadata,
+            content_settings=ContentSettings(
+                content_type=metadata.content_type,
+                content_disposition=f"attachment; filename={metadata.filename}",
+            ),
+        )
+
+    async def download(self, bucket: str, key: str) -> bytes:
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        try:
+            downloader = await blob_client.download_blob()
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+        return await downloader.readall()
+
+    async def download_stream(
+        self, bucket: str, key: str, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
+        range = range or Range()
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        offset = None
+        length = None
+        if range.any():
+            offset = range.start or 0
+            length = range.end - offset + 1 if range.end else None
+        try:
+            downloader = await blob_client.download_blob(
+                offset=offset,  # type: ignore
+                length=length,  # type: ignore
+            )
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+        async for chunk in downloader.chunks():
+            yield chunk
+
+    async def iterate(self, bucket: str, prefix: str) -> AsyncGenerator[ObjectInfo, None]:
+        container_client = self.service_client.get_container_client(bucket)
+        async for blob in container_client.list_blobs(name_starts_with=prefix):
+            yield ObjectInfo(name=blob.name)
+
+    async def get_metadata(self, bucket: str, key: str) -> ObjectMetadata:
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        try:
+            properties: BlobProperties = await blob_client.get_blob_properties()
+            return parse_object_metadata(properties, key)
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+
+    async def upload_multipart_start(self, bucket: str, key: str, metadata: ObjectMetadata) -> None:
+        container_client = self.service_client.get_container_client(bucket)
+        custom_metadata = {key: str(value) for key, value in metadata.model_dump().items()}
+        blob_client = container_client.get_blob_client(key)
+        await blob_client.create_append_blob(
+            metadata=custom_metadata,
+            content_settings=ContentSettings(
+                content_type=metadata.content_type,
+                content_disposition=f"attachment; filename={metadata.filename}",
+            ),
+        )
+
+    async def upload_multipart_append(
+        self, bucket: str, key: str, iterable: AsyncIterator[bytes]
+    ) -> int:
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        bytes_appended = 0
+        async for chunk in iterable:
+            bytes_appended += len(chunk)
+            await blob_client.append_block(data=chunk)
+        return bytes_appended
+
+    async def upload_multipart_finish(self, bucket: str, key: str) -> None:
+        # No need to do anything in Azure
+        pass
+
+
+def parse_object_metadata(properties: BlobProperties, key: str) -> ObjectMetadata:
+    custom_metadata = properties.metadata or {}
+    custom_metadata_size = custom_metadata.get("size")
+    if custom_metadata_size and custom_metadata_size != "0":
+        size = int(custom_metadata_size)
+    else:
+        size = properties.size
+    filename = custom_metadata.get("filename") or key.split("/")[-1]
+    content_type = custom_metadata.get("content_type") or properties.content_settings.content_type or ""
+    return ObjectMetadata(
+        filename=filename,
+        size=size,
+        content_type=content_type,
+    )
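The new module is a thin wrapper over the async client of `azure-storage-blob`. A usage sketch against the `AzureObjectStore` API added above (the connection string is a placeholder):

```python
import asyncio

from nucliadb_utils.storages.azure import AzureObjectStore
from nucliadb_utils.storages.utils import ObjectMetadata, Range


async def main() -> None:
    store = AzureObjectStore("<your-azure-connection-string>")  # placeholder
    await store.initialize()
    try:
        await store.bucket_create("my-bucket")
        await store.upload(
            "my-bucket",
            "docs/hello.txt",
            b"hello azure",
            ObjectMetadata(filename="hello.txt", content_type="text/plain", size=11),
        )
        # Ranged download: Range is inclusive, so this yields the first 5 bytes.
        async for chunk in store.download_stream("my-bucket", "docs/hello.txt", Range(start=0, end=4)):
            print(chunk)
    finally:
        await store.finalize()


asyncio.run(main())
```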
--- a/nucliadb_utils/storages/exceptions.py
+++ b/nucliadb_utils/storages/exceptions.py
@@ -75,3 +75,9 @@ class UnparsableResponse(Exception):
     Raised when trying to parse a response from a storage API and it's not
     possible
     """
+
+
+class ObjectNotFoundError(Exception):
+    """
+    Raised when the object is not found in storage
+    """
--- a/nucliadb_utils/storages/gcs.py
+++ b/nucliadb_utils/storages/gcs.py
@@ -47,13 +47,8 @@ from nucliadb_utils.storages.exceptions import (
     InvalidOffset,
     ResumableUploadGone,
 )
-from nucliadb_utils.storages.storage import (
-    ObjectInfo,
-    ObjectMetadata,
-    Range,
-    Storage,
-    StorageField,
-)
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 
 storage_ops_observer = metrics.Observer("gcs_ops", labels={"type": ""})
 
@@ -570,8 +565,8 @@ class GCSStorage(Storage):
     async def create_bucket(self, bucket_name: str, kbid: Optional[str] = None):
         if self.session is None:
            raise AttributeError()
-        exists = await self.check_exists(bucket_name=bucket_name)
-        if exists:
+
+        if await self.check_exists(bucket_name=bucket_name):
             return
 
         headers = await self.get_access_headers()
--- a/nucliadb_utils/storages/local.py
+++ b/nucliadb_utils/storages/local.py
@@ -30,13 +30,8 @@ import aiofiles
 
 from nucliadb_protos.resources_pb2 import CloudFile
 from nucliadb_utils.storages import CHUNK_SIZE
-from nucliadb_utils.storages.storage import (
-    ObjectInfo,
-    ObjectMetadata,
-    Range,
-    Storage,
-    StorageField,
-)
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 
 
 class LocalStorageField(StorageField):
@@ -79,6 +74,7 @@ class LocalStorageField(StorageField):
         shutil.copy(origin_path, destination_path)
 
     async def iter_data(self, range: Optional[Range] = None) -> AsyncGenerator[bytes, None]:
+        range = range or Range()
         key = self.field.uri if self.field else self.key
         if self.field is None:
             bucket = self.bucket
@@ -87,13 +83,13 @@ class LocalStorageField(StorageField):
 
         path = self.storage.get_file_path(bucket, key)
         async with aiofiles.open(path, mode="rb") as resp:
-            if range and range.start is not None:
+            if range.start is not None:
                 # Seek to the start of the range
                 await resp.seek(range.start)
 
             bytes_read = 0
             bytes_to_read = None  # If None, read until EOF
-            if range and range.end is not None:
+            if range.end is not None:
                 # Range is inclusive
                 bytes_to_read = range.end - (range.start or 0) + 1
 
--- /dev/null
+++ b/nucliadb_utils/storages/object_store.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import abc
+from typing import AsyncGenerator, AsyncIterator, Optional, Union
+
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
+
+
+class ObjectStore(abc.ABC, metaclass=abc.ABCMeta):
+    """
+    Generic interface for object storage services.
+    This must NOT include any NucliaDB/Nuclia specific logic.
+    """
+
+    @abc.abstractmethod
+    async def initialize(self) -> None: ...
+
+    @abc.abstractmethod
+    async def finalize(self) -> None: ...
+
+    @abc.abstractmethod
+    async def bucket_create(self, bucket: str, labels: Optional[dict[str, str]] = None) -> bool:
+        """
+        Create a new bucket in the object storage. Labels the bucket with the given labels if provided.
+        Returns True if the bucket was created, False if it already existed.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def bucket_exists(self, bucket: str) -> bool:
+        """
+        Return True if the bucket exists, False otherwise.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def bucket_delete(self, bucket: str) -> tuple[bool, bool]:
+        """
+        Delete a bucket in the object storage. Returns a tuple with two boolean values:
+        - The first one indicates if the bucket was deleted.
+        - The second one indicates if there was a conflict.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def bucket_schedule_delete(self, bucket: str) -> None:
+        """
+        Mark a bucket for deletion. The bucket will be deleted asynchronously.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def move(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None: ...
+
+    @abc.abstractmethod
+    async def copy(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None: ...
+
+    @abc.abstractmethod
+    async def delete(self, bucket: str, key: str) -> None: ...
+
+    @abc.abstractmethod
+    async def upload(
+        self,
+        bucket: str,
+        key: str,
+        data: Union[bytes, AsyncGenerator[bytes, None]],
+        metadata: ObjectMetadata,
+    ) -> None: ...
+
+    @abc.abstractmethod
+    async def download(self, bucket: str, key: str) -> bytes: ...
+
+    @abc.abstractmethod
+    async def download_stream(
+        self, bucket: str, key: str, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
+        raise NotImplementedError()
+        yield b""
+
+    @abc.abstractmethod
+    async def iterate(self, bucket: str, prefix: str) -> AsyncGenerator[ObjectInfo, None]:
+        raise NotImplementedError()
+        yield ObjectInfo(name="")
+
+    @abc.abstractmethod
+    async def get_metadata(self, bucket: str, key: str) -> ObjectMetadata: ...
+
+    @abc.abstractmethod
+    async def upload_multipart_start(
+        self, bucket: str, key: str, metadata: ObjectMetadata
+    ) -> Optional[str]:
+        """
+        Start a multipart upload. May return the url for the resumable upload.
+        """
+
+    @abc.abstractmethod
+    async def upload_multipart_append(
+        self, bucket: str, key: str, iterable: AsyncIterator[bytes]
+    ) -> int:
+        """
+        Append data to a multipart upload. Returns the number of bytes uploaded.
+        """
+
+    @abc.abstractmethod
+    async def upload_multipart_finish(self, bucket: str, key: str) -> None: ...
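A sketch of backend-agnostic code written purely against this new interface; it would work identically over the Azure implementation above or any other `ObjectStore` (it assumes the destination bucket already exists):

```python
from nucliadb_utils.storages.object_store import ObjectStore


async def copy_prefix(src: ObjectStore, dst: ObjectStore, bucket: str, prefix: str) -> int:
    """Copy every object under `prefix` from one store to another. Illustrative helper."""
    copied = 0
    async for obj in src.iterate(bucket, prefix):
        data = await src.download(bucket, obj.name)
        metadata = await src.get_metadata(bucket, obj.name)
        await dst.upload(bucket, obj.name, data, metadata)
        copied += 1
    return copied
```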
--- a/nucliadb_utils/storages/s3.py
+++ b/nucliadb_utils/storages/s3.py
@@ -34,13 +34,8 @@ from nucliadb_protos.resources_pb2 import CloudFile
 from nucliadb_telemetry import errors
 from nucliadb_utils import logger
 from nucliadb_utils.storages.exceptions import UnparsableResponse
-from nucliadb_utils.storages.storage import (
-    ObjectInfo,
-    ObjectMetadata,
-    Range,
-    Storage,
-    StorageField,
-)
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 
 MB = 1024 * 1024
 MIN_UPLOAD_SIZE = 5 * MB
--- a/nucliadb_utils/storages/storage.py
+++ b/nucliadb_utils/storages/storage.py
@@ -22,7 +22,6 @@ from __future__ import annotations
 import abc
 import hashlib
 import uuid
-from dataclasses import dataclass
 from io import BytesIO
 from typing import (
     Any,
@@ -36,8 +35,6 @@ from typing import (
     cast,
 )
 
-from pydantic import BaseModel
-
 from nucliadb_protos.noderesources_pb2 import Resource as BrainResource
 from nucliadb_protos.nodewriter_pb2 import IndexMessage
 from nucliadb_protos.resources_pb2 import CloudFile
@@ -46,6 +43,7 @@ from nucliadb_utils import logger
 from nucliadb_utils.helpers import async_gen_lookahead
 from nucliadb_utils.storages import CHUNK_SIZE
 from nucliadb_utils.storages.exceptions import IndexDataNotFound, InvalidCloudFile
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 from nucliadb_utils.utilities import get_local_storage, get_nuclia_storage
 
 STORAGE_RESOURCE = "kbs/{kbid}/r/{uuid}"
@@ -61,33 +59,6 @@ INDEXING_KEY = "index/{kb}/{shard}/{resource}/{txid}"
 MESSAGE_KEY = "message/{kbid}/{rid}/{mid}"
 
 
-class ObjectInfo(BaseModel):
-    name: str
-
-
-class ObjectMetadata(BaseModel):
-    filename: str
-    content_type: str
-    size: int
-
-
-@dataclass
-class Range:
-    """
-    Represents a range of bytes to be downloaded from a file. The range is inclusive.
-    The start and end values are 0-based.
-    """
-
-    start: Optional[int] = None
-    end: Optional[int] = None
-
-    def any(self) -> bool:
-        return self.start is not None or self.end is not None
-
-    def to_header(self) -> str:
-        return f"bytes={self.start or 0}-{self.end or ''}"
-
-
 class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     storage: Storage
     bucket: str
@@ -146,7 +117,10 @@ class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     async def start(self, cf: CloudFile) -> CloudFile: ...
 
     @abc.abstractmethod
-    async def append(self, cf: CloudFile, iterable: AsyncIterator) -> int: ...
+    async def append(self, cf: CloudFile, iterable: AsyncIterator) -> int:
+        """
+        Returns the number of bytes appended.
+        """
 
     @abc.abstractmethod
     async def finish(self): ...
--- /dev/null
+++ b/nucliadb_utils/storages/utils.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from dataclasses import dataclass
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ObjectInfo(BaseModel):
+    name: str
+
+
+class ObjectMetadata(BaseModel):
+    filename: str
+    content_type: str
+    size: int
+
+
+@dataclass
+class Range:
+    """
+    Represents a range of bytes to be downloaded from a file. The range is inclusive.
+    The start and end values are 0-based.
+    """
+
+    start: Optional[int] = None
+    end: Optional[int] = None
+
+    def any(self) -> bool:
+        return self.start is not None or self.end is not None
+
+    def to_header(self) -> str:
+        return f"bytes={self.start or 0}-{self.end or ''}"
--- /dev/null
+++ b/nucliadb_utils/tests/azure.py
@@ -0,0 +1,119 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from dataclasses import dataclass
+from typing import Generator
+
+import pytest
+from pytest_docker_fixtures import images  # type: ignore
+from pytest_docker_fixtures.containers._base import BaseImage  # type: ignore
+
+from nucliadb_utils.storages.azure import AzureStorage
+from nucliadb_utils.store import MAIN
+from nucliadb_utils.utilities import Utility
+
+images.settings["azurite"] = {
+    "image": "mcr.microsoft.com/azure-storage/azurite",
+    "version": "3.30.0",
+    "options": {
+        "ports": {"10000": None},
+        "command": " ".join(
+            [
+                # To start the blob service only -- by default is on port 10000
+                "azurite-blob",
+                # So we can access it from outside the container
+                "--blobHost 0.0.0.0",
+            ]
+        ),
+    },
+    "env": {},
+}
+
+
+class Azurite(BaseImage):
+    name = "azurite"
+    port = 10000
+
+    def check(self):
+        try:
+            from azure.storage.blob import BlobServiceClient  # type: ignore
+
+            container_port = self.port
+            host_port = self.get_port(port=container_port)
+            conn_string = get_connection_string(self.host, host_port)
+
+            client = BlobServiceClient.from_connection_string(conn_string)
+            container_client = client.get_container_client("foo")
+            container_client.create_container()
+            container_client.delete_container()
+            return True
+        except Exception as ex:
+            print(ex)
+            return False
+
+
+@dataclass
+class AzuriteFixture:
+    host: str
+    port: int
+    container: BaseImage
+    connection_string: str
+
+
+def get_connection_string(host, port) -> str:
+    """
+    We're using the default Azurite credentials for testing purposes.
+    """
+    parts = [
+        "DefaultEndpointsProtocol=http",
+        "AccountName=devstoreaccount1",
+        "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
+        f"BlobEndpoint=http://{host}:{port}/devstoreaccount1",
+    ]
+    return ";".join(parts)
+
+
+@pytest.fixture(scope="session")
+def azurite() -> Generator[AzuriteFixture, None, None]:
+    container = Azurite()
+    host, port = container.run()
+    try:
+        yield AzuriteFixture(
+            host=host,
+            port=port,
+            container=container.container_obj,
+            connection_string=get_connection_string(host, port),
+        )
+    finally:
+        container.stop()
+
+
+@pytest.fixture(scope="function")
+async def azure_storage(azurite):
+    storage = AzureStorage(
+        connection_string=azurite.connection_string,
+    )
+    MAIN[Utility.STORAGE] = storage
+    await storage.initialize()
+    try:
+        yield storage
+    finally:
+        await storage.finalize()
+        if Utility.STORAGE in MAIN:
+            del MAIN[Utility.STORAGE]
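A sketch of a test consuming these fixtures (it assumes the project's usual async test setup, e.g. pytest-asyncio, and that this module is loaded as a pytest plugin; the test name and kbid are illustrative):

```python
import pytest


@pytest.mark.asyncio
async def test_azure_kb_lifecycle(azure_storage):
    # azure_storage is the AzureStorage instance wired to the Azurite container.
    assert await azure_storage.create_kb("testkb")
    assert await azure_storage.schedule_delete_kb("testkb")
```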
--- a/nucliadb_utils/transaction.py
+++ b/nucliadb_utils/transaction.py
@@ -24,6 +24,7 @@ from functools import partial
 from typing import Any, Optional, Union
 
 import nats
+import nats.errors
 from nats.aio.client import Client
 from nats.js.client import JetStreamContext
 
@@ -53,6 +54,10 @@ class TransactionCommitTimeoutError(Exception):
     pass
 
 
+class MaxTransactionSizeExceededError(Exception):
+    pass
+
+
 class LocalTransactionUtility:
     async def commit(
         self,
@@ -195,7 +200,10 @@ class TransactionUtility:
         if target_subject is None:
             target_subject = const.Streams.INGEST.subject.format(partition=partition)
 
-        res = await self.js.publish(target_subject, writer.SerializeToString(), headers=headers)
+        try:
+            res = await self.js.publish(target_subject, writer.SerializeToString(), headers=headers)
+        except nats.errors.MaxPayloadError as ex:
+            raise MaxTransactionSizeExceededError() from ex
 
         waiting_for.seq = res.seq
 
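A sketch of how a caller might react to the new error; the exact `commit()` signature is not part of this diff, so the arguments below are illustrative:

```python
from nucliadb_utils.transaction import MaxTransactionSizeExceededError


async def commit_with_limit_check(transaction, writer, partition):
    try:
        # transaction is a TransactionUtility; arguments are illustrative.
        return await transaction.commit(writer, partition)
    except MaxTransactionSizeExceededError:
        # The serialized message exceeded the NATS max payload; e.g. map this
        # to an HTTP 413 at the API layer instead of a generic 500.
        raise
```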
--- a/nucliadb_utils/utilities.py
+++ b/nucliadb_utils/utilities.py
@@ -98,7 +98,21 @@ async def get_storage(
     if Utility.STORAGE in MAIN:
         return MAIN[Utility.STORAGE]
 
-    if storage_settings.file_backend == FileBackendConfig.S3:
+    if storage_settings.file_backend == FileBackendConfig.AZURE:
+        from nucliadb_utils.storages.azure import AzureStorage
+
+        if storage_settings.azure_connection_string is None:
+            raise ConfigurationError("AZURE_CONNECTION_STRING env var not configured")
+
+        azureutil = AzureStorage(
+            connection_string=storage_settings.azure_connection_string,
+        )
+
+        logger.info("Configuring Azure Storage")
+        await azureutil.initialize()
+        set_utility(Utility.STORAGE, azureutil)
+
+    elif storage_settings.file_backend == FileBackendConfig.S3:
         from nucliadb_utils.storages.s3 import S3Storage
 
         s3util = S3Storage(
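With `FILE_BACKEND=azure` and `AZURE_CONNECTION_STRING` set, `get_storage()` now builds, initializes and caches an `AzureStorage` utility; a sketch, leaving any optional parameters of `get_storage()` at their defaults:

```python
from nucliadb_utils.utilities import get_storage


async def example() -> None:
    storage = await get_storage()      # returns the cached AzureStorage utility
    await storage.create_kb("mykb")    # objects land in the "nucliadb-mykb" container
```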
--- a/nucliadb_utils-4.0.3.post590.dist-info/METADATA
+++ b/nucliadb_utils-4.0.3.post592.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nucliadb_utils
-Version: 4.0.3.post590
+Version: 4.0.3.post592
 Home-page: https://nuclia.com
 License: BSD
 Classifier: Development Status :: 4 - Beta
@@ -23,8 +23,8 @@ Requires-Dist: PyNaCl
 Requires-Dist: pyjwt >=2.4.0
 Requires-Dist: memorylru >=1.1.2
 Requires-Dist: mrflagly
-Requires-Dist: nucliadb-protos >=4.0.3.post590
-Requires-Dist: nucliadb-telemetry >=4.0.3.post590
+Requires-Dist: nucliadb-protos >=4.0.3.post592
+Requires-Dist: nucliadb-telemetry >=4.0.3.post592
 Provides-Extra: cache
 Requires-Dist: redis >=4.3.4 ; extra == 'cache'
 Requires-Dist: orjson >=3.6.7 ; extra == 'cache'
@@ -43,6 +43,7 @@ Requires-Dist: types-aiofiles >=0.8.3 ; extra == 'storages'
 Requires-Dist: aiofiles >=0.8.0 ; extra == 'storages'
 Requires-Dist: backoff >=1.11.1 ; extra == 'storages'
 Requires-Dist: google-auth >=2.4.1 ; extra == 'storages'
+Requires-Dist: azure-storage-blob >=12.20.0 ; extra == 'storages'
 
 # nucliadb util python library
 
--- a/nucliadb_utils-4.0.3.post590.dist-info/RECORD
+++ b/nucliadb_utils-4.0.3.post592.dist-info/RECORD
@@ -12,11 +12,11 @@ nucliadb_utils/nats.py,sha256=zTAXECDXeCPtydk3F_6EMFDZ059kK0UYUU_tnWoxgXs,8208
 nucliadb_utils/partition.py,sha256=jBgy4Hu5Iwn4gjbPPcthSykwf-qNx-GcLAIwbzPd1d0,1157
 nucliadb_utils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nucliadb_utils/run.py,sha256=HpAIM8xbR7UpVC2_7xOjB4fYbUVykyPP6yHrv2RD3DI,1707
-nucliadb_utils/settings.py,sha256=fI3AOn30tNvYS_PqoKilVhJN4OppPAGCM6-OUUitO2s,7192
+nucliadb_utils/settings.py,sha256=VXZuq-4-RvsUsMIjL-wRRjzCqM-b_AnTHsW_hGrax_o,7281
 nucliadb_utils/signals.py,sha256=JRNv2y9zLtBjOANBf7krGfDGfOc9qcoXZ6N1nKWS2FE,2674
 nucliadb_utils/store.py,sha256=kQ35HemE0v4_Qg6xVqNIJi8vSFAYQtwI3rDtMsNy62Y,890
-nucliadb_utils/transaction.py,sha256=ym9hmPAoIt8xgxjd8JHG14_PelYTqhUOVfUAq_ghJDM,7100
-nucliadb_utils/utilities.py,sha256=E7W9TzvbyJ7_Yenho9CT059E_g4JQOCS02HrGurwNqs,13603
+nucliadb_utils/transaction.py,sha256=mwcI3aIHAvU5KOGqd_Uz_d1XQzXhk_-NWY8NqU1lfb0,7307
+nucliadb_utils/utilities.py,sha256=Vb7lXUSDtqS_7tNqI4CDQD2woHO_JFUGSjC2Yj4-uEA,14135
 nucliadb_utils/audit/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb_utils/audit/audit.py,sha256=dn5ZnCVQUlCcvdjzaORghbrjk9QgVGrtkfIftq30Bp8,2819
 nucliadb_utils/audit/basic.py,sha256=NViey6mKbCXqRTLDBX2xNTcCg9I-2e4oB2xkekuhDvM,3392
@@ -39,23 +39,27 @@ nucliadb_utils/nuclia_usage/protos/kb_usage_pb2_grpc.pyi,sha256=6RIsZ2934iodEckf
 nucliadb_utils/nuclia_usage/utils/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb_utils/nuclia_usage/utils/kb_usage_report.py,sha256=E1eUSFXBVNzQP9Q2rWj9y3koCO5S7iKwckny_AoLKuk,3870
 nucliadb_utils/storages/__init__.py,sha256=5Qc8AUWiJv9_JbGCBpAn88AIJhwDlm0OPQpg2ZdRL4U,872
-nucliadb_utils/storages/exceptions.py,sha256=BfJcn0-60Ts2gLHRTxQKD0QuR7L4WDJtIdsUp7zhQ0k,2395
-nucliadb_utils/storages/gcs.py,sha256=KQ9puMOE89CPIA8q8DeCs7qOp0YoB5ZctXPws1h7lbA,27006
-nucliadb_utils/storages/local.py,sha256=GAEzvbmLzEeEJhhIWKa-vX2i9B0qdq6mbHMolpa2Q20,10259
+nucliadb_utils/storages/azure.py,sha256=7cWbe144WIwt10IM6XPRThkjeg5M_U0WInj4Jk75THw,15591
+nucliadb_utils/storages/exceptions.py,sha256=mm_wX4YRtp7u7enkk_4pMSlX5AQQuFbq4xLmupVDt3Y,2502
+nucliadb_utils/storages/gcs.py,sha256=WblkxWoa1brevsJV3ebiE6s7Wb_eXFScw41202f5uP4,26999
+nucliadb_utils/storages/local.py,sha256=NxC_nMBd38NDsR266DSgoBLdQlvUwf0_sd50r-BLI0E,10288
 nucliadb_utils/storages/nuclia.py,sha256=vEv94xAT7QM2g80S25QyrOw2pzvP2BAX-ADgZLtuCVc,2097
-nucliadb_utils/storages/s3.py,sha256=ABzS9X3fj7SUq-3cLvnKEClngb8hcPyKNSfxubMpyCo,19256
+nucliadb_utils/storages/object_store.py,sha256=Tw10GmpYfM5TMqJ3Tk9pLQ9wLMBk1-snL_m6uasiZDQ,4257
+nucliadb_utils/storages/s3.py,sha256=8KV-V7EiqRYhXYlGN0UjzM-v1Pj2Zh7NtXDikG96knU,19272
 nucliadb_utils/storages/settings.py,sha256=ugCPy1zxBOmA2KosT-4tsjpvP002kg5iQyi42yCGCJA,1285
-nucliadb_utils/storages/storage.py,sha256=SWeQv6136ruj7TvCPQR6WkG458IDEz2fzQQjkDRRReQ,20533
+nucliadb_utils/storages/storage.py,sha256=Ask2f1xuQHxavF3uKXXrmjOeY7w3ZljpZlcvmIh2EVU,20060
+nucliadb_utils/storages/utils.py,sha256=8g2rIwJeYIumQLOB47Yw1rx3twlhRB_cJxer65QfZmk,1479
 nucliadb_utils/tests/__init__.py,sha256=Oo9CAE7B0eW5VHn8sHd6o30SQzOWUhktLPRXdlDOleA,1456
 nucliadb_utils/tests/asyncbenchmark.py,sha256=x4be2IwCawle9zWgMOJkmwoUwk5p1tv7cLQGmybkEOg,10587
+nucliadb_utils/tests/azure.py,sha256=ji-BV54m_MHAB9KdSToYgsZ8OZZ-C5Yq2VqWOYZNMs4,3668
 nucliadb_utils/tests/fixtures.py,sha256=j58fTvoWZClC52LX7QOvLXX9DS5QbytSnRp0F4nGzN8,1671
 nucliadb_utils/tests/gcs.py,sha256=Ii8BCHUAAxFIzX67pKTRFRgbqv3FJ6DrPAdAx2Xod1Y,3036
 nucliadb_utils/tests/indexing.py,sha256=YW2QhkhO9Q_8A4kKWJaWSvXvyQ_AiAwY1VylcfVQFxk,1513
 nucliadb_utils/tests/local.py,sha256=c3gZJJWmvOftruJkIQIwB3q_hh3uxEhqGIAVWim1Bbk,1343
 nucliadb_utils/tests/nats.py,sha256=Tosonm9A9cusImyji80G4pgdXEHNVPaCLT5TbFK_ra0,7543
 nucliadb_utils/tests/s3.py,sha256=YB8QqDaBXxyhHonEHmeBbRRDmvB7sTOaKBSi8KBGokg,2330
-nucliadb_utils-4.0.3.post590.dist-info/METADATA,sha256=G1YsUouvYuDg6mQyvsor1uvc9Wed9dCuvZxeRfBmBxw,2030
-nucliadb_utils-4.0.3.post590.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-nucliadb_utils-4.0.3.post590.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
-nucliadb_utils-4.0.3.post590.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-nucliadb_utils-4.0.3.post590.dist-info/RECORD,,
+nucliadb_utils-4.0.3.post592.dist-info/METADATA,sha256=BkJz8TUDW0BBDWiH06qFHh1kuB4u2QmRHwr33Pgix5M,2096
+nucliadb_utils-4.0.3.post592.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+nucliadb_utils-4.0.3.post592.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
+nucliadb_utils-4.0.3.post592.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+nucliadb_utils-4.0.3.post592.dist-info/RECORD,,