nucliadb-utils 4.0.3.post590__py3-none-any.whl → 4.0.3.post592__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
--- a/nucliadb_utils/settings.py
+++ b/nucliadb_utils/settings.py
@@ -52,6 +52,7 @@ class FileBackendConfig(Enum):
     GCS = "gcs"
     S3 = "s3"
     LOCAL = "local"
+    AZURE = "azure"
     NOT_SET = "notset"  # setting not provided
 
     @classmethod
@@ -113,8 +114,10 @@ class StorageSettings(BaseSettings):
         description="Number of days that uploaded files are kept in Nulia's processing engine",
     )
 
-    driver_pg_url: Optional[str] = None  # match same env var for k/v storage
-    driver_pg_connection_pool_max_size: int = 20  # match same env var for k/v storage
+    azure_connection_string: Optional[str] = Field(
+        default=None,
+        description="Azure Storage connection string: https://docs.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string",  # noqa
+    )
 
 
 storage_settings = StorageSettings()
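For reference, a minimal sketch of how the new backend option and connection-string setting are picked up from the environment (assuming the standard pydantic `BaseSettings` field-name-to-env-var mapping; the connection string below is a placeholder):

```python
import os

# Placeholder values, for illustration only.
os.environ["FILE_BACKEND"] = "azure"
os.environ["AZURE_CONNECTION_STRING"] = (
    "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>"
)

from nucliadb_utils.settings import StorageSettings

settings = StorageSettings()
print(settings.file_backend)             # FileBackendConfig.AZURE
print(settings.azure_connection_string)  # the connection string set above
```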
--- /dev/null
+++ b/nucliadb_utils/storages/azure.py
@@ -0,0 +1,415 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+from typing import AsyncGenerator, AsyncIterator, Optional, Union
+
+from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
+from azure.storage.blob import BlobProperties, BlobType, ContentSettings
+from azure.storage.blob.aio import BlobServiceClient
+
+from nucliadb_protos.resources_pb2 import CloudFile
+from nucliadb_utils.storages.exceptions import ObjectNotFoundError
+from nucliadb_utils.storages.object_store import ObjectStore
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
+
+logger = logging.getLogger(__name__)
+
+
+class AzureStorageField(StorageField):
+    storage: AzureStorage
+
+    async def move(
+        self,
+        origin_uri: str,
+        destination_uri: str,
+        origin_bucket_name: str,
+        destination_bucket_name: str,
+    ):
+        await self.storage.object_store.move(
+            origin_bucket_name, origin_uri, destination_bucket_name, destination_uri
+        )
+
+    async def copy(
+        self,
+        origin_uri: str,
+        destination_uri: str,
+        origin_bucket_name: str,
+        destination_bucket_name: str,
+    ):
+        await self.storage.object_store.copy(
+            origin_bucket_name, origin_uri, destination_bucket_name, destination_uri
+        )
+
+    async def iter_data(self, range: Optional[Range] = None) -> AsyncGenerator[bytes, None]:
+        if self.field is not None:
+            bucket = self.field.bucket_name
+            key = self.field.uri
+        else:
+            bucket = self.bucket
+            key = self.key
+        async for chunk in self.storage.object_store.download_stream(bucket, key, range):
+            yield chunk
+
+    async def start(self, cf: CloudFile) -> CloudFile:
+        """Init an upload.
+
+        cf: New file to upload
+        """
+        if self.field is not None and self.field.upload_uri != "":
+            # If there is a temporal url, delete it
+            await self.storage.delete_upload(self.field.upload_uri, self.field.bucket_name)
+        if self.field is not None and self.field.uri != "":
+            field: CloudFile = CloudFile(
+                filename=cf.filename,
+                size=cf.size,
+                content_type=cf.content_type,
+                bucket_name=self.bucket,
+                md5=cf.md5,
+                source=CloudFile.AZURE,
+                old_uri=self.field.uri,
+                old_bucket=self.field.bucket_name,
+            )
+            upload_uri = f"{self.key}-{datetime.now().isoformat()}"
+        else:
+            field = CloudFile(
+                filename=cf.filename,
+                size=cf.size,
+                md5=cf.md5,
+                content_type=cf.content_type,
+                bucket_name=self.bucket,
+                source=CloudFile.AZURE,
+            )
+            upload_uri = self.key
+        await self.storage.object_store.upload_multipart_start(
+            self.bucket,
+            upload_uri,
+            ObjectMetadata(
+                filename=cf.filename,
+                size=cf.size,
+                content_type=cf.content_type,
+            ),
+        )
+        field.offset = 0
+        field.upload_uri = upload_uri
+        return field
+
+    async def append(self, cf: CloudFile, iterable: AsyncIterator) -> int:
+        if self.field is None:
+            raise AttributeError()
+        return await self.storage.object_store.upload_multipart_append(
+            self.field.bucket_name, self.field.upload_uri, iterable
+        )
+
+    async def finish(self):
+        self.field.uri = self.key
+        self.field.ClearField("resumable_uri")
+        self.field.ClearField("offset")
+        self.field.ClearField("upload_uri")
+        self.field.ClearField("parts")
+
+    async def exists(self) -> Optional[ObjectMetadata]:
+        key = None
+        bucket = None
+        if self.field is not None and self.field.uri != "":
+            key = self.field.uri
+            bucket = self.field.bucket_name
+        elif self.key != "":
+            key = self.key
+            bucket = self.bucket
+        else:
+            return None
+        return await self.storage.object_store.get_metadata(bucket, key)
+
+    async def upload(self, iterator: AsyncIterator, origin: CloudFile) -> CloudFile:
+        self.field = await self.start(origin)
+        if self.field is None:
+            raise AttributeError()
+        await self.append(origin, iterator)
+        await self.finish()
+        return self.field
+
+    def __repr__(self):
+        return f"{self.storage.source}: {self.bucket}/{self.key}"
+
+
+class AzureStorage(Storage):
+    field_klass = AzureStorageField
+    object_store: ObjectStore
+    source = CloudFile.AZURE
+
+    def __init__(
+        self,
+        connection_string: str,
+        deadletter_bucket: str = "deadletter",
+        indexing_bucket: str = "indexing",
+    ):
+        self.object_store = AzureObjectStore(connection_string)
+        self.deadletter_bucket = deadletter_bucket
+        self.indexing_bucket = indexing_bucket
+
+    async def initialize(self, service_name: Optional[str] = None):
+        await self.object_store.initialize()
+        for bucket in [
+            self.deadletter_bucket,
+            self.indexing_bucket,
+        ]:
+            if bucket is None or bucket == "":
+                continue
+            try:
+                await self.object_store.bucket_create(bucket)
+            except Exception:
+                logger.exception(f"Could not create bucket {bucket}", exc_info=True)
+
+    async def finalize(self):
+        await self.object_store.finalize()
+
+    async def delete_upload(self, uri: str, bucket_name: str):
+        await self.object_store.delete(bucket_name, uri)
+
+    async def create_bucket(self, bucket_name: str, kbid: Optional[str] = None):
+        if await self.object_store.bucket_exists(bucket_name):
+            return
+        await self.object_store.bucket_create(bucket_name)
+
+    def get_bucket_name(self, kbid: str):
+        return f"nucliadb-{kbid}"
+
+    async def create_kb(self, kbid: str) -> bool:
+        bucket_name = self.get_bucket_name(kbid)
+        return await self.object_store.bucket_create(bucket_name)
+
+    async def schedule_delete_kb(self, kbid: str) -> bool:
+        bucket_name = self.get_bucket_name(kbid)
+        deleted, _ = await self.object_store.bucket_delete(bucket_name)
+        return deleted
+
+    async def delete_kb(self, kbid: str) -> tuple[bool, bool]:
+        bucket_name = self.get_bucket_name(kbid)
+        return await self.object_store.bucket_delete(bucket_name)
+
+    async def iterate_objects(self, bucket: str, prefix: str) -> AsyncGenerator[ObjectInfo, None]:
+        async for obj in self.object_store.iterate(bucket, prefix):
+            yield obj
+
+
+class AzureObjectStore(ObjectStore):
+    def __init__(self, connection_string: str):
+        self.connection_string = connection_string
+        self._service_client: Optional[BlobServiceClient] = None
+
+    @property
+    def service_client(self) -> BlobServiceClient:
+        if self._service_client is None:
+            raise AttributeError("Service client not initialized")
+        return self._service_client
+
+    async def initialize(self):
+        self._service_client = BlobServiceClient.from_connection_string(self.connection_string)
+
+    async def finalize(self):
+        try:
+            if self._service_client is not None:
+                await self._service_client.close()
+        except Exception:
+            logger.warning("Error closing Azure client", exc_info=True)
+        self._service_client = None
+
+    async def bucket_create(self, bucket: str, labels: dict[str, str] | None = None) -> bool:
+        container_client = self.service_client.get_container_client(bucket)
+        try:
+            await container_client.create_container()
+            return True
+        except ResourceExistsError:
+            return False
+
+    async def bucket_delete(self, bucket: str) -> tuple[bool, bool]:
+        container_client = self.service_client.get_container_client(bucket)
+        # There's never a conflict on Azure
+        conflict = False
+        deleted = False
+        try:
+            await container_client.delete_container()
+            deleted = True
+        except ResourceNotFoundError:
+            deleted = False
+        return deleted, conflict
+
+    async def bucket_exists(self, bucket: str) -> bool:
+        container_client = self.service_client.get_container_client(bucket)
+        try:
+            await container_client.get_container_properties()
+            return True
+        except ResourceNotFoundError:
+            return False
+
+    async def bucket_schedule_delete(self, bucket: str) -> None:
+        # In Azure, there is no option to schedule for deletion
+        await self.bucket_delete(bucket)
+
+    async def move(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None:
+        await self.copy(origin_bucket, origin_key, destination_bucket, destination_key)
+        await self.delete(origin_bucket, origin_key)
+
+    async def copy(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None:
+        origin_blob_client = self.service_client.get_blob_client(origin_bucket, origin_key)
+        origin_url = origin_blob_client.url
+        destination_blob_client = self.service_client.get_blob_client(
+            destination_bucket, destination_key
+        )
+        result = await destination_blob_client.start_copy_from_url(origin_url, requires_sync=True)
+        assert result["copy_status"] == "success"
+
+    async def delete(self, bucket: str, key: str) -> None:
+        container_client = self.service_client.get_container_client(bucket)
+        try:
+            await container_client.delete_blob(key, delete_snapshots="include")
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+
+    async def upload(
+        self,
+        bucket: str,
+        key: str,
+        data: Union[bytes, AsyncGenerator[bytes, None]],
+        metadata: ObjectMetadata,
+    ) -> None:
+        container_client = self.service_client.get_container_client(bucket)
+        length: Optional[int] = None
+        if isinstance(data, bytes):
+            length = len(data)
+            metadata.size = length
+        else:
+            length = metadata.size or None
+        custom_metadata = {key: str(value) for key, value in metadata.model_dump().items()}
+        await container_client.upload_blob(
+            name=key,
+            data=data,
+            length=length,
+            blob_type=BlobType.BLOCKBLOB,
+            metadata=custom_metadata,
+            content_settings=ContentSettings(
+                content_type=metadata.content_type,
+                content_disposition=f"attachment; filename={metadata.filename}",
+            ),
+        )
+
+    async def download(self, bucket: str, key: str) -> bytes:
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        try:
+            downloader = await blob_client.download_blob()
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+        return await downloader.readall()
+
+    async def download_stream(
+        self, bucket: str, key: str, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
+        range = range or Range()
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        offset = None
+        length = None
+        if range.any():
+            offset = range.start or 0
+            length = range.end - offset + 1 if range.end else None
+        try:
+            downloader = await blob_client.download_blob(
+                offset=offset,  # type: ignore
+                length=length,  # type: ignore
+            )
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+        async for chunk in downloader.chunks():
+            yield chunk
+
+    async def iterate(self, bucket: str, prefix: str) -> AsyncGenerator[ObjectInfo, None]:
+        container_client = self.service_client.get_container_client(bucket)
+        async for blob in container_client.list_blobs(name_starts_with=prefix):
+            yield ObjectInfo(name=blob.name)
+
+    async def get_metadata(self, bucket: str, key: str) -> ObjectMetadata:
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        try:
+            properties: BlobProperties = await blob_client.get_blob_properties()
+            return parse_object_metadata(properties, key)
+        except ResourceNotFoundError:
+            raise ObjectNotFoundError()
+
+    async def upload_multipart_start(self, bucket: str, key: str, metadata: ObjectMetadata) -> None:
+        container_client = self.service_client.get_container_client(bucket)
+        custom_metadata = {key: str(value) for key, value in metadata.model_dump().items()}
+        blob_client = container_client.get_blob_client(key)
+        await blob_client.create_append_blob(
+            metadata=custom_metadata,
+            content_settings=ContentSettings(
+                content_type=metadata.content_type,
+                content_disposition=f"attachment; filename={metadata.filename}",
+            ),
+        )
+
+    async def upload_multipart_append(
+        self, bucket: str, key: str, iterable: AsyncIterator[bytes]
+    ) -> int:
+        container_client = self.service_client.get_container_client(bucket)
+        blob_client = container_client.get_blob_client(key)
+        bytes_appended = 0
+        async for chunk in iterable:
+            bytes_appended += len(chunk)
+            await blob_client.append_block(data=chunk)
+        return bytes_appended
+
+    async def upload_multipart_finish(self, bucket: str, key: str) -> None:
+        # No need to do anything in Azure
+        pass
+
+
+def parse_object_metadata(properties: BlobProperties, key: str) -> ObjectMetadata:
+    custom_metadata = properties.metadata or {}
+    custom_metadata_size = custom_metadata.get("size")
+    if custom_metadata_size and custom_metadata_size != "0":
+        size = int(custom_metadata_size)
+    else:
+        size = properties.size
+    filename = custom_metadata.get("filename") or key.split("/")[-1]
+    content_type = custom_metadata.get("content_type") or properties.content_settings.content_type or ""
+    return ObjectMetadata(
+        filename=filename,
+        size=size,
+        content_type=content_type,
+    )
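The new module is a thin wrapper over the async client of `azure-storage-blob`. A usage sketch against the `AzureObjectStore` API added above (the connection string is a placeholder):

```python
import asyncio

from nucliadb_utils.storages.azure import AzureObjectStore
from nucliadb_utils.storages.utils import ObjectMetadata, Range


async def main() -> None:
    store = AzureObjectStore("<your-azure-connection-string>")  # placeholder
    await store.initialize()
    try:
        await store.bucket_create("my-bucket")
        await store.upload(
            "my-bucket",
            "docs/hello.txt",
            b"hello azure",
            ObjectMetadata(filename="hello.txt", content_type="text/plain", size=11),
        )
        # Ranged download: Range is inclusive, so this yields the first 5 bytes.
        async for chunk in store.download_stream("my-bucket", "docs/hello.txt", Range(start=0, end=4)):
            print(chunk)
    finally:
        await store.finalize()


asyncio.run(main())
```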
--- a/nucliadb_utils/storages/exceptions.py
+++ b/nucliadb_utils/storages/exceptions.py
@@ -75,3 +75,9 @@ class UnparsableResponse(Exception):
     Raised when trying to parse a response from a storage API and it's not
     possible
     """
+
+
+class ObjectNotFoundError(Exception):
+    """
+    Raised when the object is not found in storage
+    """
--- a/nucliadb_utils/storages/gcs.py
+++ b/nucliadb_utils/storages/gcs.py
@@ -47,13 +47,8 @@ from nucliadb_utils.storages.exceptions import (
     InvalidOffset,
     ResumableUploadGone,
 )
-from nucliadb_utils.storages.storage import (
-    ObjectInfo,
-    ObjectMetadata,
-    Range,
-    Storage,
-    StorageField,
-)
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 
 storage_ops_observer = metrics.Observer("gcs_ops", labels={"type": ""})
 
@@ -570,8 +565,8 @@ class GCSStorage(Storage):
     async def create_bucket(self, bucket_name: str, kbid: Optional[str] = None):
         if self.session is None:
            raise AttributeError()
-        exists = await self.check_exists(bucket_name=bucket_name)
-        if exists:
+
+        if await self.check_exists(bucket_name=bucket_name):
             return
 
         headers = await self.get_access_headers()
--- a/nucliadb_utils/storages/local.py
+++ b/nucliadb_utils/storages/local.py
@@ -30,13 +30,8 @@ import aiofiles
 
 from nucliadb_protos.resources_pb2 import CloudFile
 from nucliadb_utils.storages import CHUNK_SIZE
-from nucliadb_utils.storages.storage import (
-    ObjectInfo,
-    ObjectMetadata,
-    Range,
-    Storage,
-    StorageField,
-)
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 
 
 class LocalStorageField(StorageField):
@@ -79,6 +74,7 @@ class LocalStorageField(StorageField):
         shutil.copy(origin_path, destination_path)
 
     async def iter_data(self, range: Optional[Range] = None) -> AsyncGenerator[bytes, None]:
+        range = range or Range()
         key = self.field.uri if self.field else self.key
         if self.field is None:
             bucket = self.bucket
@@ -87,13 +83,13 @@ class LocalStorageField(StorageField):
 
         path = self.storage.get_file_path(bucket, key)
         async with aiofiles.open(path, mode="rb") as resp:
-            if range and range.start is not None:
+            if range.start is not None:
                 # Seek to the start of the range
                 await resp.seek(range.start)
 
             bytes_read = 0
             bytes_to_read = None  # If None, read until EOF
-            if range and range.end is not None:
+            if range.end is not None:
                 # Range is inclusive
                 bytes_to_read = range.end - (range.start or 0) + 1
 
--- /dev/null
+++ b/nucliadb_utils/storages/object_store.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import abc
+from typing import AsyncGenerator, AsyncIterator, Optional, Union
+
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
+
+
+class ObjectStore(abc.ABC, metaclass=abc.ABCMeta):
+    """
+    Generic interface for object storage services.
+    This must NOT include any NucliaDB/Nuclia specific logic.
+    """
+
+    @abc.abstractmethod
+    async def initialize(self) -> None: ...
+
+    @abc.abstractmethod
+    async def finalize(self) -> None: ...
+
+    @abc.abstractmethod
+    async def bucket_create(self, bucket: str, labels: Optional[dict[str, str]] = None) -> bool:
+        """
+        Create a new bucket in the object storage. Labels the bucket with the given labels if provided.
+        Returns True if the bucket was created, False if it already existed.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def bucket_exists(self, bucket: str) -> bool:
+        """
+        Return True if the bucket exists, False otherwise.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def bucket_delete(self, bucket: str) -> tuple[bool, bool]:
+        """
+        Delete a bucket in the object storage. Returns a tuple with two boolean values:
+        - The first one indicates if the bucket was deleted.
+        - The second one indicates if there was a conflict.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def bucket_schedule_delete(self, bucket: str) -> None:
+        """
+        Mark a bucket for deletion. The bucket will be deleted asynchronously.
+        """
+        ...
+
+    @abc.abstractmethod
+    async def move(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None: ...
+
+    @abc.abstractmethod
+    async def copy(
+        self,
+        origin_bucket: str,
+        origin_key: str,
+        destination_bucket: str,
+        destination_key: str,
+    ) -> None: ...
+
+    @abc.abstractmethod
+    async def delete(self, bucket: str, key: str) -> None: ...
+
+    @abc.abstractmethod
+    async def upload(
+        self,
+        bucket: str,
+        key: str,
+        data: Union[bytes, AsyncGenerator[bytes, None]],
+        metadata: ObjectMetadata,
+    ) -> None: ...
+
+    @abc.abstractmethod
+    async def download(self, bucket: str, key: str) -> bytes: ...
+
+    @abc.abstractmethod
+    async def download_stream(
+        self, bucket: str, key: str, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
+        raise NotImplementedError()
+        yield b""
+
+    @abc.abstractmethod
+    async def iterate(self, bucket: str, prefix: str) -> AsyncGenerator[ObjectInfo, None]:
+        raise NotImplementedError()
+        yield ObjectInfo(name="")
+
+    @abc.abstractmethod
+    async def get_metadata(self, bucket: str, key: str) -> ObjectMetadata: ...
+
+    @abc.abstractmethod
+    async def upload_multipart_start(
+        self, bucket: str, key: str, metadata: ObjectMetadata
+    ) -> Optional[str]:
+        """
+        Start a multipart upload. May return the url for the resumable upload.
+        """
+
+    @abc.abstractmethod
+    async def upload_multipart_append(
+        self, bucket: str, key: str, iterable: AsyncIterator[bytes]
+    ) -> int:
+        """
+        Append data to a multipart upload. Returns the number of bytes uploaded.
+        """
+
+    @abc.abstractmethod
+    async def upload_multipart_finish(self, bucket: str, key: str) -> None: ...
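A sketch of backend-agnostic code written purely against this new interface; it would work identically over the Azure implementation above or any other `ObjectStore` (it assumes the destination bucket already exists):

```python
from nucliadb_utils.storages.object_store import ObjectStore


async def copy_prefix(src: ObjectStore, dst: ObjectStore, bucket: str, prefix: str) -> int:
    """Copy every object under `prefix` from one store to another. Illustrative helper."""
    copied = 0
    async for obj in src.iterate(bucket, prefix):
        data = await src.download(bucket, obj.name)
        metadata = await src.get_metadata(bucket, obj.name)
        await dst.upload(bucket, obj.name, data, metadata)
        copied += 1
    return copied
```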
--- a/nucliadb_utils/storages/s3.py
+++ b/nucliadb_utils/storages/s3.py
@@ -34,13 +34,8 @@ from nucliadb_protos.resources_pb2 import CloudFile
 from nucliadb_telemetry import errors
 from nucliadb_utils import logger
 from nucliadb_utils.storages.exceptions import UnparsableResponse
-from nucliadb_utils.storages.storage import (
-    ObjectInfo,
-    ObjectMetadata,
-    Range,
-    Storage,
-    StorageField,
-)
+from nucliadb_utils.storages.storage import Storage, StorageField
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 
 MB = 1024 * 1024
 MIN_UPLOAD_SIZE = 5 * MB
--- a/nucliadb_utils/storages/storage.py
+++ b/nucliadb_utils/storages/storage.py
@@ -22,7 +22,6 @@ from __future__ import annotations
 import abc
 import hashlib
 import uuid
-from dataclasses import dataclass
 from io import BytesIO
 from typing import (
     Any,
@@ -36,8 +35,6 @@ from typing import (
     cast,
 )
 
-from pydantic import BaseModel
-
 from nucliadb_protos.noderesources_pb2 import Resource as BrainResource
 from nucliadb_protos.nodewriter_pb2 import IndexMessage
 from nucliadb_protos.resources_pb2 import CloudFile
@@ -46,6 +43,7 @@ from nucliadb_utils import logger
 from nucliadb_utils.helpers import async_gen_lookahead
 from nucliadb_utils.storages import CHUNK_SIZE
 from nucliadb_utils.storages.exceptions import IndexDataNotFound, InvalidCloudFile
+from nucliadb_utils.storages.utils import ObjectInfo, ObjectMetadata, Range
 from nucliadb_utils.utilities import get_local_storage, get_nuclia_storage
 
 STORAGE_RESOURCE = "kbs/{kbid}/r/{uuid}"
@@ -61,33 +59,6 @@ INDEXING_KEY = "index/{kb}/{shard}/{resource}/{txid}"
 MESSAGE_KEY = "message/{kbid}/{rid}/{mid}"
 
 
-class ObjectInfo(BaseModel):
-    name: str
-
-
-class ObjectMetadata(BaseModel):
-    filename: str
-    content_type: str
-    size: int
-
-
-@dataclass
-class Range:
-    """
-    Represents a range of bytes to be downloaded from a file. The range is inclusive.
-    The start and end values are 0-based.
-    """
-
-    start: Optional[int] = None
-    end: Optional[int] = None
-
-    def any(self) -> bool:
-        return self.start is not None or self.end is not None
-
-    def to_header(self) -> str:
-        return f"bytes={self.start or 0}-{self.end or ''}"
-
-
 class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     storage: Storage
     bucket: str
@@ -146,7 +117,10 @@ class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     async def start(self, cf: CloudFile) -> CloudFile: ...
 
     @abc.abstractmethod
-    async def append(self, cf: CloudFile, iterable: AsyncIterator) -> int: ...
+    async def append(self, cf: CloudFile, iterable: AsyncIterator) -> int:
+        """
+        Returns the number of bytes appended.
+        """
 
     @abc.abstractmethod
     async def finish(self): ...
--- /dev/null
+++ b/nucliadb_utils/storages/utils.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from dataclasses import dataclass
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ObjectInfo(BaseModel):
+    name: str
+
+
+class ObjectMetadata(BaseModel):
+    filename: str
+    content_type: str
+    size: int
+
+
+@dataclass
+class Range:
+    """
+    Represents a range of bytes to be downloaded from a file. The range is inclusive.
+    The start and end values are 0-based.
+    """
+
+    start: Optional[int] = None
+    end: Optional[int] = None
+
+    def any(self) -> bool:
+        return self.start is not None or self.end is not None
+
+    def to_header(self) -> str:
+        return f"bytes={self.start or 0}-{self.end or ''}"
--- /dev/null
+++ b/nucliadb_utils/tests/azure.py
@@ -0,0 +1,119 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from dataclasses import dataclass
+from typing import Generator
+
+import pytest
+from pytest_docker_fixtures import images  # type: ignore
+from pytest_docker_fixtures.containers._base import BaseImage  # type: ignore
+
+from nucliadb_utils.storages.azure import AzureStorage
+from nucliadb_utils.store import MAIN
+from nucliadb_utils.utilities import Utility
+
+images.settings["azurite"] = {
+    "image": "mcr.microsoft.com/azure-storage/azurite",
+    "version": "3.30.0",
+    "options": {
+        "ports": {"10000": None},
+        "command": " ".join(
+            [
+                # To start the blob service only -- by default is on port 10000
+                "azurite-blob",
+                # So we can access it from outside the container
+                "--blobHost 0.0.0.0",
+            ]
+        ),
+    },
+    "env": {},
+}
+
+
+class Azurite(BaseImage):
+    name = "azurite"
+    port = 10000
+
+    def check(self):
+        try:
+            from azure.storage.blob import BlobServiceClient  # type: ignore
+
+            container_port = self.port
+            host_port = self.get_port(port=container_port)
+            conn_string = get_connection_string(self.host, host_port)
+
+            client = BlobServiceClient.from_connection_string(conn_string)
+            container_client = client.get_container_client("foo")
+            container_client.create_container()
+            container_client.delete_container()
+            return True
+        except Exception as ex:
+            print(ex)
+            return False
+
+
+@dataclass
+class AzuriteFixture:
+    host: str
+    port: int
+    container: BaseImage
+    connection_string: str
+
+
+def get_connection_string(host, port) -> str:
+    """
+    We're using the default Azurite credentials for testing purposes.
+    """
+    parts = [
+        "DefaultEndpointsProtocol=http",
+        "AccountName=devstoreaccount1",
+        "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
+        f"BlobEndpoint=http://{host}:{port}/devstoreaccount1",
+    ]
+    return ";".join(parts)
+
+
+@pytest.fixture(scope="session")
+def azurite() -> Generator[AzuriteFixture, None, None]:
+    container = Azurite()
+    host, port = container.run()
+    try:
+        yield AzuriteFixture(
+            host=host,
+            port=port,
+            container=container.container_obj,
+            connection_string=get_connection_string(host, port),
+        )
+    finally:
+        container.stop()
+
+
+@pytest.fixture(scope="function")
+async def azure_storage(azurite):
+    storage = AzureStorage(
+        connection_string=azurite.connection_string,
+    )
+    MAIN[Utility.STORAGE] = storage
+    await storage.initialize()
+    try:
+        yield storage
+    finally:
+        await storage.finalize()
+        if Utility.STORAGE in MAIN:
+            del MAIN[Utility.STORAGE]
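A sketch of a test consuming these fixtures (it assumes the project's usual async test setup, e.g. pytest-asyncio, and that this module is loaded as a pytest plugin; the test name and kbid are illustrative):

```python
import pytest


@pytest.mark.asyncio
async def test_azure_kb_lifecycle(azure_storage):
    # azure_storage is the AzureStorage instance wired to the Azurite container.
    assert await azure_storage.create_kb("testkb")
    assert await azure_storage.schedule_delete_kb("testkb")
```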
--- a/nucliadb_utils/transaction.py
+++ b/nucliadb_utils/transaction.py
@@ -24,6 +24,7 @@ from functools import partial
 from typing import Any, Optional, Union
 
 import nats
+import nats.errors
 from nats.aio.client import Client
 from nats.js.client import JetStreamContext
 
@@ -53,6 +54,10 @@ class TransactionCommitTimeoutError(Exception):
     pass
 
 
+class MaxTransactionSizeExceededError(Exception):
+    pass
+
+
 class LocalTransactionUtility:
     async def commit(
         self,
@@ -195,7 +200,10 @@ class TransactionUtility:
         if target_subject is None:
             target_subject = const.Streams.INGEST.subject.format(partition=partition)
 
-        res = await self.js.publish(target_subject, writer.SerializeToString(), headers=headers)
+        try:
+            res = await self.js.publish(target_subject, writer.SerializeToString(), headers=headers)
+        except nats.errors.MaxPayloadError as ex:
+            raise MaxTransactionSizeExceededError() from ex
 
         waiting_for.seq = res.seq
 
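A sketch of how a caller might react to the new error; the exact `commit()` signature is not part of this diff, so the arguments below are illustrative:

```python
from nucliadb_utils.transaction import MaxTransactionSizeExceededError


async def commit_with_limit_check(transaction, writer, partition):
    try:
        # transaction is a TransactionUtility; arguments are illustrative.
        return await transaction.commit(writer, partition)
    except MaxTransactionSizeExceededError:
        # The serialized message exceeded the NATS max payload; e.g. map this
        # to an HTTP 413 at the API layer instead of a generic 500.
        raise
```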
--- a/nucliadb_utils/utilities.py
+++ b/nucliadb_utils/utilities.py
@@ -98,7 +98,21 @@ async def get_storage(
     if Utility.STORAGE in MAIN:
         return MAIN[Utility.STORAGE]
 
-    if storage_settings.file_backend == FileBackendConfig.S3:
+    if storage_settings.file_backend == FileBackendConfig.AZURE:
+        from nucliadb_utils.storages.azure import AzureStorage
+
+        if storage_settings.azure_connection_string is None:
+            raise ConfigurationError("AZURE_CONNECTION_STRING env var not configured")
+
+        azureutil = AzureStorage(
+            connection_string=storage_settings.azure_connection_string,
+        )
+
+        logger.info("Configuring Azure Storage")
+        await azureutil.initialize()
+        set_utility(Utility.STORAGE, azureutil)
+
+    elif storage_settings.file_backend == FileBackendConfig.S3:
         from nucliadb_utils.storages.s3 import S3Storage
 
         s3util = S3Storage(
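With `FILE_BACKEND=azure` and `AZURE_CONNECTION_STRING` set, `get_storage()` now builds, initializes and caches an `AzureStorage` utility; a sketch, leaving any optional parameters of `get_storage()` at their defaults:

```python
from nucliadb_utils.utilities import get_storage


async def example() -> None:
    storage = await get_storage()      # returns the cached AzureStorage utility
    await storage.create_kb("mykb")    # objects land in the "nucliadb-mykb" container
```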
--- a/nucliadb_utils-4.0.3.post590.dist-info/METADATA
+++ b/nucliadb_utils-4.0.3.post592.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nucliadb_utils
-Version: 4.0.3.post590
+Version: 4.0.3.post592
 Home-page: https://nuclia.com
 License: BSD
 Classifier: Development Status :: 4 - Beta
@@ -23,8 +23,8 @@ Requires-Dist: PyNaCl
 Requires-Dist: pyjwt >=2.4.0
 Requires-Dist: memorylru >=1.1.2
 Requires-Dist: mrflagly
-Requires-Dist: nucliadb-protos >=4.0.3.post590
-Requires-Dist: nucliadb-telemetry >=4.0.3.post590
+Requires-Dist: nucliadb-protos >=4.0.3.post592
+Requires-Dist: nucliadb-telemetry >=4.0.3.post592
 Provides-Extra: cache
 Requires-Dist: redis >=4.3.4 ; extra == 'cache'
 Requires-Dist: orjson >=3.6.7 ; extra == 'cache'
@@ -43,6 +43,7 @@ Requires-Dist: types-aiofiles >=0.8.3 ; extra == 'storages'
 Requires-Dist: aiofiles >=0.8.0 ; extra == 'storages'
 Requires-Dist: backoff >=1.11.1 ; extra == 'storages'
 Requires-Dist: google-auth >=2.4.1 ; extra == 'storages'
+Requires-Dist: azure-storage-blob >=12.20.0 ; extra == 'storages'
 
 # nucliadb util python library
 
--- a/nucliadb_utils-4.0.3.post590.dist-info/RECORD
+++ b/nucliadb_utils-4.0.3.post592.dist-info/RECORD
@@ -12,11 +12,11 @@ nucliadb_utils/nats.py,sha256=zTAXECDXeCPtydk3F_6EMFDZ059kK0UYUU_tnWoxgXs,8208
 nucliadb_utils/partition.py,sha256=jBgy4Hu5Iwn4gjbPPcthSykwf-qNx-GcLAIwbzPd1d0,1157
 nucliadb_utils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nucliadb_utils/run.py,sha256=HpAIM8xbR7UpVC2_7xOjB4fYbUVykyPP6yHrv2RD3DI,1707
-nucliadb_utils/settings.py,sha256=fI3AOn30tNvYS_PqoKilVhJN4OppPAGCM6-OUUitO2s,7192
+nucliadb_utils/settings.py,sha256=VXZuq-4-RvsUsMIjL-wRRjzCqM-b_AnTHsW_hGrax_o,7281
 nucliadb_utils/signals.py,sha256=JRNv2y9zLtBjOANBf7krGfDGfOc9qcoXZ6N1nKWS2FE,2674
 nucliadb_utils/store.py,sha256=kQ35HemE0v4_Qg6xVqNIJi8vSFAYQtwI3rDtMsNy62Y,890
-nucliadb_utils/transaction.py,sha256=ym9hmPAoIt8xgxjd8JHG14_PelYTqhUOVfUAq_ghJDM,7100
-nucliadb_utils/utilities.py,sha256=E7W9TzvbyJ7_Yenho9CT059E_g4JQOCS02HrGurwNqs,13603
+nucliadb_utils/transaction.py,sha256=mwcI3aIHAvU5KOGqd_Uz_d1XQzXhk_-NWY8NqU1lfb0,7307
+nucliadb_utils/utilities.py,sha256=Vb7lXUSDtqS_7tNqI4CDQD2woHO_JFUGSjC2Yj4-uEA,14135
 nucliadb_utils/audit/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb_utils/audit/audit.py,sha256=dn5ZnCVQUlCcvdjzaORghbrjk9QgVGrtkfIftq30Bp8,2819
 nucliadb_utils/audit/basic.py,sha256=NViey6mKbCXqRTLDBX2xNTcCg9I-2e4oB2xkekuhDvM,3392
@@ -39,23 +39,27 @@ nucliadb_utils/nuclia_usage/protos/kb_usage_pb2_grpc.pyi,sha256=6RIsZ2934iodEckf
 nucliadb_utils/nuclia_usage/utils/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb_utils/nuclia_usage/utils/kb_usage_report.py,sha256=E1eUSFXBVNzQP9Q2rWj9y3koCO5S7iKwckny_AoLKuk,3870
 nucliadb_utils/storages/__init__.py,sha256=5Qc8AUWiJv9_JbGCBpAn88AIJhwDlm0OPQpg2ZdRL4U,872
-nucliadb_utils/storages/exceptions.py,sha256=BfJcn0-60Ts2gLHRTxQKD0QuR7L4WDJtIdsUp7zhQ0k,2395
-nucliadb_utils/storages/gcs.py,sha256=KQ9puMOE89CPIA8q8DeCs7qOp0YoB5ZctXPws1h7lbA,27006
-nucliadb_utils/storages/local.py,sha256=GAEzvbmLzEeEJhhIWKa-vX2i9B0qdq6mbHMolpa2Q20,10259
+nucliadb_utils/storages/azure.py,sha256=7cWbe144WIwt10IM6XPRThkjeg5M_U0WInj4Jk75THw,15591
+nucliadb_utils/storages/exceptions.py,sha256=mm_wX4YRtp7u7enkk_4pMSlX5AQQuFbq4xLmupVDt3Y,2502
+nucliadb_utils/storages/gcs.py,sha256=WblkxWoa1brevsJV3ebiE6s7Wb_eXFScw41202f5uP4,26999
+nucliadb_utils/storages/local.py,sha256=NxC_nMBd38NDsR266DSgoBLdQlvUwf0_sd50r-BLI0E,10288
 nucliadb_utils/storages/nuclia.py,sha256=vEv94xAT7QM2g80S25QyrOw2pzvP2BAX-ADgZLtuCVc,2097
-nucliadb_utils/storages/s3.py,sha256=ABzS9X3fj7SUq-3cLvnKEClngb8hcPyKNSfxubMpyCo,19256
+nucliadb_utils/storages/object_store.py,sha256=Tw10GmpYfM5TMqJ3Tk9pLQ9wLMBk1-snL_m6uasiZDQ,4257
+nucliadb_utils/storages/s3.py,sha256=8KV-V7EiqRYhXYlGN0UjzM-v1Pj2Zh7NtXDikG96knU,19272
 nucliadb_utils/storages/settings.py,sha256=ugCPy1zxBOmA2KosT-4tsjpvP002kg5iQyi42yCGCJA,1285
-nucliadb_utils/storages/storage.py,sha256=SWeQv6136ruj7TvCPQR6WkG458IDEz2fzQQjkDRRReQ,20533
+nucliadb_utils/storages/storage.py,sha256=Ask2f1xuQHxavF3uKXXrmjOeY7w3ZljpZlcvmIh2EVU,20060
+nucliadb_utils/storages/utils.py,sha256=8g2rIwJeYIumQLOB47Yw1rx3twlhRB_cJxer65QfZmk,1479
 nucliadb_utils/tests/__init__.py,sha256=Oo9CAE7B0eW5VHn8sHd6o30SQzOWUhktLPRXdlDOleA,1456
 nucliadb_utils/tests/asyncbenchmark.py,sha256=x4be2IwCawle9zWgMOJkmwoUwk5p1tv7cLQGmybkEOg,10587
+nucliadb_utils/tests/azure.py,sha256=ji-BV54m_MHAB9KdSToYgsZ8OZZ-C5Yq2VqWOYZNMs4,3668
 nucliadb_utils/tests/fixtures.py,sha256=j58fTvoWZClC52LX7QOvLXX9DS5QbytSnRp0F4nGzN8,1671
 nucliadb_utils/tests/gcs.py,sha256=Ii8BCHUAAxFIzX67pKTRFRgbqv3FJ6DrPAdAx2Xod1Y,3036
 nucliadb_utils/tests/indexing.py,sha256=YW2QhkhO9Q_8A4kKWJaWSvXvyQ_AiAwY1VylcfVQFxk,1513
 nucliadb_utils/tests/local.py,sha256=c3gZJJWmvOftruJkIQIwB3q_hh3uxEhqGIAVWim1Bbk,1343
 nucliadb_utils/tests/nats.py,sha256=Tosonm9A9cusImyji80G4pgdXEHNVPaCLT5TbFK_ra0,7543
 nucliadb_utils/tests/s3.py,sha256=YB8QqDaBXxyhHonEHmeBbRRDmvB7sTOaKBSi8KBGokg,2330
-nucliadb_utils-4.0.3.post590.dist-info/METADATA,sha256=G1YsUouvYuDg6mQyvsor1uvc9Wed9dCuvZxeRfBmBxw,2030
-nucliadb_utils-4.0.3.post590.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-nucliadb_utils-4.0.3.post590.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
-nucliadb_utils-4.0.3.post590.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-nucliadb_utils-4.0.3.post590.dist-info/RECORD,,
+nucliadb_utils-4.0.3.post592.dist-info/METADATA,sha256=BkJz8TUDW0BBDWiH06qFHh1kuB4u2QmRHwr33Pgix5M,2096
+nucliadb_utils-4.0.3.post592.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+nucliadb_utils-4.0.3.post592.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
+nucliadb_utils-4.0.3.post592.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+nucliadb_utils-4.0.3.post592.dist-info/RECORD,,