nucliadb-utils 4.0.3.post577__py3-none-any.whl → 4.0.3.post579__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
nucliadb_utils/settings.py

@@ -51,7 +51,6 @@ http_settings = HTTPSettings()
 class FileBackendConfig(Enum):
     GCS = "gcs"
     S3 = "s3"
-    PG = "pg"
     LOCAL = "local"
     NOT_SET = "notset"  # setting not provided
 
nucliadb_utils/storages/gcs.py

@@ -26,7 +26,7 @@ import socket
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from datetime import datetime
-from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, cast
 from urllib.parse import quote_plus
 
 import aiohttp
@@ -50,6 +50,7 @@ from nucliadb_utils.storages.exceptions import (
 from nucliadb_utils.storages.storage import (
     ObjectInfo,
     ObjectMetadata,
+    Range,
     Storage,
     StorageField,
 )
@@ -162,11 +163,13 @@ class GCSStorageField(StorageField):
         assert data["resource"]["name"] == destination_uri
 
     @storage_ops_observer.wrap({"type": "iter_data"})
-    async def iter_data(self, headers=None):
+    async def iter_data(
+        self, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
         attempt = 1
         while True:
             try:
-                async for chunk in self._inner_iter_data(headers=headers):
+                async for chunk in self._inner_iter_data(range=range):
                     yield chunk
                 break
             except ReadingResponseContentException:
@@ -185,23 +188,26 @@ class GCSStorageField(StorageField):
                 attempt += 1
 
     @storage_ops_observer.wrap({"type": "inner_iter_data"})
-    async def _inner_iter_data(self, headers=None):
-        if headers is None:
-            headers = {}
+    async def _inner_iter_data(self, range: Optional[Range] = None):
+        """
+        Iterate through object data.
+        """
+        range = range or Range()
+        assert self.storage.session is not None
 
+        headers = await self.storage.get_access_headers()
+        if range.any():
+            headers["Range"] = range.to_header()
         key = self.field.uri if self.field else self.key
         if self.field is None:
            bucket = self.bucket
         else:
             bucket = self.field.bucket_name
-
         url = "{}/{}/o/{}".format(
             self.storage.object_base_url,
             bucket,
             quote_plus(key),
         )
-        headers.update(await self.storage.get_access_headers())
-
         async with self.storage.session.get(
             url, headers=headers, params={"alt": "media"}, timeout=-1
         ) as api_resp:
@@ -209,11 +215,6 @@ class GCSStorageField(StorageField):
                 text = await api_resp.text()
                 if api_resp.status == 404:
                     raise KeyError(f"Google cloud file not found : \n {text}")
-                elif api_resp.status == 401:
-                    logger.warning(f"Invalid google cloud credentials error: {text}")
-                    raise KeyError(
-                        content={f"Google cloud invalid credentials : \n {text}"}
-                    )
                 raise GoogleCloudException(f"{api_resp.status}: {text}")
             while True:
                 try:
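Note: with this change the GCS reader no longer accepts arbitrary headers; a partial read is expressed as a Range value and translated into a standard HTTP Range header on the media-download request shown above. A rough standalone sketch of the equivalent request follows; the base URL constant, the bucket/object names and the token handling are illustrative assumptions, not code from this package.

```python
import aiohttp
from urllib.parse import quote_plus

# Assumption: the public GCS JSON API endpoint; the real code uses storage.object_base_url.
OBJECT_BASE_URL = "https://storage.googleapis.com/storage/v1/b"


async def read_first_kib(session: aiohttp.ClientSession, token: str, bucket: str, key: str) -> bytes:
    # Same URL shape as GCSStorageField._inner_iter_data builds above.
    url = f"{OBJECT_BASE_URL}/{bucket}/o/{quote_plus(key)}"
    headers = {
        "Authorization": f"Bearer {token}",  # roughly what get_access_headers() provides
        "Range": "bytes=0-1023",             # inclusive range, as produced by Range.to_header()
    }
    async with session.get(url, headers=headers, params={"alt": "media"}) as resp:
        resp.raise_for_status()
        return await resp.read()
```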
@@ -225,16 +226,6 @@ class GCSStorageField(StorageField):
                 else:
                     break
 
-    @storage_ops_observer.wrap({"type": "read_range"})
-    async def read_range(self, start: int, end: int) -> AsyncGenerator[bytes, None]:
-        """
-        Iterate through ranges of data
-        """
-        async for chunk in self.iter_data(
-            headers={"Range": f"bytes={start}-{end - 1}"}
-        ):
-            yield chunk
-
     @backoff.on_exception(
         backoff.expo,
         RETRIABLE_EXCEPTIONS,
@@ -443,18 +434,8 @@ class GCSStorageField(StorageField):
         async with self.storage.session.get(url, headers=headers) as api_resp:
             if api_resp.status == 200:
                 data = await api_resp.json()
-                metadata = data.get("metadata") or {}
-                metadata = {k.lower(): v for k, v in metadata.items()}
-                size = metadata.get("size") or data.get("size") or 0
-                content_type = (
-                    metadata.get("content_type") or data.get("contentType") or ""
-                )
-                filename = metadata.get("filename") or key.split("/")[-1]
-                return ObjectMetadata(
-                    filename=filename,
-                    size=int(size),
-                    content_type=content_type,
-                )
+                data = cast(dict[str, Any], data)
+                return parse_object_metadata(data, key)
             else:
                 return None
 
@@ -758,3 +739,31 @@ class GCSStorage(Storage):
             for item in items:
                 yield ObjectInfo(name=item["name"])
             page_token = data.get("nextPageToken")
+
+
+def parse_object_metadata(object_data: dict[str, Any], key: str) -> ObjectMetadata:
+    custom_metadata: dict[str, str] = object_data.get("metadata") or {}
+    # Lowercase all keys for backwards compatibility with old custom metadata
+    custom_metadata = {k.lower(): v for k, v in custom_metadata.items()}
+
+    # Parse size
+    custom_size = custom_metadata.get("size")
+    if not custom_size or custom_size == "0":
+        data_size = object_data.get("size")
+        size = int(data_size) if data_size else 0
+    else:
+        size = int(custom_size)
+
+    # Parse content-type
+    content_type = (
+        custom_metadata.get("content_type") or object_data.get("contentType") or ""
+    )
+
+    # Parse filename
+    filename = custom_metadata.get("filename") or key.split("/")[-1]
+
+    return ObjectMetadata(
+        filename=filename,
+        size=int(size),
+        content_type=content_type,
+    )
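Note: the new module-level parse_object_metadata helper makes the precedence rules explicit: custom metadata attached at upload time wins, except for a missing or zero size, which falls back to the object resource's own size field. A small illustrative call, using a made-up GCS object resource (the import assumes the new package version is installed):

```python
from nucliadb_utils.storages.gcs import parse_object_metadata

fake_gcs_object = {
    "size": "2048",  # GCS reports object size as a string
    "contentType": "application/pdf",
    "metadata": {"FILENAME": "report.pdf", "SIZE": "0"},  # mixed-case keys from older uploads
}
meta = parse_object_metadata(fake_gcs_object, key="kb/resource/file/report.pdf")
# The custom "0" size is ignored in favour of the object resource size, and the
# upper-cased custom keys are still honoured thanks to the lowercasing step.
assert meta.size == 2048
assert meta.content_type == "application/pdf"
assert meta.filename == "report.pdf"
```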
nucliadb_utils/storages/local.py

@@ -24,7 +24,7 @@ import json
 import os
 import shutil
 from datetime import datetime
-from typing import AsyncGenerator, AsyncIterator, Dict, Optional
+from typing import AsyncGenerator, AsyncIterator, Optional
 
 import aiofiles
 from nucliadb_protos.resources_pb2 import CloudFile
@@ -33,6 +33,7 @@ from nucliadb_utils.storages import CHUNK_SIZE
 from nucliadb_utils.storages.storage import (
     ObjectInfo,
     ObjectMetadata,
+    Range,
     Storage,
     StorageField,
 )
@@ -77,7 +78,9 @@ class LocalStorageField(StorageField):
         destination_path = f"{destination_bucket_path}/{destination_uri}"
         shutil.copy(origin_path, destination_path)
 
-    async def iter_data(self, headers=None):
+    async def iter_data(
+        self, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
         key = self.field.uri if self.field else self.key
         if self.field is None:
             bucket = self.bucket
@@ -86,34 +89,36 @@ class LocalStorageField(StorageField):
 
         path = self.storage.get_file_path(bucket, key)
         async with aiofiles.open(path, mode="rb") as resp:
+
+            if range and range.start is not None:
+                # Seek to the start of the range
+                await resp.seek(range.start)
+
+            bytes_read = 0
+            bytes_to_read = None  # If None, read until EOF
+            if range and range.end is not None:
+                # Range is inclusive
+                bytes_to_read = range.end - (range.start or 0) + 1
+
             while True:
-                data = await resp.read(CHUNK_SIZE)
-                if not data:
+                chunk_size = CHUNK_SIZE
+                if bytes_to_read is not None:
+                    if bytes_read >= bytes_to_read:
+                        # Reached the end of the range
+                        break
+                    chunk_size = min(CHUNK_SIZE, bytes_to_read)
+
+                if chunk_size <= 0:
+                    # No more data to read
                     break
-                yield data
 
-    async def read_range(self, start: int, end: int) -> AsyncGenerator[bytes, None]:
-        """
-        Iterate through ranges of data
-        """
-        key = self.field.uri if self.field else self.key
-        if self.field is None:
-            bucket = self.bucket
-        else:
-            bucket = self.field.bucket_name
+                data = await resp.read(chunk_size)
+                if not data:
+                    # EOF
+                    break
 
-        path = self.storage.get_file_path(bucket, key)
-        async with aiofiles.open(path, "rb") as resp:
-            await resp.seek(start)
-            count = 0
-            data = await resp.read(CHUNK_SIZE)
-            while data and count < end:
-                if count + len(data) > end:
-                    new_end = end - count
-                    data = data[:new_end]
                 yield data
-                count += len(data)
-                data = await resp.read(CHUNK_SIZE)
+                bytes_read += len(data)
 
     async def start(self, cf: CloudFile) -> CloudFile:
         if self.field is not None and self.field.upload_uri != "":
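Note: LocalStorageField.iter_data now folds the old read_range logic into a single loop: seek to range.start, then cap every read so that at most end - start + 1 bytes are returned (the range is inclusive on both ends). A standalone sketch of the same arithmetic on a plain file; the file name is a placeholder and is assumed to be at least 200 bytes long:

```python
def read_inclusive_range(path: str, start: int, end: int, chunk_size: int = 64) -> bytes:
    out = b""
    bytes_to_read = end - start + 1  # inclusive on both ends
    with open(path, "rb") as f:
        f.seek(start)
        while len(out) < bytes_to_read:
            chunk = f.read(min(chunk_size, bytes_to_read - len(out)))
            if not chunk:  # EOF before the requested end
                break
            out += chunk
    return out


# bytes 100..199 inclusive -> exactly 100 bytes (given the file is long enough)
assert len(read_inclusive_range("blob.bin", 100, 199)) == 100
```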
@@ -285,17 +290,9 @@ class LocalStorage(Storage):
         for key in glob.glob(f"{bucket}/{prefix}*"):
             yield ObjectInfo(name=key)
 
-    async def download(
-        self, bucket_name: str, key: str, headers: Optional[Dict[str, str]] = None
-    ):
+    async def download(self, bucket_name: str, key: str, range: Optional[Range] = None):
         key_path = self.get_file_path(bucket_name, key)
         if not os.path.exists(key_path):
             return
-
-        async with aiofiles.open(key_path, mode="rb") as f:
-            while True:
-                body = await f.read(self.chunk_size)
-                if body == b"" or body is None:
-                    break
-                else:
-                    yield body
+        async for chunk in super().download(bucket_name, key, range=range):
+            yield chunk
nucliadb_utils/storages/s3.py

@@ -37,6 +37,7 @@ from nucliadb_utils.storages.exceptions import UnparsableResponse
 from nucliadb_utils.storages.storage import (
     ObjectInfo,
     ObjectMetadata,
+    Range,
     Storage,
     StorageField,
 )
@@ -81,15 +82,21 @@ class S3StorageField(StorageField):
         jitter=backoff.random_jitter,
         max_tries=MAX_TRIES,
     )
-    async def _download(self, uri, bucket, **kwargs):
-        if "headers" in kwargs:
-            for key, value in kwargs["headers"].items():
-                kwargs[key] = value
-            del kwargs["headers"]
-        try:
-            return await self.storage._s3aioclient.get_object(
-                Bucket=bucket, Key=uri, **kwargs
+    async def _download(
+        self,
+        uri,
+        bucket,
+        range: Optional[Range] = None,
+    ):
+        range = range or Range()
+        if range.any():
+            coro = self.storage._s3aioclient.get_object(
+                Bucket=bucket, Key=uri, Range=range.to_header()
             )
+        else:
+            coro = self.storage._s3aioclient.get_object(Bucket=bucket, Key=uri)
+        try:
+            return await coro
         except botocore.exceptions.ClientError as e:
             error_code = parse_status_code(e)
             if error_code == 404:
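Note: on S3 the same Range value is forwarded to GetObject's Range parameter, which accepts the identical bytes=start-end syntax; when no range is requested the parameter is omitted entirely. A rough aiobotocore-style sketch of that call shape (client construction omitted; the helper and its names are placeholders):

```python
from typing import Optional


async def get_object_bytes(s3client, bucket: str, key: str, range_header: Optional[str] = None) -> bytes:
    # Mirrors S3StorageField._download: only pass Range when a range was requested.
    if range_header:
        resp = await s3client.get_object(Bucket=bucket, Key=key, Range=range_header)
    else:
        resp = await s3client.get_object(Bucket=bucket, Key=key)
    return await resp["Body"].read()


# e.g. await get_object_bytes(client, "my-bucket", "kb/file", range_header="bytes=0-1023")
```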
@@ -97,18 +104,16 @@ class S3StorageField(StorageField):
             else:
                 raise
 
-    async def iter_data(self, **kwargs):
+    async def iter_data(
+        self, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
         # Suports field and key based iter
         uri = self.field.uri if self.field else self.key
         if self.field is None:
             bucket = self.bucket
         else:
             bucket = self.field.bucket_name
-
-        downloader = await self._download(uri, bucket, **kwargs)
-
-        # we do not want to timeout ever from this...
-        # downloader['Body'].set_socket_timeout(999999)
+        downloader = await self._download(uri, bucket, range=range)
         stream = downloader["Body"]
         data = await stream.read(CHUNK_SIZE)
         while True:
@@ -117,13 +122,6 @@ class S3StorageField(StorageField):
             yield data
             data = await stream.read(CHUNK_SIZE)
 
-    async def read_range(self, start: int, end: int) -> AsyncGenerator[bytes, None]:
-        """
-        Iterate through ranges of data
-        """
-        async for chunk in self.iter_data(Range=f"bytes={start}-{end - 1}"):
-            yield chunk
-
     async def _abort_multipart(self):
         try:
             mpu = self.field.resumable_uri
@@ -296,18 +294,9 @@ class S3StorageField(StorageField):
 
         try:
             obj = await self.storage._s3aioclient.head_object(Bucket=bucket, Key=key)
-            if obj is not None:
-                metadata = obj.get("Metadata") or {}
-                size = metadata.get("size") or obj.get("ContentLength") or 0
-                content_type = (
-                    metadata.get("content_type") or obj.get("ContentType") or ""
-                )
-                filename = metadata.get("filename") or key.split("/")[-1]
-                return ObjectMetadata(
-                    size=int(size), content_type=content_type, filename=filename
-                )
-            else:
+            if obj is None:
                 return None
+            return parse_object_metadata(obj, key)
         except botocore.exceptions.ClientError as e:
             error_code = parse_status_code(e)
             if error_code == 404:
@@ -560,3 +549,21 @@ def parse_status_code(error: botocore.exceptions.ClientError) -> int:
             errors.capture_message(msg, "error", scope)
 
         raise UnparsableResponse(msg) from error
+
+
+def parse_object_metadata(obj: dict, key: str) -> ObjectMetadata:
+    custom_metadata = obj.get("Metadata") or {}
+    # Parse size
+    custom_size = custom_metadata.get("size")
+    if custom_size is None or custom_size == "0":
+        size = 0
+        content_lenght = obj.get("ContentLength")
+        if content_lenght is not None:
+            size = int(content_lenght)
+    else:
+        size = int(custom_size)
+    # Content type
+    content_type = custom_metadata.get("content_type") or obj.get("ContentType") or ""
+    # Filename
+    filename = custom_metadata.get("filename") or key.split("/")[-1]
+    return ObjectMetadata(size=size, content_type=content_type, filename=filename)
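Note: the S3 helper applies the same precedence as the GCS one, but without a key-lowercasing step (presumably because S3 already returns user metadata keys lowercased). A quick illustrative call against a head_object-shaped dict, with made-up values:

```python
from nucliadb_utils.storages.s3 import parse_object_metadata

fake_head_object = {
    "ContentLength": 1024,
    "ContentType": "text/plain",
    "Metadata": {"filename": "notes.txt", "size": "1024"},
}
meta = parse_object_metadata(fake_head_object, key="kb/resource/file/notes.txt")
assert (meta.filename, meta.size, meta.content_type) == ("notes.txt", 1024, "text/plain")
```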
nucliadb_utils/storages/storage.py

@@ -22,12 +22,12 @@ from __future__ import annotations
 import abc
 import hashlib
 import uuid
+from dataclasses import dataclass
 from io import BytesIO
 from typing import (
     Any,
     AsyncGenerator,
     AsyncIterator,
-    Dict,
     List,
     Optional,
     Tuple,
@@ -71,6 +71,23 @@ class ObjectMetadata(BaseModel):
     size: int
 
 
+@dataclass
+class Range:
+    """
+    Represents a range of bytes to be downloaded from a file. The range is inclusive.
+    The start and end values are 0-based.
+    """
+
+    start: Optional[int] = None
+    end: Optional[int] = None
+
+    def any(self) -> bool:
+        return self.start is not None or self.end is not None
+
+    def to_header(self) -> str:
+        return f"bytes={self.start or 0}-{self.end or ''}"
+
+
 class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     storage: Storage
     bucket: str
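Note: Range is the new transport-agnostic way to ask for a partial download; both ends are inclusive and 0-based, and either end may be left open. A few illustrative values (assuming the new package version is installed):

```python
from nucliadb_utils.storages.storage import Range

assert not Range().any()  # no range requested: the backends do a full download
assert Range(start=100, end=199).to_header() == "bytes=100-199"  # second 100 bytes
assert Range(start=1000).to_header() == "bytes=1000-"            # open-ended suffix
assert Range(end=499).to_header() == "bytes=0-499"               # first 500 bytes
```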
@@ -93,12 +110,9 @@ class StorageField(abc.ABC, metaclass=abc.ABCMeta):
     async def upload(self, iterator: AsyncIterator, origin: CloudFile) -> CloudFile: ...
 
     @abc.abstractmethod
-    async def iter_data(self, headers=None) -> AsyncGenerator[bytes, None]:  # type: ignore
-        raise NotImplementedError()
-        yield b""
-
-    @abc.abstractmethod
-    async def read_range(self, start: int, end: int) -> AsyncGenerator[bytes, None]:
+    async def iter_data(
+        self, range: Optional[Range] = None
+    ) -> AsyncGenerator[bytes, None]:
         raise NotImplementedError()
         yield b""
 
@@ -433,16 +447,16 @@ class Storage(abc.ABC, metaclass=abc.ABCMeta):
         return await destination.upload(safe_iterator, origin)
 
     async def download(
-        self, bucket: str, key: str, headers: Optional[Dict[str, str]] = None
+        self,
+        bucket: str,
+        key: str,
+        range: Optional[Range] = None,
     ):
         destination: StorageField = self.field_klass(
             storage=self, bucket=bucket, fullkey=key
         )
-        if headers is None:
-            headers = {}
-
         try:
-            async for data in destination.iter_data(headers=headers):
+            async for data in destination.iter_data(range=range):
                 yield data
         except KeyError:
             yield None
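Note: callers that previously passed a raw headers dict to Storage.download() now express partial reads with a Range value. A hedged usage sketch (how the storage utility is obtained is outside this diff; the helper below is illustrative):

```python
from nucliadb_utils.storages.storage import Range, Storage


async def first_mebibyte(storage: Storage, bucket: str, key: str) -> bytes:
    buf = b""
    # Range is inclusive, so 0..1048575 is exactly 1 MiB.
    async for chunk in storage.download(bucket, key, range=Range(start=0, end=1024 * 1024 - 1)):
        if chunk is None:  # download() yields None when the key does not exist
            break
        buf += chunk
    return buf
```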
nucliadb_utils/tests/fixtures.py

@@ -33,8 +33,8 @@ def lazy_storage_fixture():
         return [lazy_fixture.lf("gcs_storage")]
     elif backend == "s3":
         return [lazy_fixture.lf("s3_storage")]
-    elif backend == "pg":
-        return [lazy_fixture.lf("pg_storage")]
+    elif backend == "local":
+        return [lazy_fixture.lf("local_storage")]
     else:
         print(f"Unknown storage backend {backend}, using gcs")
         return [lazy_fixture.lf("gcs_storage")]
nucliadb_utils/utilities.py

@@ -138,17 +138,6 @@ async def get_storage(
         await gcsutil.initialize(service_name)
         logger.info("Configuring GCS Storage")
 
-    elif storage_settings.file_backend == FileBackendConfig.PG:
-        from nucliadb_utils.storages.pg import PostgresStorage
-
-        pgutil = PostgresStorage(
-            storage_settings.driver_pg_url,  # type: ignore
-            connection_pool_max_size=storage_settings.driver_pg_connection_pool_max_size,
-        )
-        set_utility(Utility.STORAGE, pgutil)
-        await pgutil.initialize()
-        logger.info("Configuring Postgres Storage")
-
    elif storage_settings.file_backend == FileBackendConfig.LOCAL:
         if storage_settings.local_files is None:
             raise ConfigurationError("LOCAL_FILES env var not configured")
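Note: with the PG member removed from FileBackendConfig and the corresponding branch dropped from get_storage, a deployment that still configures the "pg" file backend now fails when the configured string is parsed into the enum (at settings validation) instead of wiring up a Postgres-backed storage. A minimal sketch of that enum behaviour, mirroring the enum as it stands after this diff:

```python
from enum import Enum


class FileBackendConfig(Enum):
    GCS = "gcs"
    S3 = "s3"
    LOCAL = "local"
    NOT_SET = "notset"


FileBackendConfig("local")  # fine
FileBackendConfig("pg")     # raises ValueError: 'pg' is not a valid FileBackendConfig
```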
nucliadb_utils-4.0.3.post579.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nucliadb_utils
-Version: 4.0.3.post577
+Version: 4.0.3.post579
 Home-page: https://nuclia.com
 License: BSD
 Classifier: Development Status :: 4 - Beta
@@ -23,8 +23,8 @@ Requires-Dist: PyNaCl
 Requires-Dist: pyjwt >=2.4.0
 Requires-Dist: memorylru >=1.1.2
 Requires-Dist: mrflagly
-Requires-Dist: nucliadb-protos >=4.0.3.post577
-Requires-Dist: nucliadb-telemetry >=4.0.3.post577
+Requires-Dist: nucliadb-protos >=4.0.3.post579
+Requires-Dist: nucliadb-telemetry >=4.0.3.post579
 Provides-Extra: cache
 Requires-Dist: redis >=4.3.4 ; extra == 'cache'
 Requires-Dist: orjson >=3.6.7 ; extra == 'cache'
nucliadb_utils-4.0.3.post579.dist-info/RECORD

@@ -12,11 +12,11 @@ nucliadb_utils/nats.py,sha256=7hRKMflwxK-p_L0KFO5jibOGhzSw2F24mKvPG-A_iN8,8224
 nucliadb_utils/partition.py,sha256=0tmXuwRM_v-OmKoRkB--OMDzomVEkmgxqMmNNomI_24,1173
 nucliadb_utils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nucliadb_utils/run.py,sha256=bKMfsPEK6WdWfiPyWPUxCqcLo4tq6eOwyaf910TOwBk,1713
-nucliadb_utils/settings.py,sha256=Dd6h_BavMkt3e8qA3btZjt2si11x7tvpOC_WYMxqrDM,7252
+nucliadb_utils/settings.py,sha256=WVL2u_jCkm7Uf6a2njOZetHM_nU0hwDVhLqfH0k5Yi4,7238
 nucliadb_utils/signals.py,sha256=5r53hZvZmwgKdri5jHEjuHmiaq5TyusUUvjoq2uliIc,2704
 nucliadb_utils/store.py,sha256=kQ35HemE0v4_Qg6xVqNIJi8vSFAYQtwI3rDtMsNy62Y,890
 nucliadb_utils/transaction.py,sha256=CQpsuF-E2omh4gGMxXCn0dv7vL9ctxooWpSgWGbGfBA,7212
-nucliadb_utils/utilities.py,sha256=s5MVXDj4DTtc1VPFBRxMjud3HB0xkadyQ0f7QQLb0NM,14178
+nucliadb_utils/utilities.py,sha256=mbwU6BWoUNoTiWDeUqWK-VCtOM8ZFAyPz0SX7Q818PY,13724
 nucliadb_utils/audit/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb_utils/audit/audit.py,sha256=fmEVb6ahKrkGAY-GEy4_L4ccmcGM5YKl-Vs05260_cg,2834
 nucliadb_utils/audit/basic.py,sha256=8yL7HI9MnykSt7j4QbUeRBbBsTKFIIX6hppJ3ADVLdM,3430
@@ -40,24 +40,22 @@ nucliadb_utils/nuclia_usage/utils/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZ
 nucliadb_utils/nuclia_usage/utils/kb_usage_report.py,sha256=E1eUSFXBVNzQP9Q2rWj9y3koCO5S7iKwckny_AoLKuk,3870
 nucliadb_utils/storages/__init__.py,sha256=5Qc8AUWiJv9_JbGCBpAn88AIJhwDlm0OPQpg2ZdRL4U,872
 nucliadb_utils/storages/exceptions.py,sha256=n6aBOyurWMo8mXd1XY6Psgno4VfXJ9TRbxCy67c08-g,2417
-nucliadb_utils/storages/gcs.py,sha256=krBkNd7wkHhfIn3T-4QvYu1Rw-envYCa6G4G90oOjvM,27303
-nucliadb_utils/storages/local.py,sha256=JewYQ-fes9iUtUjlbHgWXrG1RsQWh16TJDunJnwfbTg,10447
+nucliadb_utils/storages/gcs.py,sha256=JcIL9gQ1YCXtNkuEhFciP_VcgyWcy4e4xuN01d2eZIg,27372
+nucliadb_utils/storages/local.py,sha256=nDrmWy1na96AS__hO3TQqsYMHnu0buwnfUGWfxCpWYU,10348
 nucliadb_utils/storages/nuclia.py,sha256=UfvRu92eqG1v-PE-UWH2x8KEJFqDqATMmUGFmEuqSSs,2097
-nucliadb_utils/storages/pg.py,sha256=DxXNwcstAFOTC6kaXlWp-b4WrvR8aSSOfgVJNDQ5oDI,18976
-nucliadb_utils/storages/s3.py,sha256=f2bjgmT6JRlUr5DHy3tRUip4kYSA1MzXfYrLNVUp_Cg,19447
+nucliadb_utils/storages/s3.py,sha256=RRbcYr4FE-Vfisr-zPoUN0Q_LfEHF-L2B0ggFuVsOwU,19500
 nucliadb_utils/storages/settings.py,sha256=ugCPy1zxBOmA2KosT-4tsjpvP002kg5iQyi42yCGCJA,1285
-nucliadb_utils/storages/storage.py,sha256=sR2Qvev6eLUvbH1WTXjqXIOnKRy1YMMx6Vsj0wZ2x8A,20585
+nucliadb_utils/storages/storage.py,sha256=KJ5VDYoZuRmiCFwfLj__tDOHIJWyQAMi-sOXCMoJv9w,20831
 nucliadb_utils/tests/__init__.py,sha256=Oo9CAE7B0eW5VHn8sHd6o30SQzOWUhktLPRXdlDOleA,1456
 nucliadb_utils/tests/asyncbenchmark.py,sha256=rN_NNDk4ras0qgFp0QlRyAi9ZU9xITdzxl2s5CigzBo,10698
-nucliadb_utils/tests/fixtures.py,sha256=ZvKaxZFMULC2Sbo0jSIuGxJW_cgVH_pNjhVYo9PbgyA,1665
+nucliadb_utils/tests/fixtures.py,sha256=j58fTvoWZClC52LX7QOvLXX9DS5QbytSnRp0F4nGzN8,1671
 nucliadb_utils/tests/gcs.py,sha256=1dbt_zG3uZPZDF3Nyrgrvi_bsKmafAUOm4Pu4bzt7wI,3098
 nucliadb_utils/tests/indexing.py,sha256=YW2QhkhO9Q_8A4kKWJaWSvXvyQ_AiAwY1VylcfVQFxk,1513
 nucliadb_utils/tests/local.py,sha256=c3gZJJWmvOftruJkIQIwB3q_hh3uxEhqGIAVWim1Bbk,1343
 nucliadb_utils/tests/nats.py,sha256=lgRe6YH9LSoI7XgcyKAC2VTSAtuu8EeMve0jWWC_kOY,7701
-nucliadb_utils/tests/pg.py,sha256=HBpvaNDs9T_L55tvJCJTPnsCDrB8ehI_9HRYz6SPWNE,1819
 nucliadb_utils/tests/s3.py,sha256=YB8QqDaBXxyhHonEHmeBbRRDmvB7sTOaKBSi8KBGokg,2330
-nucliadb_utils-4.0.3.post577.dist-info/METADATA,sha256=prh_IQhq0bc4fqgqeIUWgedwV_OyVzxFhH0_xSa1_Y0,2030
-nucliadb_utils-4.0.3.post577.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-nucliadb_utils-4.0.3.post577.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
-nucliadb_utils-4.0.3.post577.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-nucliadb_utils-4.0.3.post577.dist-info/RECORD,,
+nucliadb_utils-4.0.3.post579.dist-info/METADATA,sha256=5OY05fts98E0YRW0yVwWXMwjWbcJwxlw8KU50Q2FVNI,2030
+nucliadb_utils-4.0.3.post579.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+nucliadb_utils-4.0.3.post579.dist-info/top_level.txt,sha256=fE3vJtALTfgh7bcAWcNhcfXkNPp_eVVpbKK-2IYua3E,15
+nucliadb_utils-4.0.3.post579.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+nucliadb_utils-4.0.3.post579.dist-info/RECORD,,
nucliadb_utils/storages/pg.py (deleted)

@@ -1,617 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-from __future__ import annotations
-
-import asyncio
-import logging
-import uuid
-from typing import Any, AsyncGenerator, AsyncIterator, Optional, TypedDict
-
-import asyncpg
-from nucliadb_protos.resources_pb2 import CloudFile
-
-from nucliadb_utils.storages import CHUNK_SIZE
-from nucliadb_utils.storages.storage import (
-    ObjectInfo,
-    ObjectMetadata,
-    Storage,
-    StorageField,
-)
-
-logger = logging.getLogger(__name__)
-
-# Table design notes
-# - No foreign key constraints ON PURPOSE
-# - No cascade handling ON PURPOSE
-CREATE_TABLE = """
-CREATE TABLE IF NOT EXISTS kb_files (
-    kb_id TEXT,
-    file_id TEXT,
-    filename TEXT,
-    size INTEGER,
-    content_type TEXT,
-    PRIMARY KEY(kb_id, file_id)
-);
-
-CREATE TABLE IF NOT EXISTS kb_files_fileparts (
-    kb_id TEXT,
-    file_id TEXT,
-    part_id INTEGER,
-    size INTEGER,
-    data BYTEA,
-    PRIMARY KEY(kb_id, file_id, part_id)
-);
-"""
-
-
-class FileInfo(TypedDict):
-    filename: str
-    size: int
-    content_type: str
-    key: str
-
-
-class ChunkInfo(TypedDict):
-    part_id: int
-    size: int
-
-
-class Chunk(ChunkInfo):
-    data: bytes
-
-
-class PostgresFileDataLayer:
-    """
-    Responsible for interating with the database and
-    abstracting any sql and connection management.
-    """
-
-    def __init__(self, connection: asyncpg.Connection):
-        self.connection = connection
-
-    async def initialize_kb(self, kbid: str) -> bool:
-        # there's really no record keeping or init
-        # per kb that we care to do
-        return True
-
-    async def delete_kb(self, kbid: str) -> bool:
-        async with self.connection.transaction():
-            await self.connection.execute(
-                """
-                DELETE FROM kb_files
-                WHERE kb_id = $1
-                """,
-                kbid,
-            )
-            await self.connection.execute(
-                """
-                DELETE FROM kb_files_fileparts
-                WHERE kb_id = $1
-                """,
-                kbid,
-            )
-        return True
-
-    async def create_file(
-        self, *, kb_id: str, file_id: str, filename: str, size: int, content_type: str
-    ) -> None:
-        async with self.connection.transaction():
-            await self.connection.execute(
-                """
-                INSERT INTO kb_files (kb_id, file_id, filename, size, content_type)
-                VALUES ($1, $2, $3, $4, $5)
-                """,
-                kb_id,
-                file_id,
-                filename or "",
-                size,
-                content_type or "",
-            )
-
-    async def delete_file(self, kb_id: str, file_id: str) -> None:
-        async with self.connection.transaction():
-            await self.connection.execute(
-                """
-                DELETE FROM kb_files
-                WHERE kb_id = $1 AND file_id = $2
-                """,
-                kb_id,
-                file_id,
-            )
-            await self.connection.execute(
-                """
-                DELETE FROM kb_files_fileparts
-                WHERE kb_id = $1 AND file_id = $2
-                """,
-                kb_id,
-                file_id,
-            )
-
-    async def append_chunk(self, *, kb_id: str, file_id: str, data: bytes) -> None:
-        async with self.connection.transaction():
-            await self.connection.execute(
-                """
-                INSERT INTO kb_files_fileparts (kb_id, file_id, part_id, data, size)
-                VALUES (
-                    $1, $2,
-                    (
-                        SELECT COALESCE(MAX(part_id), 0) + 1
-                        FROM kb_files_fileparts WHERE kb_id = $1 AND file_id = $2
-                    ),
-                    $3, $4)
-                """,
-                kb_id,
-                file_id,
-                data,
-                len(data),
-            )
-
-    async def get_file_info(self, kb_id: str, file_id: str) -> Optional[FileInfo]:
-        record = await self.connection.fetchrow(
-            """
-            SELECT filename, size, content_type, file_id
-            FROM kb_files
-            WHERE kb_id = $1 AND file_id = $2
-            """,
-            kb_id,
-            file_id,
-        )
-        if record is None:
-            return None
-        return FileInfo(
-            filename=record["filename"],
-            size=record["size"],
-            content_type=record["content_type"],
-            key=record["file_id"],
-        )
-
-    async def move(
-        self,
-        *,
-        origin_key: str,
-        destination_key: str,
-        origin_kb: str,
-        destination_kb: str,
-    ):
-        async with self.connection.transaction():
-            # make sure to delete the destination first in
-            # case this is an overwrite of an existing
-            await self.connection.execute(
-                """
-                delete from kb_files
-                WHERE kb_id = $1 AND file_id = $2
-                """,
-                destination_kb,
-                destination_key,
-            )
-            await self.connection.execute(
-                """
-                UPDATE kb_files
-                SET kb_id = $1, file_id = $2
-                WHERE kb_id = $3 AND file_id = $4
-                """,
-                destination_kb,
-                destination_key,
-                origin_kb,
-                origin_key,
-            )
-            # make sure to delete the destination first in
-            # case this is an overwrite of an existing
-            await self.connection.execute(
-                """
-                delete from kb_files_fileparts
-                WHERE kb_id = $1 AND file_id = $2
-                """,
-                destination_kb,
-                destination_key,
-            )
-            await self.connection.execute(
-                """
-                UPDATE kb_files_fileparts
-                SET kb_id = $1, file_id = $2
-                WHERE kb_id = $3 AND file_id = $4
-                """,
-                destination_kb,
-                destination_key,
-                origin_kb,
-                origin_key,
-            )
-
-    async def copy(
-        self,
-        *,
-        origin_key: str,
-        destination_key: str,
-        origin_kb: str,
-        destination_kb: str,
-    ):
-        async with self.connection.transaction():
-            await self.connection.execute(
-                """
-                INSERT INTO kb_files (kb_id, file_id, filename, size, content_type)
-                SELECT $1, $2, filename, size, content_type
-                FROM kb_files
-                WHERE kb_id = $3 AND file_id = $4
-                """,
-                destination_kb,
-                destination_key,
-                origin_kb,
-                origin_key,
-            )
-
-            await self.connection.execute(
-                """
-                INSERT INTO kb_files_fileparts (kb_id, file_id, part_id, data, size)
-                SELECT $1, $2, part_id, data, size
-                FROM kb_files_fileparts
-                WHERE kb_id = $3 AND file_id = $4
-                """,
-                destination_kb,
-                destination_key,
-                origin_kb,
-                origin_key,
-            )
-
-    async def get_chunks_info(
-        self, bucket: str, key: str, part_ids: Optional[list[int]] = None
-    ) -> list[ChunkInfo]:
-        query = """
-        select kb_id, file_id, part_id, size
-        from kb_files_fileparts
-        where kb_id = $1 and file_id = $2
-        """
-        args: list[Any] = [bucket, key]
-        if part_ids is not None:
-            query += " and part_id = ANY($3)"
-            args.append(part_ids)
-        query += " order by part_id"
-        chunks = await self.connection.fetch(query, *args)
-        return [
-            ChunkInfo(
-                part_id=chunk["part_id"],
-                size=chunk["size"],
-            )
-            for chunk in chunks
-        ]
-
-    async def iterate_kb(
-        self, bucket: str, prefix: Optional[str] = None
-    ) -> AsyncGenerator[FileInfo, None]:
-        query = """
-        SELECT filename, size, content_type, file_id
-        FROM kb_files
-        WHERE kb_id = $1
-        """
-        args: list[Any] = [bucket]
-        if prefix:
-            query += " AND filename LIKE $2"
-            args.append(prefix + "%")
-        async with self.connection.transaction():
-            async for record in self.connection.cursor(query, *args):
-                yield FileInfo(
-                    filename=record["filename"],
-                    size=record["size"],
-                    content_type=record["content_type"],
-                    key=record["file_id"],
-                )
-
-    async def iterate_chunks(
-        self, bucket: str, key: str, part_ids: Optional[list[int]] = None
-    ) -> AsyncIterator[Chunk]:
-        chunks = await self.get_chunks_info(bucket, key, part_ids=part_ids)
-        for chunk in chunks:
-            # who knows how long a download for one of these chunks could be,
-            # so let's not try to keep a txn or cursor open.
-            data_chunk = await self.connection.fetchrow(
-                """
-                select data
-                from kb_files_fileparts
-                where kb_id = $1 and file_id = $2 and part_id = $3
-                """,
-                bucket,
-                key,
-                chunk["part_id"],
-            )
-            yield Chunk(
-                part_id=chunk["part_id"],
-                size=chunk["size"],
-                data=data_chunk["data"],
-            )
-
-    async def iterate_range(
-        self, *, kb_id: str, file_id: str, start: int, end: int
-    ) -> AsyncIterator[bytes]:
-        chunks = await self.get_chunks_info(
-            kb_id,
-            file_id,
-        )
-
-        # First off, find start part and position
-        elapsed = 0
-        start_part_id = None
-        start_pos = -1
-        for chunk in chunks:
-            if elapsed + chunk["size"] > start:
-                start_part_id = chunk["part_id"]
-                start_pos = start - elapsed
-                break
-            else:
-                elapsed += chunk["size"]
-
-        if start_part_id is None:
-            return
-
-        # Now, iterate through the chunks and yield the data
-        read_bytes = 0
-        while read_bytes < end - start:
-            data_chunk = await self.connection.fetchrow(
-                """
-                select data
-                from kb_files_fileparts
-                where kb_id = $1 and file_id = $2 and part_id = $3
-                """,
-                kb_id,
-                file_id,
-                start_part_id,
-            )
-            if data_chunk is None:
-                return
-
-            data = data_chunk["data"][
-                start_pos : min(
-                    start_pos + ((end - start) - read_bytes), len(data_chunk["data"])
-                )
-            ]
-            read_bytes += len(data)
-            yield data
-            start_pos = 0
-            start_part_id += 1
-
-
-class PostgresStorageField(StorageField):
-    storage: PostgresStorage
-
-    async def move(
-        self,
-        origin_uri: str,
-        destination_uri: str,
-        origin_bucket_name: str,
-        destination_bucket_name: str,
-    ):
-        async with self.storage.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            return await dl.move(
-                origin_key=origin_uri,
-                destination_key=destination_uri,
-                origin_kb=origin_bucket_name,
-                destination_kb=destination_bucket_name,
-            )
-
-    async def copy(
-        self,
-        origin_uri: str,
-        destination_uri: str,
-        origin_bucket_name: str,
-        destination_bucket_name: str,
-    ):
-        async with self.storage.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            return await dl.copy(
-                origin_key=origin_uri,
-                destination_key=destination_uri,
-                origin_kb=origin_bucket_name,
-                destination_kb=destination_bucket_name,
-            )
-
-    async def iter_data(self, headers=None):
-        key = self.field.uri if self.field else self.key
-        if self.field is None:
-            bucket = self.bucket
-        else:
-            bucket = self.field.bucket_name
-
-        async with self.storage.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            async for chunk in dl.iterate_chunks(bucket, key):
-                yield chunk["data"]
-
-    async def read_range(self, start: int, end: int) -> AsyncGenerator[bytes, None]:
-        """
-        Iterate through ranges of data
-        """
-        key = self.field.uri if self.field else self.key
-        if self.field is None:
-            bucket = self.bucket
-        else:
-            bucket = self.field.bucket_name
-
-        async with self.storage.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            async for data in dl.iterate_range(
-                kb_id=bucket, file_id=key, start=start, end=end
-            ):
-                yield data
-
-    async def start(self, cf: CloudFile) -> CloudFile:
-        field = CloudFile(
-            filename=cf.filename,
-            size=cf.size,
-            md5=cf.md5,
-            content_type=cf.content_type,
-            bucket_name=self.bucket,
-            source=CloudFile.POSTGRES,
-        )
-        upload_uri = uuid.uuid4().hex
-
-        async with self.storage.pool.acquire() as conn:
-            async with conn.transaction():
-                dl = PostgresFileDataLayer(conn)
-
-                if self.field is not None and self.field.upload_uri != "":
-                    # If there is a temporal url
-                    await dl.delete_file(self.field.bucket_name, self.field.upload_uri)
-
-                await dl.create_file(
-                    kb_id=self.bucket,
-                    file_id=upload_uri,
-                    filename=cf.filename,
-                    size=cf.size,
-                    content_type=cf.content_type,
-                )
-
-        field.offset = 0
-        field.upload_uri = upload_uri
-        return field
-
-    async def append(self, cf: CloudFile, iterable: AsyncIterator) -> int:
-        if self.field is None:
-            raise AttributeError()
-        count = 0
-        async with self.storage.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            async for chunk in iterable:
-                await dl.append_chunk(
-                    kb_id=self.bucket,
-                    file_id=cf.upload_uri or self.field.upload_uri,
-                    data=chunk,
-                )
-                size = len(chunk)
-                count += size
-                self.field.offset += len(chunk)
-        return count
-
-    async def finish(self):
-        async with self.storage.pool.acquire() as conn, conn.transaction():
-            dl = PostgresFileDataLayer(conn)
-            if self.field.old_uri not in ("", None):
-                # Already has a file
-                await dl.delete_file(self.bucket, self.field.uri)
-
-            if self.field.upload_uri != self.key:
-                try:
-                    await dl.move(
-                        origin_key=self.field.upload_uri,
-                        destination_key=self.key,
-                        origin_kb=self.field.bucket_name,
-                        destination_kb=self.bucket,
-                    )
-                except Exception:
-                    logger.exception(
-                        f"Error moving file {self.field.bucket_name}://{self.field.upload_uri} -> {self.bucket}://{self.key}"  # noqa
-                    )
-                    raise
-
-        self.field.uri = self.key
-        self.field.ClearField("offset")
-        self.field.ClearField("upload_uri")
-
-    async def exists(self) -> Optional[ObjectMetadata]:
-        async with self.storage.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            file_info = await dl.get_file_info(self.bucket, self.key)
-            if file_info is None:
-                return None
-            return ObjectMetadata(
-                filename=file_info["filename"],
-                size=file_info["size"],
-                content_type=file_info["content_type"],
-            )
-
-    async def upload(self, iterator: AsyncIterator, origin: CloudFile) -> CloudFile:
-        self.field = await self.start(origin)
-        await self.append(origin, iterator)
-        await self.finish()
-        return self.field
-
-    def __repr__(self):
-        return f"{self.storage.source}: {self.bucket}/{self.key}"
-
-
-class PostgresStorage(Storage):
-    field_klass = PostgresStorageField
-    chunk_size = CHUNK_SIZE
-    pool: asyncpg.pool.Pool
-
-    def __init__(self, dsn: str, connection_pool_max_size: int = 10):
-        self.dsn = dsn
-        self.connection_pool_max_size = connection_pool_max_size
-        self.source = CloudFile.POSTGRES
-        self._lock = asyncio.Lock()
-        self.initialized = False
-
-    async def initialize(self):
-        async with self._lock:
-            if self.initialized is False:
-                self.pool = await asyncpg.create_pool(
-                    self.dsn,
-                    max_size=self.connection_pool_max_size,
-                )
-
-                # check if table exists
-                try:
-                    async with self.pool.acquire() as conn:
-                        await conn.execute(CREATE_TABLE)
-                except asyncpg.exceptions.UniqueViolationError:  # pragma: no cover
-                    pass
-
-                self.initialized = True
-
-    async def finalize(self):
-        async with self._lock:
-            await self.pool.close()
-            self.initialized = False
-
-    def get_bucket_name(self, kbid: str):
-        return kbid
-
-    async def create_kb(self, kbid: str):
-        async with self.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            return await dl.initialize_kb(kbid)
-
-    async def delete_kb(self, kbid: str) -> tuple[bool, bool]:
-        async with self.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            return await dl.delete_kb(kbid), False
-
-    async def delete_upload(self, uri: str, bucket_name: str):
-        async with self.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            await dl.delete_file(bucket_name, uri)
-
-    async def schedule_delete_kb(self, kbid: str) -> bool:
-        await self.delete_kb(kbid)
-        return True
-
-    async def iterate_objects(
-        self, bucket: str, prefix: str
-    ) -> AsyncGenerator[ObjectInfo, None]:
-        async with self.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            async for file_data in dl.iterate_kb(bucket, prefix):
-                yield ObjectInfo(name=file_data["key"])
-
-    async def download(
-        self, bucket_name: str, key: str, headers: Optional[dict[str, str]] = None
-    ) -> AsyncIterator[bytes]:
-        async with self.pool.acquire() as conn:
-            dl = PostgresFileDataLayer(conn)
-            async for chunk in dl.iterate_chunks(bucket_name, key):
-                yield chunk["data"]
nucliadb_utils/tests/pg.py (deleted)

@@ -1,57 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-import asyncpg
-import pytest
-from pytest_docker_fixtures import images  # type: ignore
-
-from nucliadb_utils.storages.pg import PostgresStorage
-from nucliadb_utils.store import MAIN
-from nucliadb_utils.utilities import Utility
-
-images.settings["postgresql"].update(
-    {
-        "version": "16.1",
-        "env": {
-            "POSTGRES_PASSWORD": "postgres",
-            "POSTGRES_DB": "postgres",
-            "POSTGRES_USER": "postgres",
-        },
-    }
-)
-
-
-@pytest.fixture(scope="function")
-async def pg_storage(pg):
-    dsn = f"postgresql://postgres:postgres@{pg[0]}:{pg[1]}/postgres"
-    storage = PostgresStorage(dsn)
-    MAIN[Utility.STORAGE] = storage
-    conn = await asyncpg.connect(dsn)
-    await conn.execute(
-        """
-        DROP table IF EXISTS kb_files;
-        DROP table IF EXISTS kb_files_fileparts;
-        """
-    )
-    await conn.close()
-    await storage.initialize()
-    yield storage
-    await storage.finalize()
-    if Utility.STORAGE in MAIN:
-        del MAIN[Utility.STORAGE]