PyPI - nucliadb - Versions diffs - 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl - Mend

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

migrations/0023_backfill_pg_catalog.py +2 -2
migrations/0029_backfill_field_status.py +3 -4
migrations/0032_remove_old_relations.py +2 -3
migrations/0038_backfill_catalog_field_labels.py +2 -2
migrations/0039_backfill_converation_splits_metadata.py +2 -2
migrations/0041_reindex_conversations.py +137 -0
migrations/pg/0010_shards_index.py +34 -0
nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
migrations/pg/0012_catalog_statistics_undo.py +26 -0
nucliadb/backups/create.py +2 -15
nucliadb/backups/restore.py +4 -15
nucliadb/backups/tasks.py +4 -1
nucliadb/common/back_pressure/cache.py +2 -3
nucliadb/common/back_pressure/materializer.py +7 -13
nucliadb/common/back_pressure/settings.py +6 -6
nucliadb/common/back_pressure/utils.py +1 -0
nucliadb/common/cache.py +9 -9
nucliadb/common/catalog/interface.py +12 -12
nucliadb/common/catalog/pg.py +41 -29
nucliadb/common/catalog/utils.py +3 -3
nucliadb/common/cluster/manager.py +5 -4
nucliadb/common/cluster/rebalance.py +483 -114
nucliadb/common/cluster/rollover.py +25 -9
nucliadb/common/cluster/settings.py +3 -8
nucliadb/common/cluster/utils.py +34 -8
nucliadb/common/context/__init__.py +7 -8
nucliadb/common/context/fastapi.py +1 -2
nucliadb/common/datamanagers/__init__.py +2 -4
nucliadb/common/datamanagers/atomic.py +4 -2
nucliadb/common/datamanagers/cluster.py +1 -2
nucliadb/common/datamanagers/fields.py +3 -4
nucliadb/common/datamanagers/kb.py +6 -6
nucliadb/common/datamanagers/labels.py +2 -3
nucliadb/common/datamanagers/resources.py +10 -33
nucliadb/common/datamanagers/rollover.py +5 -7
nucliadb/common/datamanagers/search_configurations.py +1 -2
nucliadb/common/datamanagers/synonyms.py +1 -2
nucliadb/common/datamanagers/utils.py +4 -4
nucliadb/common/datamanagers/vectorsets.py +4 -4
nucliadb/common/external_index_providers/base.py +32 -5
nucliadb/common/external_index_providers/manager.py +4 -5
nucliadb/common/filter_expression.py +128 -40
nucliadb/common/http_clients/processing.py +12 -23
nucliadb/common/ids.py +6 -4
nucliadb/common/locking.py +1 -2
nucliadb/common/maindb/driver.py +9 -8
nucliadb/common/maindb/local.py +5 -5
nucliadb/common/maindb/pg.py +9 -8
nucliadb/common/nidx.py +3 -4
nucliadb/export_import/datamanager.py +4 -3
nucliadb/export_import/exporter.py +11 -19
nucliadb/export_import/importer.py +13 -6
nucliadb/export_import/tasks.py +2 -0
nucliadb/export_import/utils.py +6 -18
nucliadb/health.py +2 -2
nucliadb/ingest/app.py +8 -8
nucliadb/ingest/consumer/consumer.py +8 -10
nucliadb/ingest/consumer/pull.py +3 -8
nucliadb/ingest/consumer/service.py +3 -3
nucliadb/ingest/consumer/utils.py +1 -1
nucliadb/ingest/fields/base.py +28 -49
nucliadb/ingest/fields/conversation.py +12 -12
nucliadb/ingest/fields/exceptions.py +1 -2
nucliadb/ingest/fields/file.py +22 -8
nucliadb/ingest/fields/link.py +7 -7
nucliadb/ingest/fields/text.py +2 -3
nucliadb/ingest/orm/brain_v2.py +78 -64
nucliadb/ingest/orm/broker_message.py +2 -4
nucliadb/ingest/orm/entities.py +10 -209
nucliadb/ingest/orm/index_message.py +4 -4
nucliadb/ingest/orm/knowledgebox.py +18 -27
nucliadb/ingest/orm/processor/auditing.py +1 -3
nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
nucliadb/ingest/orm/processor/processor.py +27 -27
nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
nucliadb/ingest/orm/resource.py +72 -70
nucliadb/ingest/orm/utils.py +1 -1
nucliadb/ingest/processing.py +17 -17
nucliadb/ingest/serialize.py +202 -145
nucliadb/ingest/service/writer.py +3 -109
nucliadb/ingest/settings.py +3 -4
nucliadb/ingest/utils.py +1 -2
nucliadb/learning_proxy.py +11 -11
nucliadb/metrics_exporter.py +5 -4
nucliadb/middleware/__init__.py +82 -1
nucliadb/migrator/datamanager.py +3 -4
nucliadb/migrator/migrator.py +1 -2
nucliadb/migrator/models.py +1 -2
nucliadb/migrator/settings.py +1 -2
nucliadb/models/internal/augment.py +614 -0
nucliadb/models/internal/processing.py +19 -19
nucliadb/openapi.py +2 -2
nucliadb/purge/__init__.py +3 -8
nucliadb/purge/orphan_shards.py +1 -2
nucliadb/reader/__init__.py +5 -0
nucliadb/reader/api/models.py +6 -13
nucliadb/reader/api/v1/download.py +59 -38
nucliadb/reader/api/v1/export_import.py +4 -4
nucliadb/reader/api/v1/learning_config.py +24 -4
nucliadb/reader/api/v1/resource.py +61 -9
nucliadb/reader/api/v1/services.py +18 -14
nucliadb/reader/app.py +3 -1
nucliadb/reader/reader/notifications.py +1 -2
nucliadb/search/api/v1/__init__.py +2 -0
nucliadb/search/api/v1/ask.py +3 -4
nucliadb/search/api/v1/augment.py +585 -0
nucliadb/search/api/v1/catalog.py +11 -15
nucliadb/search/api/v1/find.py +16 -22
nucliadb/search/api/v1/hydrate.py +25 -25
nucliadb/search/api/v1/knowledgebox.py +1 -2
nucliadb/search/api/v1/predict_proxy.py +1 -2
nucliadb/search/api/v1/resource/ask.py +7 -7
nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
nucliadb/search/api/v1/resource/search.py +9 -11
nucliadb/search/api/v1/retrieve.py +130 -0
nucliadb/search/api/v1/search.py +28 -32
nucliadb/search/api/v1/suggest.py +11 -14
nucliadb/search/api/v1/summarize.py +1 -2
nucliadb/search/api/v1/utils.py +2 -2
nucliadb/search/app.py +3 -2
nucliadb/search/augmentor/__init__.py +21 -0
nucliadb/search/augmentor/augmentor.py +232 -0
nucliadb/search/augmentor/fields.py +704 -0
nucliadb/search/augmentor/metrics.py +24 -0
nucliadb/search/augmentor/paragraphs.py +334 -0
nucliadb/search/augmentor/resources.py +238 -0
nucliadb/search/augmentor/utils.py +33 -0
nucliadb/search/lifecycle.py +3 -1
nucliadb/search/predict.py +24 -17
nucliadb/search/predict_models.py +8 -9
nucliadb/search/requesters/utils.py +11 -10
nucliadb/search/search/cache.py +19 -23
nucliadb/search/search/chat/ask.py +88 -59
nucliadb/search/search/chat/exceptions.py +3 -5
nucliadb/search/search/chat/fetcher.py +201 -0
nucliadb/search/search/chat/images.py +6 -4
nucliadb/search/search/chat/old_prompt.py +1375 -0
nucliadb/search/search/chat/parser.py +510 -0
nucliadb/search/search/chat/prompt.py +563 -615
nucliadb/search/search/chat/query.py +449 -36
nucliadb/search/search/chat/rpc.py +85 -0
nucliadb/search/search/fetch.py +3 -4
nucliadb/search/search/filters.py +8 -11
nucliadb/search/search/find.py +33 -31
nucliadb/search/search/find_merge.py +124 -331
nucliadb/search/search/graph_strategy.py +14 -12
nucliadb/search/search/hydrator/__init__.py +3 -152
nucliadb/search/search/hydrator/fields.py +92 -50
nucliadb/search/search/hydrator/images.py +7 -7
nucliadb/search/search/hydrator/paragraphs.py +42 -26
nucliadb/search/search/hydrator/resources.py +20 -16
nucliadb/search/search/ingestion_agents.py +5 -5
nucliadb/search/search/merge.py +90 -94
nucliadb/search/search/metrics.py +10 -9
nucliadb/search/search/paragraphs.py +7 -9
nucliadb/search/search/predict_proxy.py +13 -9
nucliadb/search/search/query.py +14 -86
nucliadb/search/search/query_parser/fetcher.py +51 -82
nucliadb/search/search/query_parser/models.py +19 -20
nucliadb/search/search/query_parser/old_filters.py +20 -19
nucliadb/search/search/query_parser/parsers/ask.py +4 -5
nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
nucliadb/search/search/query_parser/parsers/common.py +5 -6
nucliadb/search/search/query_parser/parsers/find.py +6 -26
nucliadb/search/search/query_parser/parsers/graph.py +13 -23
nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
nucliadb/search/search/query_parser/parsers/search.py +15 -53
nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
nucliadb/search/search/rank_fusion.py +18 -13
nucliadb/search/search/rerankers.py +5 -6
nucliadb/search/search/retrieval.py +300 -0
nucliadb/search/search/summarize.py +5 -6
nucliadb/search/search/utils.py +3 -4
nucliadb/search/settings.py +1 -2
nucliadb/standalone/api_router.py +1 -1
nucliadb/standalone/app.py +4 -3
nucliadb/standalone/auth.py +5 -6
nucliadb/standalone/lifecycle.py +2 -2
nucliadb/standalone/run.py +2 -4
nucliadb/standalone/settings.py +5 -6
nucliadb/standalone/versions.py +3 -4
nucliadb/tasks/consumer.py +13 -8
nucliadb/tasks/models.py +2 -1
nucliadb/tasks/producer.py +3 -3
nucliadb/tasks/retries.py +8 -7
nucliadb/train/api/utils.py +1 -3
nucliadb/train/api/v1/shards.py +1 -2
nucliadb/train/api/v1/trainset.py +1 -2
nucliadb/train/app.py +1 -1
nucliadb/train/generator.py +4 -4
nucliadb/train/generators/field_classifier.py +2 -2
nucliadb/train/generators/field_streaming.py +6 -6
nucliadb/train/generators/image_classifier.py +2 -2
nucliadb/train/generators/paragraph_classifier.py +2 -2
nucliadb/train/generators/paragraph_streaming.py +2 -2
nucliadb/train/generators/question_answer_streaming.py +2 -2
nucliadb/train/generators/sentence_classifier.py +2 -2
nucliadb/train/generators/token_classifier.py +3 -2
nucliadb/train/generators/utils.py +6 -5
nucliadb/train/nodes.py +3 -3
nucliadb/train/resource.py +6 -8
nucliadb/train/settings.py +3 -4
nucliadb/train/types.py +11 -11
nucliadb/train/upload.py +3 -2
nucliadb/train/uploader.py +1 -2
nucliadb/train/utils.py +1 -2
nucliadb/writer/api/v1/export_import.py +4 -1
nucliadb/writer/api/v1/field.py +7 -11
nucliadb/writer/api/v1/knowledgebox.py +3 -4
nucliadb/writer/api/v1/resource.py +9 -20
nucliadb/writer/api/v1/services.py +10 -132
nucliadb/writer/api/v1/upload.py +73 -72
nucliadb/writer/app.py +8 -2
nucliadb/writer/resource/basic.py +12 -15
nucliadb/writer/resource/field.py +7 -5
nucliadb/writer/resource/origin.py +7 -0
nucliadb/writer/settings.py +2 -3
nucliadb/writer/tus/__init__.py +2 -3
nucliadb/writer/tus/azure.py +1 -3
nucliadb/writer/tus/dm.py +3 -3
nucliadb/writer/tus/exceptions.py +3 -4
nucliadb/writer/tus/gcs.py +5 -6
nucliadb/writer/tus/s3.py +2 -3
nucliadb/writer/tus/storage.py +3 -3
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
nucliadb/common/datamanagers/entities.py +0 -139
nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
{nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/common/maindb/pg.py CHANGED Viewed

@@ -21,8 +21,9 @@ from __future__ import annotations
 import asyncio
 import logging
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import Any, AsyncGenerator, Optional
+from typing import Any
 import backoff
 import psycopg
@@ -72,7 +73,7 @@ class DataLayer:
         self.connection = connection
         self.log_on_select_for_update = settings.driver_pg_log_on_select_for_update
-    async def get(self, key: str, select_for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, select_for_update: bool = False) -> bytes | None:
         with pg_observer({"type": "get"}):
             statement = "SELECT value FROM resources WHERE key = %s"
             if select_for_update:
@@ -116,7 +117,7 @@ class DataLayer:
             async with self.connection.cursor() as cur:
                 await cur.execute("DELETE FROM resources WHERE key LIKE %s", (prefix + "%",))
-    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[bytes | None]:
         with pg_observer({"type": "batch_get"}):
             async with self.connection.cursor() as cur:
                 statement = "SELECT key, value FROM resources WHERE key = ANY(%s)"
@@ -134,7 +135,7 @@ class DataLayer:
         prefix: str,
         limit: int = DEFAULT_SCAN_LIMIT,
         include_start: bool = True,
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         query = "SELECT key FROM resources WHERE key LIKE %s ORDER BY key"
         args: list[Any] = [prefix + "%"]
@@ -190,7 +191,7 @@ class PGTransaction(Transaction):
     async def batch_get(self, keys: list[str], for_update: bool = True):
         return await self.data_layer.batch_get(keys, select_for_update=for_update)
-    async def get(self, key: str, for_update: bool = True) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = True) -> bytes | None:
         return await self.data_layer.get(key, select_for_update=for_update)
     async def set(self, key: str, value: bytes):
@@ -243,7 +244,7 @@ class ReadOnlyPGTransaction(Transaction):
             return await DataLayer(conn).batch_get(keys, select_for_update=False)
     @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         async with self.driver._get_connection() as conn:
             return await DataLayer(conn).get(key, select_for_update=False)
@@ -330,7 +331,7 @@ class PGDriver(Driver):
             metric.set(value)
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         if read_only:
             yield ReadOnlyPGTransaction(self)
         else:
@@ -343,7 +344,7 @@ class PGDriver(Driver):
                         await txn.abort()
     @asynccontextmanager
-    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection, None]:
+    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection]:
         timeout = self.acquire_timeout_ms / 1000
         # Manual retry loop since backoff.on_exception does not play well with async context managers
         retries = 0

nucliadb/common/nidx.py CHANGED Viewed

@@ -19,7 +19,6 @@
 #
 import os
-from typing import Optional, Union
 from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
 from nidx_protos.nodewriter_pb2 import (
@@ -54,7 +53,7 @@ class NidxUtility:
         pass
-def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
+def _storage_config(prefix: str, bucket: str | None) -> dict[str, str]:
     config = {}
     if storage_settings.file_backend == FileBackendConfig.LOCAL:
         local_bucket = bucket or storage_settings.local_indexing_bucket
@@ -161,7 +160,7 @@ class NidxNatsIndexer:
     async def index(self, writer: IndexMessage) -> int:
         res = await self.nats_connection_manager.js.publish(self.subject, writer.SerializeToString())
         logger.info(
-            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid}  seqid: {res.seq}"  # noqa
+            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid}  seqid: {res.seq}"
         )
         return res.seq
@@ -185,7 +184,7 @@ class NidxGrpcIndexer:
 class NidxServiceUtility(NidxUtility):
     """Implements Nidx utility connecting to the network service"""
-    indexer: Union[NidxNatsIndexer, NidxGrpcIndexer]
+    indexer: NidxNatsIndexer | NidxGrpcIndexer
     def __init__(self, service_name: str):
         self.service_name = service_name

nucliadb/export_import/datamanager.py CHANGED Viewed

@@ -18,8 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import json
+from collections.abc import AsyncGenerator
 from datetime import datetime, timezone
-from typing import AsyncGenerator, Type, Union, cast
+from typing import Type, cast
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.export_import import logger
@@ -34,7 +35,7 @@ MAINDB_IMPORT_KEY = "/kbs/{kbid}/imports/{id}"
 STORAGE_EXPORT_KEY = "exports/{export_id}"
 STORAGE_IMPORT_KEY = "imports/{import_id}"
-Metadata = Union[ExportMetadata, ImportMetadata]
+Metadata = ExportMetadata | ImportMetadata
 class ExportImportDataManager:
@@ -59,7 +60,7 @@ class ExportImportDataManager:
         if data is None or data == b"":
             raise MetadataNotFound()
         decoded = data.decode("utf-8")
-        model_type: Union[Type[ExportMetadata], Type[ImportMetadata]]
+        model_type: Type[ExportMetadata] | Type[ImportMetadata]
         if type == "export":
             model_type = ExportMetadata
         elif type == "import":

nucliadb/export_import/exporter.py CHANGED Viewed

@@ -18,11 +18,12 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import AsyncGenerator, Optional
+from collections.abc import AsyncGenerator
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ExportMetadata,
@@ -33,7 +34,6 @@ from nucliadb.export_import.utils import (
     download_binary,
     get_broker_message,
     get_cloud_files,
-    get_entities,
     get_labels,
     get_learning_config,
     iter_kb_resource_uuids,
@@ -43,7 +43,7 @@ from nucliadb_telemetry import errors
 async def export_kb(
-    context: ApplicationContext, kbid: str, metadata: Optional[ExportMetadata] = None
+    context: ApplicationContext, kbid: str, metadata: ExportMetadata | None = None
 ) -> AsyncGenerator[bytes, None]:
     """Export the data of a knowledgebox to a stream of bytes.
@@ -63,9 +63,6 @@ async def export_kb(
     async for chunk in resources_iterator:
         yield chunk
-    async for chunk in export_entities(context, kbid):
-        yield chunk
     async for chunk in export_labels(context, kbid):
         yield chunk
@@ -76,7 +73,14 @@ async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMe
     """
     kbid, export_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    try:
+        metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Export metadata not found. Skipping export.", extra={"kbid": kbid, "export_id": export_id}
+        )
+        return
     iterator = export_kb(context, kbid, metadata)
     retry_handler = TaskRetryHandler("export", dm, metadata)
@@ -167,18 +171,6 @@ async def export_resource_with_binaries(
     yield bm_bytes
-async def export_entities(
-    context: ApplicationContext,
-    kbid: str,
-) -> AsyncGenerator[bytes, None]:
-    entities = await get_entities(context, kbid)
-    if len(entities.entities_groups) > 0:
-        data = entities.SerializeToString()
-        yield ExportedItemType.ENTITIES.encode("utf-8")
-        yield len(data).to_bytes(4, byteorder="big")
-        yield data
 async def export_labels(
     context: ApplicationContext,
     kbid: str,

nucliadb/export_import/importer.py CHANGED Viewed

@@ -17,11 +17,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import AsyncGenerator, Callable, Optional, cast
+from collections.abc import AsyncGenerator, Callable
+from typing import cast
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ImportMetadata,
@@ -32,7 +34,6 @@ from nucliadb.export_import.utils import (
     TaskRetryHandler,
     import_binary,
     restore_broker_message,
-    set_entities_groups,
     set_labels,
 )
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -46,7 +47,7 @@ async def import_kb(
     context: ApplicationContext,
     kbid: str,
     stream: AsyncGenerator[bytes, None],
-    metadata: Optional[ImportMetadata] = None,
+    metadata: ImportMetadata | None = None,
 ) -> None:
     """
     Imports exported data from a stream into a knowledgebox.
@@ -72,8 +73,8 @@ async def import_kb(
             await import_binary(context, kbid, cf, binary_generator)
         elif item_type == ExportedItemType.ENTITIES:
-            entities = cast(kb_pb2.EntitiesGroups, data)
-            await set_entities_groups(context, kbid, entities)
+            # This is not supported anymore, we ignore it if we find it in and old backup
+            pass
         elif item_type == ExportedItemType.LABELS:
             labels = cast(kb_pb2.Labels, data)
@@ -99,7 +100,13 @@ async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTask
     """
     kbid, import_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    try:
+        metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Import metadata not found. Skipping import.", extra={"kbid": kbid, "import_id": import_id}
+        )
+        return
     retry_handler = TaskRetryHandler("import", dm, metadata)

nucliadb/export_import/tasks.py CHANGED Viewed

@@ -56,6 +56,7 @@ def get_exports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=export_kb_to_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )
@@ -77,6 +78,7 @@ def get_imports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=import_kb_from_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )

nucliadb/export_import/utils.py CHANGED Viewed

@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import functools
-from typing import AsyncGenerator, AsyncIterator, Callable, Optional
+from collections.abc import AsyncGenerator, AsyncIterator, Callable
 import backoff
 from google.protobuf.message import DecodeError as ProtobufDecodeError
@@ -35,6 +35,7 @@ from nucliadb.export_import.exceptions import (
 )
 from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
 from nucliadb.ingest.orm.broker_message import generate_broker_message
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb_models.configuration import SearchConfiguration
 from nucliadb_models.export_import import Status
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -171,14 +172,6 @@ async def import_binary(
     )
-async def set_entities_groups(
-    context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
-) -> None:
-    async with datamanagers.with_transaction() as txn:
-        await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
-        await txn.commit()
 async def set_synonyms(context: ApplicationContext, kbid: str, synonyms: kb_pb2.Synonyms) -> None:
     async with datamanagers.with_transaction() as txn:
         await datamanagers.synonyms.set(txn, kbid=kbid, synonyms=synonyms)
@@ -207,9 +200,9 @@ async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> Asyn
 async def get_broker_message(
     context: ApplicationContext, kbid: str, rid: str
-) -> Optional[writer_pb2.BrokerMessage]:
+) -> writer_pb2.BrokerMessage | None:
     async with datamanagers.with_ro_transaction() as txn:
-        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
         if resource is None:
             return None
         resource.disable_vectors = False
@@ -284,11 +277,6 @@ async def download_binary(
     assert downloaded_bytes == cf.size, "Downloaded bytes do not match the expected size"
-async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
-    async with datamanagers.with_ro_transaction() as txn:
-        return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
 async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
     async with datamanagers.with_ro_transaction() as txn:
         return await datamanagers.labels.get_labels(txn, kbid=kbid)
@@ -434,7 +422,7 @@ class ExportStreamReader:
     async def maybe_read_learning_config(
         self,
-    ) -> tuple[Optional[learning_proxy.LearningConfiguration], bytes]:
+    ) -> tuple[learning_proxy.LearningConfiguration | None, bytes]:
         """
         Tries to read a learning config from the beginning of the stream.
         Returs the learning config if found. It also returns any leftover bytes that
@@ -533,7 +521,7 @@ class TaskRetryHandler:
 async def get_learning_config(
     kbid: str,
-) -> Optional[learning_proxy.LearningConfiguration]:
+) -> learning_proxy.LearningConfiguration | None:
     return await learning_proxy.get_configuration(kbid)

nucliadb/health.py CHANGED Viewed

@@ -19,7 +19,7 @@
 #
 import asyncio
 import logging
-from typing import Awaitable, Callable, Optional
+from collections.abc import Awaitable, Callable
 from grpc import aio
 from grpc_health.v1 import health, health_pb2, health_pb2_grpc
@@ -41,7 +41,7 @@ def nats_manager_healthy() -> bool:
 def pubsub_check() -> bool:
-    driver: Optional[PubSubDriver] = get_utility(Utility.PUBSUB)
+    driver: PubSubDriver | None = get_utility(Utility.PUBSUB)
     if driver is None:
         return True
     if isinstance(driver, NatsPubsub):

nucliadb/ingest/app.py CHANGED Viewed

@@ -19,7 +19,7 @@
 #
 import asyncio
 import importlib.metadata
-from typing import Awaitable, Callable
+from collections.abc import Awaitable, Callable
 from nucliadb import health
 from nucliadb.backups.tasks import initialize_consumers as initialize_backup_consumers
@@ -96,7 +96,7 @@ async def initialize_grpc():  # pragma: no cover
     finalizers = await initialize()
     grpc_finalizer = await start_grpc(SERVICE_NAME)
-    return [grpc_finalizer] + finalizers
+    return [grpc_finalizer, *finalizers]
 async def initialize_pull_workers() -> list[Callable[[], Awaitable[None]]]:
@@ -114,14 +114,14 @@ async def main_consumer():  # pragma: no cover
     ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
-    await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown] + finalizers)
+    await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown, *finalizers])
 async def main_orm_grpc():  # pragma: no cover
     finalizers = await initialize()
     grpc_finalizer = await start_grpc(SERVICE_NAME)
     metrics_server = await serve_metrics()
-    await run_until_exit([grpc_finalizer, metrics_server.shutdown] + finalizers)
+    await run_until_exit([grpc_finalizer, metrics_server.shutdown, *finalizers])
 async def main_ingest_processed_consumer():  # pragma: no cover
@@ -134,7 +134,7 @@ async def main_ingest_processed_consumer():  # pragma: no cover
     consumer = await consumer_service.start_ingest_processed_consumer_v2(SERVICE_NAME)
     await run_until_exit(
-        [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine] + finalizers
+        [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine, *finalizers]
     )
@@ -158,8 +158,8 @@ async def main_subscriber_workers():  # pragma: no cover
     backup_consumers_finalizers = await initialize_backup_consumers(context)
     await run_until_exit(
-        backup_consumers_finalizers
-        + [
+        [
+            *backup_consumers_finalizers,
             imports_consumer.finalize,
             exports_consumer.finalize,
             stop_ingest_utility,
@@ -169,8 +169,8 @@ async def main_subscriber_workers():  # pragma: no cover
             grpc_health_finalizer,
             metrics_server.shutdown,
             context.finalize,
+            *finalizers,
         ]
-        + finalizers
     )

nucliadb/ingest/consumer/consumer.py CHANGED Viewed

@@ -20,12 +20,10 @@
 import asyncio
 import logging
 import time
-from typing import Optional, Union
 import backoff
 import nats
 import nats.js.api
-import nats.js.errors
 from nats.aio.client import Msg
 from nats.js import JetStreamContext
@@ -74,8 +72,8 @@ class IngestConsumer:
         partition: str,
         storage: Storage,
         nats_connection_manager: NatsConnectionManager,
-        pubsub: Optional[PubSubDriver] = None,
-        lock: Optional[Union[asyncio.Lock, asyncio.Semaphore]] = None,
+        pubsub: PubSubDriver | None = None,
+        lock: asyncio.Lock | asyncio.Semaphore | None = None,
     ):
         self.driver = driver
         self.partition = partition
@@ -85,9 +83,9 @@ class IngestConsumer:
         self.lock = lock or asyncio.Lock()
         self.processor = Processor(driver, storage, pubsub, partition)
-        self.subscription: Optional[JetStreamContext.PullSubscription] = None
+        self.subscription: JetStreamContext.PullSubscription | None = None
-    async def ack_message(self, msg: Msg, kbid: Optional[str] = None):
+    async def ack_message(self, msg: Msg, kbid: str | None = None):
         await msg.ack()
     async def initialize(self):
@@ -162,7 +160,7 @@ class IngestConsumer:
     async def subscription_worker(self, msg: Msg):
         context.clear_context()
-        kbid: Optional[str] = None
+        kbid: str | None = None
         subject = msg.subject
         reply = msg.reply
         seqid = int(reply.split(".")[5])
@@ -238,7 +236,7 @@ class IngestConsumer:
                 logger.info(
                     f"An error happend while processing a message from {message_source}. "
                     f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
-                    f"Check sentry for more details: {str(e)}"
+                    f"Check sentry for more details: {e!s}"
                 )
                 await self.ack_message(msg, kbid)
                 logger.info("Message acked because of deadletter", extra={"seqid": seqid})
@@ -250,7 +248,7 @@ class IngestConsumer:
                 logger.info(
                     f"An error happend while processing a message from {message_source}. "
                     f"This message has been dropped and won't be retried again"
-                    f"Check sentry for more details: {str(e)}"
+                    f"Check sentry for more details: {e!s}"
                 )
                 await self.ack_message(msg, kbid)
                 logger.info("Message acked because of drop", extra={"seqid": seqid})
@@ -260,7 +258,7 @@ class IngestConsumer:
                 logger.exception(
                     f"An error happend while processing a message from {message_source}. "
                     "Message has not been ACKd and will be retried. "
-                    f"Check sentry for more details: {str(e)}"
+                    f"Check sentry for more details: {e!s}"
                 )
                 await msg.nak()
                 logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})

nucliadb/ingest/consumer/pull.py CHANGED Viewed

@@ -21,7 +21,6 @@ import asyncio
 import base64
 import time
 from contextlib import contextmanager
-from typing import Optional
 from aiohttp.client_exceptions import ClientConnectorError
 from opentelemetry import trace
@@ -35,7 +34,6 @@ from nucliadb.common.http_clients.exceptions import ServiceUnavailableException
 from nucliadb.common.http_clients.processing import (
     ProcessingHTTPClient,
     ProcessingPullMessageProgressUpdater,
-    get_nua_api_id,
 )
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.ingest import SERVICE_NAME, logger, logger_activity
@@ -96,7 +94,7 @@ class PullV2Worker:
         driver: Driver,
         storage: Storage,
         pull_time_error_backoff: int,
-        pubsub: Optional[PubSubDriver] = None,
+        pubsub: PubSubDriver | None = None,
         pull_time_empty_backoff: float = 5.0,
         pull_api_timeout: int = 60,
     ):
@@ -142,12 +140,9 @@ class PullV2Worker:
         data = None
         if nuclia_settings.nuclia_service_account is not None:
             headers["X-STF-NUAKEY"] = f"Bearer {nuclia_settings.nuclia_service_account}"
-            # parse jwt sub to get pull type id
-            try:
-                get_nua_api_id()
-            except Exception as exc:
+            if nuclia_settings.nuclia_service_account is None:
                 logger.exception("Could not read NUA API Key. Can not start pull worker")
-                raise ReallyStopPulling() from exc
+                raise ReallyStopPulling()
         ack_tokens = []
         async with ProcessingHTTPClient() as processing_http_client:

nucliadb/ingest/consumer/service.py CHANGED Viewed

@@ -19,8 +19,8 @@
 #
 import asyncio
 import sys
+from collections.abc import Awaitable, Callable
 from functools import partial
-from typing import Awaitable, Callable, Optional
 from nucliadb.common.maindb.utils import setup_driver
 from nucliadb.ingest import SERVICE_NAME, logger
@@ -55,7 +55,7 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
 async def start_ingest_consumers(
-    service_name: Optional[str] = None,
+    service_name: str | None = None,
 ) -> Callable[[], Awaitable[None]]:
     if transaction_settings.transaction_local:
         raise ConfigurationError("Can not start ingest consumers in local mode")
@@ -91,7 +91,7 @@ async def start_ingest_consumers(
 async def start_ingest_processed_consumer_v2(
-    service_name: Optional[str] = None,
+    service_name: str | None = None,
 ) -> Callable[[], Awaitable[None]]:
     """
     This is not meant to be deployed with a stateful set like the other consumers.

nucliadb/ingest/consumer/utils.py CHANGED Viewed

@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from typing import Callable, Coroutine
+from collections.abc import Callable, Coroutine
 class DelayedTaskHandler:

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl