nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/resource.py
CHANGED
```diff
@@ -22,8 +22,9 @@ from __future__ import annotations
 import asyncio
 import logging
 from collections import defaultdict
+from collections.abc import Sequence
 from concurrent.futures import ThreadPoolExecutor
-from typing import
+from typing import Any
 
 from nucliadb.common import datamanagers
 from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
@@ -68,13 +69,11 @@ from nucliadb_protos.resources_pb2 import Origin as PBOrigin
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
 from nucliadb_protos.writer_pb2 import BrokerMessage
 from nucliadb_utils.storages.storage import Storage
-
-if TYPE_CHECKING:  # pragma: no cover
-    from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
+from nucliadb_utils.utilities import get_storage
 
 logger = logging.getLogger(__name__)
 
-KB_FIELDS: dict[int,
+KB_FIELDS: dict[int, type] = {
     FieldType.TEXT: Text,
     FieldType.FILE: File,
     FieldType.LINK: Link,
@@ -104,40 +103,55 @@ class Resource:
         self,
         txn: Transaction,
         storage: Storage,
-        kb: KnowledgeBox,
+        kbid: str,
         uuid: str,
-        basic: Optional[PBBasic] = None,
+        basic: PBBasic | None = None,
         disable_vectors: bool = True,
     ):
         self.fields: dict[tuple[FieldType.ValueType, str], Field] = {}
         self.conversations: dict[int, PBConversation] = {}
-        self.relations: Optional[PBRelations] = None
-        self.all_fields_keys: Optional[list[tuple[FieldType.ValueType, str]]] = None
-        self.origin: Optional[PBOrigin] = None
-        self.extra: Optional[PBExtra] = None
-        self.security: Optional[utils_pb2.Security] = None
+        self.relations: PBRelations | None = None
+        self.all_fields_keys: list[tuple[FieldType.ValueType, str]] | None = None
+        self.origin: PBOrigin | None = None
+        self.extra: PBExtra | None = None
+        self.security: utils_pb2.Security | None = None
         self.modified: bool = False
         self._modified_extracted_text: list[FieldID] = []
 
         self.txn = txn
         self.storage = storage
-        self.kb = kb
+        self.kbid = kbid
         self.uuid = uuid
         self.basic = basic
         self.disable_vectors = disable_vectors
-        self._previous_status: Optional[Metadata.Status.ValueType] = None
-        self.user_relations: Optional[PBRelations] = None
+        self._previous_status: Metadata.Status.ValueType | None = None
+        self.user_relations: PBRelations | None = None
         self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
 
+    @classmethod
+    async def get(cls, txn: Transaction, kbid: str, rid: str) -> Resource | None:
+        basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
+        if basic is None:
+            return None
+        storage = await get_storage()
+        return cls(
+            txn=txn,
+            storage=storage,
+            kbid=kbid,
+            uuid=rid,
+            basic=basic,
+            disable_vectors=False,
+        )
+
     async def set_slug(self):
         basic = await self.get_basic()
-        new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
+        new_key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug)
         await self.txn.set(new_key, self.uuid.encode())
 
     # Basic
-    async def get_basic(self) -> Optional[PBBasic]:
+    async def get_basic(self) -> PBBasic:
         if self.basic is None:
-            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=self.uuid)
             self.basic = basic if basic is not None else PBBasic()
         return self.basic
 
@@ -159,7 +173,7 @@ class Resource:
     async def set_basic(
         self,
         payload: PBBasic,
-        deleted_fields: Optional[list[FieldID]] = None,
+        deleted_fields: list[FieldID] | None = None,
     ):
         await self.get_basic()
 
@@ -212,49 +226,43 @@ class Resource:
         if deleted_fields is not None and len(deleted_fields) > 0:
             delete_basic_computedmetadata_classifications(self.basic, deleted_fields=deleted_fields)
 
-        await datamanagers.resources.set_basic(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, basic=self.basic
-        )
+        await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=self.uuid, basic=self.basic)
         self.modified = True
 
     # Origin
-    async def get_origin(self) -> Optional[PBOrigin]:
+    async def get_origin(self) -> PBOrigin | None:
         if self.origin is None:
-            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kbid, rid=self.uuid)
             self.origin = origin
         return self.origin
 
     async def set_origin(self, payload: PBOrigin):
-        await datamanagers.resources.set_origin(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, origin=payload
-        )
+        await datamanagers.resources.set_origin(self.txn, kbid=self.kbid, rid=self.uuid, origin=payload)
         self.modified = True
         self.origin = payload
 
     # Extra
-    async def get_extra(self) -> Optional[PBExtra]:
+    async def get_extra(self) -> PBExtra | None:
         if self.extra is None:
-            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kbid, rid=self.uuid)
             self.extra = extra
         return self.extra
 
     async def set_extra(self, payload: PBExtra):
-        await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
+        await datamanagers.resources.set_extra(self.txn, kbid=self.kbid, rid=self.uuid, extra=payload)
         self.modified = True
         self.extra = payload
 
     # Security
-    async def get_security(self) -> Optional[utils_pb2.Security]:
+    async def get_security(self) -> utils_pb2.Security | None:
         if self.security is None:
-            security = await datamanagers.resources.get_security(
-                self.txn, kbid=self.kb.kbid, rid=self.uuid
-            )
+            security = await datamanagers.resources.get_security(self.txn, kbid=self.kbid, rid=self.uuid)
             self.security = security
         return self.security
 
     async def set_security(self, payload: utils_pb2.Security) -> None:
         await datamanagers.resources.set_security(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, security=payload
+            self.txn, kbid=self.kbid, rid=self.uuid, security=payload
         )
         self.modified = True
         self.security = payload
@@ -262,7 +270,7 @@ class Resource:
     # Relations
     async def get_user_relations(self) -> PBRelations:
         if self.user_relations is None:
-            sf = self.storage.user_relations(self.kb.kbid, self.uuid)
+            sf = self.storage.user_relations(self.kbid, self.uuid)
             relations = await self.storage.download_pb(sf, PBRelations)
             if relations is None:
                 # Key not found = no relations
@@ -272,7 +280,7 @@ class Resource:
         return self.user_relations
 
     async def set_user_relations(self, payload: PBRelations):
-        sf = self.storage.user_relations(self.kb.kbid, self.uuid)
+        sf = self.storage.user_relations(self.kbid, self.uuid)
         await self.storage.upload_pb(sf, payload)
         self.modified = True
         self.user_relations = payload
@@ -354,25 +362,34 @@ class Resource:
 
         await field_obj.delete()
 
+    async def field_exists(self, type: FieldType.ValueType, field: str) -> bool:
+        """Return whether this resource has this field or not."""
+        all_fields_ids = await self.get_fields_ids()
+        for field_type, field_id in all_fields_ids:
+            if field_type == type and field_id == field:
+                return True
+        return False
+
     def has_field(self, type: FieldType.ValueType, field: str) -> bool:
+        # REVIEW: are we sure we don't want to actually check this?
         return (type, field) in self.fields
 
-    async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
+    async def get_all_field_ids(self, *, for_update: bool) -> PBAllFieldIDs | None:
         return await datamanagers.resources.get_all_field_ids(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
+            self.txn, kbid=self.kbid, rid=self.uuid, for_update=for_update
         )
 
     async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
         return await datamanagers.resources.set_all_field_ids(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, allfields=all_fields
+            self.txn, kbid=self.kbid, rid=self.uuid, allfields=all_fields
        )
 
     async def update_all_field_ids(
         self,
         *,
-        updated: Optional[list[FieldID]] = None,
-        deleted: Optional[list[FieldID]] = None,
-        errors: Optional[list[writer_pb2.Error]] = None,
+        updated: list[FieldID] | None = None,
+        deleted: list[FieldID] | None = None,
+        errors: list[writer_pb2.Error] | None = None,
     ):
         needs_update = False
         all_fields = await self.get_all_field_ids(for_update=True)
@@ -451,7 +468,7 @@ class Resource:
 
         # If this message comes from the processor (not a DA worker), we clear all previous errors
         # TODO: When generated_by is populated with DA tasks by processor, remove only related errors
-        from_processor = any(
+        from_processor = any(x.WhichOneof("generator") == "processor" for x in message.generated_by)
 
         for (field_type, field), errors in errors_by_field.items():
             field_obj = await self.get_field(field, field_type, load=False)
@@ -471,7 +488,7 @@ class Resource:
         # We infer the status for processor messages
         if message.source == BrokerMessage.MessageSource.PROCESSOR:
             if any(
-
+                e.source_error.severity == writer_pb2.Error.Severity.ERROR for e in status.errors
             ):
                 status.status = writer_pb2.FieldStatus.Status.ERROR
             else:
@@ -501,25 +518,21 @@ class Resource:
             return
 
         field_statuses = await datamanagers.fields.get_statuses(
-            self.txn, kbid=self.kb.kbid, rid=self.uuid, fields=field_ids.fields
+            self.txn, kbid=self.kbid, rid=self.uuid, fields=field_ids.fields
         )
 
         # If any field is processing -> PENDING
-        if any(
+        if any(f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses):
             self.basic.metadata.status = PBMetadata.Status.PENDING
         # If we have any non-DA error -> ERROR
         elif any(
-
-
-
-
-
-            and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
-            for e in f.errors
-            )
-        )
-            for f in field_statuses
+            f.status == writer_pb2.FieldStatus.Status.ERROR
+            and any(
+                e.source_error.severity == writer_pb2.Error.Severity.ERROR
+                and e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
+                for e in f.errors
             )
+            for f in field_statuses
         ):
             self.basic.metadata.status = PBMetadata.Status.ERROR
         # Otherwise (everything processed or we only have DA errors) -> PROCESSED
@@ -642,7 +655,7 @@ class Resource:
             FieldType.LINK,
             load=False,
         )
-        maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kb.kbid)
+        maybe_update_basic_thumbnail(self.basic, link_extracted_data.link_thumbnail, self.kbid)
 
         await field_link.set_link_extracted_data(link_extracted_data)
 
@@ -669,7 +682,7 @@ class Resource:
             return
         logger.info(
             "Updating resource title from link extracted data",
-            extra={"kbid": self.kb.kbid, "field": link_extracted_data.field, "rid": self.uuid},
+            extra={"kbid": self.kbid, "field": link_extracted_data.field, "rid": self.uuid},
         )
         title = link_extracted_data.title
         await self.update_resource_title(title)
@@ -711,7 +724,7 @@ class Resource:
         # uri can change after extraction
         await field_file.set_file_extracted_data(file_extracted_data)
         maybe_update_basic_icon(self.basic, file_extracted_data.icon)
-        maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kb.kbid)
+        maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail, self.kbid)
         self.modified = True
 
     async def _should_update_resource_title_from_file_metadata(self) -> bool:
@@ -733,7 +746,7 @@ class Resource:
         filenames = set()
         for (field_type, _), field_obj in fields.items():
             if field_type == FieldType.FILE:
-                field_value: Optional[FieldFile] = await field_obj.get_value()
+                field_value: FieldFile | None = await field_obj.get_value()
                 if field_value is not None:
                     if field_value.file.filename not in ("", None):
                         filenames.add(field_value.file.filename)
@@ -758,7 +771,7 @@ class Resource:
         fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
         logger.info(
             "Updating resource title from file extracted data",
-            extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
+            extra={"kbid": self.kbid, "field": fid.full(), "new_title": fed.title},
         )
         await self.update_resource_title(fed.title)
         await self.unmark_title_for_reset()
@@ -776,9 +789,7 @@ class Resource:
         )
         await field_obj.set_field_metadata(field_metadata)
 
-        maybe_update_basic_thumbnail(
-            self.basic, field_metadata.metadata.metadata.thumbnail, self.kb.kbid
-        )
+        maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail, self.kbid)
 
         update_basic_computedmetadata_classifications(self.basic, field_metadata)
         self.modified = True
@@ -790,7 +801,7 @@ class Resource:
         await self.get_fields(force=True)
         vectorsets = {
             vectorset_id: vs
-            async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kb.kbid)
+            async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kbid)
         }
 
         for field_vectors in fields_vectors:
@@ -799,13 +810,13 @@ class Resource:
                 assert len(vectorsets) == 1, (
                     "Invalid broker message, can't ingest vectors from unknown vectorset to KB with multiple vectorsets"
                 )
-                vectorset =
+                vectorset = next(iter(vectorsets.values()))
 
             else:
                 if field_vectors.vectorset_id not in vectorsets:
                     logger.warning(
                         "Dropping extracted vectors for unknown vectorset",
-                        extra={"kbid": self.kb.kbid, "vectorset": field_vectors.vectorset_id},
+                        extra={"kbid": self.kbid, "vectorset": field_vectors.vectorset_id},
                     )
                     continue
 
@@ -916,7 +927,7 @@ def maybe_update_basic_summary(basic: PBBasic, summary_text: str) -> bool:
     return True
 
 
-def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
+def maybe_update_basic_icon(basic: PBBasic, mimetype: str | None) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
@@ -935,7 +946,7 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     return True
 
 
-def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile], kbid: str) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: CloudFile | None, kbid: str) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
@@ -972,7 +983,7 @@ def update_basic_languages(basic: Basic, languages: list[str]) -> bool:
     return updated
 
 
-def get_text_field_mimetype(bm: BrokerMessage) -> Optional[str]:
+def get_text_field_mimetype(bm: BrokerMessage) -> str | None:
     if len(bm.texts) == 0:
         return None
     text_format = next(iter(bm.texts.values())).format
```
nucliadb/ingest/orm/utils.py
CHANGED
```diff
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 import urllib.parse
-from typing import Sequence
+from collections.abc import Sequence
 
 from nucliadb.models.internal.processing import PushPayload, PushTextFormat, Text
 from nucliadb_protos.resources_pb2 import (
```
nucliadb/ingest/partitions.py
CHANGED
```diff
@@ -25,12 +25,17 @@ from nucliadb.ingest.settings import Settings
 
 
 def assign_partitions(settings: Settings):
+    """
+    This function dynamically assigns the partitions to the current ingest sts
+    replica based on its hostname, typically (ingest-0, ingest-1, etc).
+    """
     # partitions start from 1, instead of 0
     all_partitions = [str(part + 1) for part in range(settings.nuclia_partitions)]
 
     # get replica number and total replicas from environment
     logger.info(f"PARTITIONS: Total Replicas = {settings.total_replicas}")
     if settings.replica_number == -1:
+        # Get replica number from hostname
         hostname = os.environ.get("HOSTNAME")
         if hostname is not None:
             sts_values = hostname.split("-")
@@ -39,10 +44,16 @@ def assign_partitions(settings: Settings):
                     settings.replica_number = int(sts_values[-1])
                 except Exception:
                     logger.error(f"Could not extract replica number from hostname: {hostname}")
-
+            else:
+                logger.warning(f"Could not determine replica number from hostname: {hostname}")
+        else:
+            logger.warning(f"Could not determine replica number from hostname.")
 
     if settings.replica_number == -1:
         settings.replica_number = 0
+    else:
+        # We assume that replica numbers are set manually via env variables
+        pass
     logger.info(f"PARTITIONS: Replica Number = {settings.replica_number}")
 
     # calculate assigned partitions based on total replicas and own replica number
```
nucliadb/ingest/processing.py
CHANGED
```diff
@@ -25,7 +25,7 @@ import uuid
 from collections import defaultdict
 from contextlib import AsyncExitStack
 from enum import Enum
-from typing import Any, Optional
+from typing import Any
 
 import aiohttp
 import backoff
@@ -132,19 +132,19 @@ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> Process
 class ProcessingEngine:
     def __init__(
         self,
-        nuclia_service_account: Optional[str] = None,
-        nuclia_zone: Optional[str] = None,
-        nuclia_public_url: Optional[str] = None,
-        nuclia_processing_cluster_url: Optional[str] = None,
-        onprem: Optional[bool] = False,
-        nuclia_jwt_key: Optional[str] = None,
+        nuclia_service_account: str | None = None,
+        nuclia_zone: str | None = None,
+        nuclia_public_url: str | None = None,
+        nuclia_processing_cluster_url: str | None = None,
+        onprem: bool | None = False,
+        nuclia_jwt_key: str | None = None,
         days_to_keep: int = 3,
         driver: FileBackendConfig = FileBackendConfig.GCS,
     ):
         self.nuclia_service_account = nuclia_service_account
         self.nuclia_zone = nuclia_zone
         if nuclia_public_url is not None:
-            self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
+            self.nuclia_public_url: str | None = nuclia_public_url.format(zone=nuclia_zone)
         else:
             self.nuclia_public_url = None
 
@@ -196,7 +196,7 @@ class ProcessingEngine:
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
 
     def generate_file_token_from_fieldfile(
-        self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: FieldFilePB, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
@@ -235,7 +235,7 @@ class ProcessingEngine:
     )
     @processing_observer.wrap({"type": "file_field_upload"})
     async def convert_filefield_to_str(
-        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         # Upload file without storing on Nuclia DB
         headers = {}
@@ -273,7 +273,7 @@ class ProcessingEngine:
         ).decode()
 
     def convert_external_filefield_to_str(
-        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
@@ -313,7 +313,7 @@ class ProcessingEngine:
         self,
         file: FieldFilePB,
         storage: Storage,
-        classif_labels: Optional[list[ClassificationLabel]] = None,
+        classif_labels: list[ClassificationLabel] | None = None,
     ) -> str:
         """It's already an internal file that needs to be uploaded"""
         if self.onprem is False:
@@ -438,7 +438,7 @@ class ProcessingEngine:
             queue=QueueType(queue_type) if queue_type is not None else None,
         )
 
-    async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
+    async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
         """
         Delete a resource from processing. This prevents inflight resources from being processed
         and wasting resources.
@@ -479,7 +479,7 @@ class DummyProcessingEngine(ProcessingEngine):
         pass
 
     async def convert_filefield_to_str(
-        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file: models.FileField, classif_labels: list[ClassificationLabel] | None = None
    ) -> str:
         self.calls.append([file])
         index = len(self.values["convert_filefield_to_str"])
@@ -487,7 +487,7 @@ class DummyProcessingEngine(ProcessingEngine):
         return f"convert_filefield_to_str,{index}"
 
     def convert_external_filefield_to_str(
-        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+        self, file_field: models.FileField, classif_labels: list[ClassificationLabel] | None = None
     ) -> str:
         self.calls.append([file_field])
         index = len(self.values["convert_external_filefield_to_str"])
@@ -498,7 +498,7 @@ class DummyProcessingEngine(ProcessingEngine):
         self,
         file: FieldFilePB,
         storage: Storage,
-        classif_labels: Optional[list[ClassificationLabel]] = None,
+        classif_labels: list[ClassificationLabel] | None = None,
     ) -> str:
         self.calls.append([file, storage])
         index = len(self.values["convert_internal_filefield_to_str"])
@@ -516,5 +516,5 @@ class DummyProcessingEngine(ProcessingEngine):
         self.values["send_to_process"].append([item, partition])
         return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
 
-    async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
+    async def delete_from_processing(self, *, kbid: str, resource_id: str | None = None) -> None:
         self.calls.append([kbid, resource_id])
```