nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/common/ids.py CHANGED
@@ -24,7 +24,6 @@ paragraphs... Avoiding spread of id construction and parsing everywhere
 """
 
 from dataclasses import dataclass
-from typing import Optional
 
 from nucliadb_models.common import FieldTypeName
 from nucliadb_protos.resources_pb2 import FieldType
@@ -47,6 +46,8 @@ FIELD_TYPE_NAME_TO_STR = {
     FieldTypeName.CONVERSATION: "c",
 }
 
+FIELD_TYPE_STR_TO_NAME = {v: k for k, v in FIELD_TYPE_NAME_TO_STR.items()}
+
 
 @dataclass
 class FieldId:
@@ -65,7 +66,7 @@ class FieldId:
 
     Examples:
 
-    >>> FieldId(rid="rid", type="u", key="/my-link")
+    >>> FieldId(rid="rid", type="u", key="my-link")
     FieldID("rid/u/my-link")
     >>> FieldId.from_string("rid/u/my-link")
     FieldID("rid/u/my-link")
@@ -75,32 +76,7 @@ class FieldId:
     type: str
     key: str
     # also knwon as `split`, this indicates a part of a field in, for example, conversations
-    subfield_id: Optional[str] = None
-
-    def __repr__(self) -> str:
-        return f"FieldId({self.full()})"
-
-    def short_without_subfield(self) -> str:
-        return f"/{self.type}/{self.key}"
-
-    def full(self) -> str:
-        if self.subfield_id is None:
-            return f"{self.rid}/{self.type}/{self.key}"
-        else:
-            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def pb_type(self) -> FieldType.ValueType:
-        return FIELD_TYPE_STR_TO_PB[self.type]
-
-    @classmethod
-    def from_pb(
-        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
-    ) -> "FieldId":
-        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+    subfield_id: str | None = None
 
     @classmethod
     def from_string(cls, value: str) -> "FieldId":
@@ -120,11 +96,11 @@ class FieldId:
         parts = value.split("/")
         if len(parts) == 3:
             rid, _type, key = parts
-            _type = cls.parse_field_type(_type)
+            _type = cls._parse_field_type(_type)
             return cls(rid=rid, type=_type, key=key)
         elif len(parts) == 4:
             rid, _type, key, subfield_id = parts
-            _type = cls.parse_field_type(_type)
+            _type = cls._parse_field_type(_type)
             return cls(
                 rid=rid,
                 type=_type,
@@ -135,7 +111,49 @@ class FieldId:
         raise ValueError(f"Invalid FieldId: {value}")
 
     @classmethod
-    def parse_field_type(cls, _type: str) -> str:
+    def from_pb(
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: str | None = None
+    ) -> "FieldId":
+        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+
+    @property
+    def pb_type(self) -> FieldType.ValueType:
+        return FIELD_TYPE_STR_TO_PB[self.type]
+
+    def full(self) -> str:
+        if self.subfield_id is None:
+            return f"{self.rid}/{self.type}/{self.key}"
+        else:
+            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+
+    def full_without_subfield(self) -> str:
+        return f"{self.rid}/{self.type}/{self.key}"
+
+    def short_without_subfield(self) -> str:
+        return f"/{self.type}/{self.key}"
+
+    def paragraph_id(self, paragraph_start: int, paragraph_end: int) -> "ParagraphId":
+        """Generate a ParagraphId from the current field given its start and
+        end.
+
+        """
+        return ParagraphId(
+            field_id=self,
+            paragraph_start=paragraph_start,
+            paragraph_end=paragraph_end,
+        )
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"FieldId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
+    @staticmethod
+    def _parse_field_type(_type: str) -> str:
         if _type not in FIELD_TYPE_STR_TO_PB:
             # Try to parse the enum value
             # XXX: This is to support field types that are integer values of FieldType
@@ -157,19 +175,6 @@ class ParagraphId:
     paragraph_start: int
     paragraph_end: int
 
-    def __repr__(self) -> str:
-        return f"ParagraphId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "ParagraphId":
         parts = value.split("/")
@@ -192,6 +197,22 @@ class ParagraphId:
             paragraph_end=vid.vector_end,
         )
 
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"ParagraphId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
 
 @dataclass
 class VectorId:
@@ -217,19 +238,6 @@ class VectorId:
     vector_start: int
     vector_end: int
 
-    def __repr__(self) -> str:
-        return f"VectorId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
    @classmethod
    def from_string(cls, value: str) -> "VectorId":
        parts = value.split("/")
@@ -239,8 +247,24 @@
         field_id = FieldId.from_string("/".join(parts[:-2]))
         return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)
 
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"VectorId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
 
-def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
+def extract_data_augmentation_id(generated_field_id: str) -> str | None:
     """Data augmentation generated fields have a strict id with the following
     format:
     `da-{task_id}-{original:field_type}-{original:field_id}[-{original:split}]`
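
Note: a minimal usage sketch of the reorganized id helpers above, using only methods visible in this diff; the values "rid", "u" and "my-link" are placeholders taken from the docstring examples.

    from nucliadb.common.ids import FieldId

    field = FieldId.from_string("rid/u/my-link")
    assert str(field) == "rid/u/my-link"  # the new __str__ delegates to full()

    # paragraph_id() is new in this version: derive a ParagraphId from a field
    paragraph = field.paragraph_id(paragraph_start=0, paragraph_end=10)
    assert str(paragraph) == "rid/u/my-link/0-10"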
nucliadb/common/locking.py CHANGED
@@ -22,7 +22,6 @@ import logging
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Optional
 
 import orjson
 
@@ -99,7 +98,7 @@ class _Lock:
         self.task = asyncio.create_task(self._refresh_task())
         return self
 
-    async def get_lock_data(self, txn: Transaction) -> Optional[LockValue]:
+    async def get_lock_data(self, txn: Transaction) -> LockValue | None:
         existing_data = await txn.get(self.key, for_update=True)
         if existing_data is None:
             return None
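
Note: the locking.py hunk above is the smallest instance of a pattern applied throughout this release: PEP 604 union syntax (X | None, A | B) replaces typing.Optional and typing.Union, often removing the typing import altogether. A before/after sketch with illustrative names:

    from typing import Optional

    def get_old(key: str) -> Optional[bytes]:  # 6.7.x spelling
        ...

    def get_new(key: str) -> bytes | None:  # 6.10.x spelling, native syntax since Python 3.10
        ...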
nucliadb/common/maindb/driver.py CHANGED
@@ -20,8 +20,9 @@
 from __future__ import annotations
 
 import asyncio
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
+from typing import ClassVar
 
 DEFAULT_SCAN_LIMIT = -1
 DEFAULT_BATCH_SCAN_LIMIT = 500
@@ -37,10 +38,10 @@ class Transaction:
     async def commit(self):
         raise NotImplementedError()
 
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
         raise NotImplementedError()
 
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         raise NotImplementedError()
 
     async def set(self, key: str, value: bytes):
@@ -57,7 +58,7 @@ class Transaction:
 
     def keys(
         self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         raise NotImplementedError()
 
     async def count(self, match: str) -> int:
@@ -66,7 +67,7 @@
 
 class Driver:
     initialized = False
-    _abort_tasks: list[asyncio.Task] = []
+    _abort_tasks: ClassVar[list[asyncio.Task]] = []
 
     async def initialize(self):
         raise NotImplementedError()
@@ -81,15 +82,15 @@ class Driver:
         pass
 
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         yield Transaction()
 
     @asynccontextmanager
-    async def ro_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def ro_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=True) as txn:
             yield txn
 
     @asynccontextmanager
-    async def rw_transaction(self) -> AsyncGenerator[Transaction, None]:
+    async def rw_transaction(self) -> AsyncGenerator[Transaction]:
         async with self._transaction(read_only=False) as txn:
             yield txn
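
Note: two other recurring cleanups land in driver.py above. AsyncGenerator now comes from collections.abc and is written with a single type parameter (its SendType defaults to None since Python 3.13, per PEP 696), and the mutable class attribute is annotated with ClassVar. A sketch combining both, with illustrative names:

    import asyncio
    from collections.abc import AsyncGenerator
    from typing import ClassVar

    class ExampleDriver:
        # ClassVar marks the mutable default as deliberately class-level
        # (shared by all instances) rather than a per-instance field.
        _abort_tasks: ClassVar[list[asyncio.Task]] = []

    async def keys(match: str) -> AsyncGenerator[str]:  # was AsyncGenerator[str, None]
        for key in ("a", "ab"):
            if key.startswith(match):
                yield key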
nucliadb/common/maindb/local.py CHANGED
@@ -19,8 +19,8 @@
 #
 import glob
 import os
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Optional
 
 from nucliadb.common.maindb.driver import (
     DEFAULT_BATCH_SCAN_LIMIT,
@@ -78,7 +78,7 @@ class LocalTransaction(Transaction):
         # Deleting a key that does not exist
         pass
 
-    async def read(self, key: str) -> Optional[bytes]:
+    async def read(self, key: str) -> bytes | None:
         try:
             async with aiofiles.open(self.compute_path(key), "rb") as resp:
                 return await resp.read()
@@ -106,8 +106,8 @@ class LocalTransaction(Transaction):
         self.clean()
         self.open = False
 
-    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
-        results: list[Optional[bytes]] = []
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
+        results: list[bytes | None] = []
         for key in keys:
             obj = await self.get(key)
             if obj:
@@ -125,7 +125,7 @@ class LocalTransaction(Transaction):
 
         return results
 
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         if key in self.deleted_keys:
             raise KeyError(f"Not found {key}")
 
nucliadb/common/maindb/pg.py CHANGED
@@ -21,8 +21,9 @@ from __future__ import annotations
 
 import asyncio
 import logging
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from typing import Any, AsyncGenerator, Optional
+from typing import Any
 
 import backoff
 import psycopg
@@ -72,7 +73,7 @@ class DataLayer:
         self.connection = connection
         self.log_on_select_for_update = settings.driver_pg_log_on_select_for_update
 
-    async def get(self, key: str, select_for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, select_for_update: bool = False) -> bytes | None:
         with pg_observer({"type": "get"}):
             statement = "SELECT value FROM resources WHERE key = %s"
             if select_for_update:
@@ -116,7 +117,7 @@ class DataLayer:
             async with self.connection.cursor() as cur:
                 await cur.execute("DELETE FROM resources WHERE key LIKE %s", (prefix + "%",))
 
-    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[bytes | None]:
         with pg_observer({"type": "batch_get"}):
             async with self.connection.cursor() as cur:
                 statement = "SELECT key, value FROM resources WHERE key = ANY(%s)"
@@ -134,7 +135,7 @@ class DataLayer:
         prefix: str,
         limit: int = DEFAULT_SCAN_LIMIT,
         include_start: bool = True,
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[str]:
         query = "SELECT key FROM resources WHERE key LIKE %s ORDER BY key"
 
         args: list[Any] = [prefix + "%"]
@@ -190,7 +191,7 @@ class PGTransaction(Transaction):
     async def batch_get(self, keys: list[str], for_update: bool = True):
         return await self.data_layer.batch_get(keys, select_for_update=for_update)
 
-    async def get(self, key: str, for_update: bool = True) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = True) -> bytes | None:
         return await self.data_layer.get(key, select_for_update=for_update)
 
     async def set(self, key: str, value: bytes):
@@ -243,7 +244,7 @@ class ReadOnlyPGTransaction(Transaction):
         return await DataLayer(conn).batch_get(keys, select_for_update=False)
 
     @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
-    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> bytes | None:
         async with self.driver._get_connection() as conn:
             return await DataLayer(conn).get(key, select_for_update=False)
 
@@ -330,7 +331,7 @@ class PGDriver(Driver):
             metric.set(value)
 
     @asynccontextmanager
-    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
         if read_only:
             yield ReadOnlyPGTransaction(self)
         else:
@@ -343,7 +344,7 @@ class PGDriver(Driver):
             await txn.abort()
 
     @asynccontextmanager
-    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection, None]:
+    async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection]:
         timeout = self.acquire_timeout_ms / 1000
         # Manual retry loop since backoff.on_exception does not play well with async context managers
         retries = 0
nucliadb/common/nidx.py CHANGED
@@ -19,7 +19,6 @@
 #
 
 import os
-from typing import Optional, Union
 
 from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
 from nidx_protos.nodewriter_pb2 import (
@@ -54,7 +53,7 @@ class NidxUtility:
         pass
 
 
-def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
+def _storage_config(prefix: str, bucket: str | None) -> dict[str, str]:
     config = {}
     if storage_settings.file_backend == FileBackendConfig.LOCAL:
         local_bucket = bucket or storage_settings.local_indexing_bucket
@@ -82,6 +81,24 @@ def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
             config[f"{prefix}__REGION_NAME"] = storage_settings.s3_region_name or ""
         if storage_settings.s3_endpoint:
             config[f"{prefix}__ENDPOINT"] = storage_settings.s3_endpoint
+    elif storage_settings.file_backend == FileBackendConfig.AZURE:
+        if storage_settings.azure_account_url is None:
+            raise ValueError("Azure account is required")
+        config[f"{prefix}__OBJECT_STORE"] = "azure"
+        url = storage_settings.azure_account_url
+        container = bucket or extended_storage_settings.azure_indexing_bucket
+        if container:
+            url += f"/{container}"
+        config[f"{prefix}__CONTAINER_URL"] = url
+        if storage_settings.azure_connection_string:
+            params = {
+                p.split("=", 1)[0]: p.split("=", 1)[1]
+                for p in storage_settings.azure_connection_string.split(";")
+            }
+            if "AccountKey" in params:
+                config[f"{prefix}__ACCOUNT_KEY"] = params["AccountKey"]
+            if "BlobEndpoint" in params:
+                config[f"{prefix}__ENDPOINT"] = params["BlobEndpoint"]
 
     return config
 
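Note: the Azure branch added above assumes the standard connection-string format, a semicolon-separated list of key=value pairs; splitting with maxsplit=1 keeps the "=" padding of base64 account keys intact. An illustration of the same parsing with placeholder credentials:

    # Placeholder values; real connection strings carry account credentials.
    conn = "AccountName=devaccount;AccountKey=c2VjcmV0PQ==;BlobEndpoint=http://localhost:10000/devaccount"
    params = {p.split("=", 1)[0]: p.split("=", 1)[1] for p in conn.split(";")}
    assert params["AccountKey"] == "c2VjcmV0PQ=="  # '==' padding preserved
    assert params["BlobEndpoint"] == "http://localhost:10000/devaccount"
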
@@ -143,7 +160,7 @@ class NidxNatsIndexer:
     async def index(self, writer: IndexMessage) -> int:
         res = await self.nats_connection_manager.js.publish(self.subject, writer.SerializeToString())
         logger.info(
-            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"  # noqa
+            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"
         )
         return res.seq
 
@@ -167,7 +184,7 @@ class NidxGrpcIndexer:
 class NidxServiceUtility(NidxUtility):
     """Implements Nidx utility connecting to the network service"""
 
-    indexer: Union[NidxNatsIndexer, NidxGrpcIndexer]
+    indexer: NidxNatsIndexer | NidxGrpcIndexer
 
     def __init__(self, service_name: str):
         self.service_name = service_name
@@ -198,7 +215,7 @@ class NidxServiceUtility(NidxUtility):
         return await self.indexer.index(writer)
 
 
-async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> Optional[NidxUtility]:
+async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> NidxUtility:
     nidx = get_utility(Utility.NIDX)
     if nidx:
         return nidx
nucliadb/common/vector_index_config.py CHANGED
@@ -26,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2 as Nucliadb
 def nucliadb_vector_type_to_nidx(nucliadb: Nucliadb.VectorType.ValueType) -> Nidx.VectorType.ValueType:
     if nucliadb == Nucliadb.DENSE_F32:
         return Nidx.DENSE_F32
-    else:  # pragma: nocover
+    else:  # pragma: no cover
         raise Exception("Unknown vector type")
 
 
nucliadb/export_import/datamanager.py CHANGED
@@ -18,8 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import json
+from collections.abc import AsyncGenerator
 from datetime import datetime, timezone
-from typing import AsyncGenerator, Type, Union, cast
+from typing import Type, cast
 
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.export_import import logger
@@ -34,7 +35,7 @@ MAINDB_IMPORT_KEY = "/kbs/{kbid}/imports/{id}"
 STORAGE_EXPORT_KEY = "exports/{export_id}"
 STORAGE_IMPORT_KEY = "imports/{import_id}"
 
-Metadata = Union[ExportMetadata, ImportMetadata]
+Metadata = ExportMetadata | ImportMetadata
 
 
 class ExportImportDataManager:
@@ -59,7 +60,7 @@ class ExportImportDataManager:
         if data is None or data == b"":
             raise MetadataNotFound()
         decoded = data.decode("utf-8")
-        model_type: Union[Type[ExportMetadata], Type[ImportMetadata]]
+        model_type: Type[ExportMetadata] | Type[ImportMetadata]
         if type == "export":
             model_type = ExportMetadata
         elif type == "import":
nucliadb/export_import/exporter.py CHANGED
@@ -18,11 +18,12 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from typing import AsyncGenerator, Optional
+from collections.abc import AsyncGenerator
 
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ExportMetadata,
@@ -33,7 +34,6 @@ from nucliadb.export_import.utils import (
     download_binary,
     get_broker_message,
     get_cloud_files,
-    get_entities,
     get_labels,
     get_learning_config,
     iter_kb_resource_uuids,
@@ -43,7 +43,7 @@ from nucliadb_telemetry import errors
 
 
 async def export_kb(
-    context: ApplicationContext, kbid: str, metadata: Optional[ExportMetadata] = None
+    context: ApplicationContext, kbid: str, metadata: ExportMetadata | None = None
 ) -> AsyncGenerator[bytes, None]:
     """Export the data of a knowledgebox to a stream of bytes.
 
@@ -63,9 +63,6 @@ async def export_kb(
     async for chunk in resources_iterator:
         yield chunk
 
-    async for chunk in export_entities(context, kbid):
-        yield chunk
-
     async for chunk in export_labels(context, kbid):
         yield chunk
 
@@ -76,7 +73,14 @@ async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMessage
     """
     kbid, export_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    try:
+        metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Export metadata not found. Skipping export.", extra={"kbid": kbid, "export_id": export_id}
+        )
+        return
+
     iterator = export_kb(context, kbid, metadata)
 
     retry_handler = TaskRetryHandler("export", dm, metadata)
@@ -167,18 +171,6 @@ async def export_resource_with_binaries(
     yield bm_bytes
 
 
-async def export_entities(
-    context: ApplicationContext,
-    kbid: str,
-) -> AsyncGenerator[bytes, None]:
-    entities = await get_entities(context, kbid)
-    if len(entities.entities_groups) > 0:
-        data = entities.SerializeToString()
-        yield ExportedItemType.ENTITIES.encode("utf-8")
-        yield len(data).to_bytes(4, byteorder="big")
-        yield data
-
-
 async def export_labels(
     context: ApplicationContext,
     kbid: str,
nucliadb/export_import/importer.py CHANGED
@@ -17,11 +17,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import AsyncGenerator, Callable, Optional, cast
+from collections.abc import AsyncGenerator, Callable
+from typing import cast
 
 from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import import logger
 from nucliadb.export_import.datamanager import ExportImportDataManager
+from nucliadb.export_import.exceptions import MetadataNotFound
 from nucliadb.export_import.models import (
     ExportedItemType,
     ImportMetadata,
@@ -32,7 +34,6 @@ from nucliadb.export_import.utils import (
     TaskRetryHandler,
     import_binary,
     restore_broker_message,
-    set_entities_groups,
     set_labels,
 )
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -46,7 +47,7 @@ async def import_kb(
     context: ApplicationContext,
     kbid: str,
     stream: AsyncGenerator[bytes, None],
-    metadata: Optional[ImportMetadata] = None,
+    metadata: ImportMetadata | None = None,
 ) -> None:
     """
     Imports exported data from a stream into a knowledgebox.
@@ -72,8 +73,8 @@ async def import_kb(
             await import_binary(context, kbid, cf, binary_generator)
 
         elif item_type == ExportedItemType.ENTITIES:
-            entities = cast(kb_pb2.EntitiesGroups, data)
-            await set_entities_groups(context, kbid, entities)
+            # This is not supported anymore, we ignore it if we find it in and old backup
+            pass
 
         elif item_type == ExportedItemType.LABELS:
             labels = cast(kb_pb2.Labels, data)
@@ -99,7 +100,13 @@ async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTaskMessage
     """
     kbid, import_id = msg.kbid, msg.id
     dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
-    metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    try:
+        metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
+    except MetadataNotFound:  # pragma: no cover
+        logger.error(
+            "Import metadata not found. Skipping import.", extra={"kbid": kbid, "import_id": import_id}
+        )
+        return
 
 
     retry_handler = TaskRetryHandler("import", dm, metadata)
nucliadb/export_import/tasks.py CHANGED
@@ -56,6 +56,7 @@ def get_exports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=export_kb_to_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )
 
 
@@ -77,6 +78,7 @@ def get_imports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
         callback=import_kb_from_blob_storage,
         msg_type=NatsTaskMessage,
         max_concurrent_messages=10,
+        max_retries=100,
     )
 
 