nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

migrations/0023_backfill_pg_catalog.py CHANGED
@@ -27,11 +27,11 @@ Backfill the data into the PG catalog
 import logging
 from typing import cast
 
-from nucliadb.common import datamanagers
 from nucliadb.common.catalog import catalog_update, get_catalog
 from nucliadb.common.catalog.pg import PGCatalog
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.ingest.orm.index_message import get_resource_index_message
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb.migrator.context import ExecutionContext
 
 logger = logging.getLogger(__name__)
@@ -73,7 +73,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     # Index each resource
     for rid in resources_to_index:
         rid = str(rid).replace("-", "")
-        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
         if resource is None:
             logger.warning(f"Could not load resource {rid} for kbid {kbid}")
             continue

migrations/0029_backfill_field_status.py CHANGED
@@ -24,7 +24,6 @@ Backfill field status (from error)
 """
 
 import logging
-from typing import Optional
 
 from nucliadb.migrator.context import ExecutionContext
 from nucliadb_protos import resources_pb2, writer_pb2
@@ -33,7 +32,7 @@ logger = logging.getLogger(__name__)
 
 
 async def migrate(context: ExecutionContext) -> None:
-    start: Optional[str] = ""
+    start: str | None = ""
     while True:
         if start is None:
             break
@@ -43,7 +42,7 @@ async def migrate(context: ExecutionContext) -> None:
 async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
 
 
-async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
+async def do_batch(context: ExecutionContext, start: str) -> str | None:
     logger.info(f"Running batch from {start}")
     async with context.kv_driver.rw_transaction() as txn:
         async with txn.connection.cursor() as cur:  # type: ignore
@@ -64,7 +63,7 @@ async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
             field_keys = [r[0] for r in records]
 
             # Retrieve resources basic (to check status)
-            resource_keys = set(["/".join(f.split("/")[:5]) for f in field_keys])
+            resource_keys = {"/".join(f.split("/")[:5]) for f in field_keys}
             await cur.execute(
                 """
                 SELECT key, value FROM resources
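
A note on the typing changes that recur throughout this diff: Optional[X] and Union[X, Y] annotations are being replaced with the PEP 604 union syntax available since Python 3.10. The two spellings denote equal types at runtime; a minimal illustration (function names here are hypothetical, for demonstration only):

import typing

def legacy(start: typing.Optional[str]) -> typing.Optional[str]:
    # pre-3.10 spelling, requires the typing import
    return start

def modern(start: str | None) -> str | None:
    # PEP 604 spelling, no import needed
    return start

assert (str | None) == typing.Optional[str]  # both normalize to the same union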

migrations/0032_remove_old_relations.py CHANGED
@@ -26,7 +26,6 @@ is stored in object storage.
 """
 
 import logging
-from typing import Optional
 
 from nucliadb.migrator.context import ExecutionContext
 
@@ -34,7 +33,7 @@ logger = logging.getLogger(__name__)
 
 
 async def migrate(context: ExecutionContext) -> None:
-    start: Optional[str] = ""
+    start: str | None = ""
     while True:
         if start is None:
             break
@@ -45,7 +44,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     pass
 
 
-async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
+async def do_batch(context: ExecutionContext, start: str) -> str | None:
     logger.info(f"Running batch from {start}")
     async with context.kv_driver.rw_transaction() as txn:
         async with txn.connection.cursor() as cur:  # type: ignore

migrations/0038_backfill_catalog_field_labels.py CHANGED
@@ -27,11 +27,11 @@ Backfill the catalog with labels from fields metadata
 import logging
 from typing import cast
 
-from nucliadb.common import datamanagers
 from nucliadb.common.catalog import catalog_update, get_catalog
 from nucliadb.common.catalog.pg import PGCatalog
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.ingest.orm.index_message import get_resource_index_message
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb.migrator.context import ExecutionContext
 from nucliadb_protos import resources_pb2
 
@@ -82,7 +82,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     # Index each resource
     for key in to_index:
         rid = key.split("/")[4]
-        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
         if resource is None:
             logger.warning(f"Could not load resource {rid} for kbid {kbid}")
             continue

migrations/0039_backfill_converation_splits_metadata.py CHANGED
@@ -36,7 +36,7 @@ from nucliadb.ingest.fields.conversation import (
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
 from nucliadb.migrator.context import ExecutionContext
 from nucliadb_protos import resources_pb2
-from nucliadb_protos.resources_pb2 import SplitMetadata, SplitsMetadata
+from nucliadb_protos.resources_pb2 import SplitsMetadata
 from nucliadb_utils.storages.storage import Storage
 
 logger = logging.getLogger(__name__)
@@ -102,5 +102,5 @@ async def build_splits_metadata(
         if page is None:
             continue
         for message in page.messages:
-            splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
+            splits_metadata.metadata.get_or_create(message.ident)
     return splits_metadata
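
The setdefault-to-get_or_create change above reflects how protobuf message-map fields behave in Python: assigning a constructed submessage into the map (which the inherited MutableMapping.setdefault does internally) is rejected, while get_or_create() inserts a default-initialized entry and returns it. A hedged sketch, assuming the metadata field is a map whose values are SplitMetadata messages:

from nucliadb_protos.resources_pb2 import SplitsMetadata

splits_metadata = SplitsMetadata()
# Inserts a default entry under the key if absent, then returns it.
entry = splits_metadata.metadata.get_or_create("message-ident")
# The returned submessage is live: mutating it mutates the map entry.
assert "message-ident" in splits_metadata.metadata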

migrations/0041_reindex_conversations.py ADDED
@@ -0,0 +1,137 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import logging
+import uuid
+from collections.abc import AsyncIterator
+from typing import cast
+
+from nucliadb.common import datamanagers
+from nucliadb.common.maindb.pg import PGTransaction
+from nucliadb.ingest.orm.index_message import get_resource_index_message
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.migrator.context import ExecutionContext
+from nucliadb_protos.writer_pb2 import ShardObject, Shards
+
+logger = logging.getLogger(__name__)
+
+
+async def migrate(context: ExecutionContext) -> None: ...
+
+
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
+    """
+    Reindex resources that have conversation fields
+    """
+    kb_shards = await datamanagers.atomic.cluster.get_kb_shards(kbid=kbid, for_update=False)
+    if kb_shards is not None:
+        async for rid in iter_affected_resource_ids(context, kbid):
+            await reindex_resource(context, kbid, rid, kb_shards)
+    else:
+        logger.warning(
+            "Migration 41: KB shards not found, skipping reindexing",
+            extra={"kbid": kbid},
+        )
+
+
+async def reindex_resource(
+    context: ExecutionContext,
+    kbid: str,
+    rid: str,
+    kb_shards: Shards,
+) -> None:
+    """
+    Reindex a single resource
+    """
+    async with datamanagers.with_ro_transaction() as rs_txn:
+        # Fetch the resource
+        resource = await Resource.get(rs_txn, kbid=kbid, rid=rid)
+        if resource is None:
+            logger.warning(
+                "Migration 41: Resource not found, skipping reindexing",
+                extra={"kbid": kbid, "rid": rid},
+            )
+            return
+
+        # Get the shard for the resource
+        shard: ShardObject | None = None
+        shard_id = await datamanagers.resources.get_resource_shard_id(
+            rs_txn, kbid=kbid, rid=rid, for_update=False
+        )
+        if shard_id is not None:
+            shard = next((shard for shard in kb_shards.shards if shard.shard == shard_id), None)
+        if shard is None:
+            logger.warning(
+                "Migration 41: Shard not found for resource, skipping reindexing",
+                extra={"kbid": kbid, "rid": rid, "shard_id": shard_id},
+            )
+            return
+
+        # Create the index message and reindex the resource
+        index_message = await get_resource_index_message(resource, reindex=True)
+        await context.shard_manager.add_resource(
+            shard,
+            index_message,
+            0,
+            partition="0",
+            kb=kbid,
+            reindex_id=uuid.uuid4().hex,
+        )
+        logger.info(
+            "Migration 41: Resource reindexed",
+            extra={"kbid": kbid, "rid": rid},
+        )
+
+
+async def iter_affected_resource_ids(context: ExecutionContext, kbid: str) -> AsyncIterator[str]:
+    start = ""
+    while True:
+        keys_batch = await get_batch(context, kbid, start)
+        if keys_batch is None:
+            break
+        start = keys_batch[-1]
+        for key in keys_batch:
+            # The keys have the format /kbs/{kbid}/r/{rid}/f/c/{field_id}
+            rid = key.split("/")[4]
+            yield rid
+
+
+async def get_batch(context: ExecutionContext, kbid: str, start: str) -> list[str] | None:
+    """
+    Get a batch of resource keys that hold conversation fields for the given KB.
+    Starting after the given start key.
+    Returns None if no more keys are found.
+    """
+    batch_size = 100
+    async with context.kv_driver.rw_transaction() as txn:
+        txn = cast(PGTransaction, txn)
+        async with txn.connection.cursor() as cur:
+            await cur.execute(
+                """
+                SELECT key FROM resources
+                WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
+                AND key > %s
+                ORDER BY key
+                LIMIT %s""",
+                (kbid, start, batch_size),
+            )
+            rows = await cur.fetchall()
+            if len(rows) == 0:
+                return None
+            return [row[0] for row in rows]
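
The get_batch query above implements keyset pagination: each batch resumes strictly after the last key returned (key > %s with ORDER BY key), so no OFFSET scans over the keyspace are needed, and the POSIX regex narrows rows to conversation fields only. As an illustration of what that pattern matches (the example keys are made up):

import re

# Python equivalent of the SQL pattern, for a hypothetical kbid "kb1"
pattern = re.compile(r"^/kbs/kb1/r/[^/]*/f/c/[^/]*$")

assert pattern.match("/kbs/kb1/r/0a1b2c/f/c/chat")           # conversation field
assert not pattern.match("/kbs/kb1/r/0a1b2c/f/t/body")       # text field, skipped
assert not pattern.match("/kbs/kb1/r/0a1b2c/f/c/chat/more")  # deeper key, skipped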

migrations/pg/0010_shards_index.py ADDED
@@ -0,0 +1,34 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from nucliadb.common.maindb.pg import PGTransaction
+
+
+async def migrate(txn: PGTransaction) -> None:
+    # Concurrent index must be created outside of a transaction but psycopg automatically
+    # creates transactions. We temporarily disable this for building indexes.
+    await txn.connection.commit()
+    try:
+        await txn.connection.set_autocommit(True)
+        await txn.connection.execute(
+            "CREATE INDEX CONCURRENTLY ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
+        )
+    finally:
+        await txn.connection.set_autocommit(False)
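
PostgreSQL refuses to run CREATE INDEX CONCURRENTLY inside a transaction block, and psycopg 3 opens one implicitly on the first statement; that is why this migration commits and toggles autocommit around the index build. The same pattern on a bare connection, as a minimal sketch (the DSN and index name are placeholders):

import asyncio
import psycopg

async def build_index(dsn: str) -> None:
    # autocommit=True makes each execute() run outside a transaction block,
    # which CREATE INDEX CONCURRENTLY requires.
    async with await psycopg.AsyncConnection.connect(dsn, autocommit=True) as conn:
        await conn.execute(
            "CREATE INDEX CONCURRENTLY IF NOT EXISTS resources_key_idx ON resources (key)"
        )

asyncio.run(build_index("postgresql://localhost/nucliadb"))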

nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py RENAMED
@@ -18,11 +18,10 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from typing import Optional
+from nucliadb.common.maindb.pg import PGTransaction
 
-from nucliadb.common import datamanagers
 
-
-async def get_resource_uuid_by_slug(kbid: str, slug: str) -> Optional[str]:
-    async with datamanagers.with_ro_transaction() as txn:
-        return await datamanagers.resources.get_resource_uuid_from_slug(txn, kbid=kbid, slug=slug)
+async def migrate(txn: PGTransaction) -> None:
+    async with txn.connection.cursor() as cur:
+        await cur.execute("CREATE STATISTICS catalog_kbid_labels ON kbid, labels FROM catalog;")
+        await cur.execute("ANALYZE catalog;")

migrations/pg/0012_catalog_statistics_undo.py ADDED
@@ -0,0 +1,26 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from nucliadb.common.maindb.pg import PGTransaction
+
+
+async def migrate(txn: PGTransaction) -> None:
+    async with txn.connection.cursor() as cur:
+        await cur.execute("DROP STATISTICS catalog_kbid_labels;")

nucliadb/backups/create.py CHANGED
@@ -21,8 +21,8 @@ import asyncio
 import json
 import logging
 import tarfile
+from collections.abc import AsyncIterator
 from datetime import datetime, timezone
-from typing import AsyncIterator, Optional
 
 from nucliadb.backups.const import (
     BackupFinishedStream,
@@ -37,7 +37,6 @@ from nucliadb.export_import.utils import (
     download_binary,
     get_broker_message,
     get_cloud_files,
-    get_entities,
     get_labels,
     get_search_configurations,
     get_synonyms,
@@ -76,7 +75,6 @@ async def backup_kb(context: ApplicationContext, kbid: str, backup_id: str):
     """
     await backup_resources(context, kbid, backup_id)
     await backup_labels(context, kbid, backup_id)
-    await backup_entities(context, kbid, backup_id)
     await backup_synonyms(context, kbid, backup_id)
     await backup_search_configurations(context, kbid, backup_id)
     await notify_backup_completed(context, kbid, backup_id)
@@ -235,15 +233,6 @@ async def backup_labels(context: ApplicationContext, kbid: str, backup_id: str):
     )
 
 
-async def backup_entities(context: ApplicationContext, kbid: str, backup_id: str):
-    entities = await get_entities(context, kbid)
-    await context.blob_storage.upload_object(
-        bucket=settings.backups_bucket,
-        key=StorageKeys.ENTITIES.format(backup_id=backup_id),
-        data=entities.SerializeToString(),
-    )
-
-
 async def backup_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
     synonyms = await get_synonyms(context, kbid)
     await context.blob_storage.upload_object(
@@ -266,9 +255,7 @@ async def backup_search_configurations(context: ApplicationContext, kbid: str, b
     )
 
 
-async def get_metadata(
-    context: ApplicationContext, kbid: str, backup_id: str
-) -> Optional[BackupMetadata]:
+async def get_metadata(context: ApplicationContext, kbid: str, backup_id: str) -> BackupMetadata | None:
     async with context.kv_driver.ro_transaction() as txn:
         metadata_raw = await txn.get(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
         if metadata_raw is None:

nucliadb/backups/restore.py CHANGED
@@ -24,7 +24,8 @@ import functools
 import json
 import logging
 import tarfile
-from typing import Any, AsyncIterator, Callable, Optional, Union
+from collections.abc import AsyncIterator, Callable
+from typing import Any
 
 from pydantic import TypeAdapter
 
@@ -35,7 +36,6 @@ from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import.utils import (
     import_binary,
     restore_broker_message,
-    set_entities_groups,
     set_labels,
     set_search_configurations,
     set_synonyms,
@@ -74,7 +74,6 @@ async def restore_kb(context: ApplicationContext, kbid: str, backup_id: str):
     """
     await restore_resources(context, kbid, backup_id)
     await restore_labels(context, kbid, backup_id)
-    await restore_entities(context, kbid, backup_id)
     await restore_synonyms(context, kbid, backup_id)
     await restore_search_configurations(context, kbid, backup_id)
     await delete_last_restored(context, kbid, backup_id)
@@ -101,7 +100,7 @@ async def restore_resources(context: ApplicationContext, kbid: str, backup_id: s
     await set_last_restored(context, kbid, backup_id, key)
 
 
-async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> Optional[str]:
+async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> str | None:
     key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
     async with context.kv_driver.ro_transaction() as txn:
         raw = await txn.get(key)
@@ -191,7 +190,7 @@ class ResourceBackupReader:
         data = await self.read(tarinfo_size + padding_bytes)
         return data[:tarinfo_size]
 
-    async def read_item(self) -> Union[BrokerMessage, CloudFile, CloudFileBinary]:
+    async def read_item(self) -> BrokerMessage | CloudFile | CloudFileBinary:
         tarinfo = await self.read_tarinfo()
         if tarinfo.name.startswith("broker-message"):
             raw_bm = await self.read_data(tarinfo)
@@ -257,16 +256,6 @@ async def restore_labels(context: ApplicationContext, kbid: str, backup_id: str)
     await set_labels(context, kbid, labels)
 
 
-async def restore_entities(context: ApplicationContext, kbid: str, backup_id: str):
-    raw = await context.blob_storage.downloadbytes(
-        bucket=settings.backups_bucket,
-        key=StorageKeys.ENTITIES.format(backup_id=backup_id),
-    )
-    entities = kb_pb2.EntitiesGroups()
-    entities.ParseFromString(raw.getvalue())
-    await set_entities_groups(context, kbid, entities)
-
-
 async def restore_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
     raw = await context.blob_storage.downloadbytes(
         bucket=settings.backups_bucket,
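
The read_data logic in ResourceBackupReader above follows the tar format: member data is zero-padded to the next 512-byte block boundary, so the reader consumes payload plus padding but returns only the payload. The padding arithmetic, as a small self-contained sketch:

TAR_BLOCK = 512

def padded_size(payload_size: int) -> int:
    # tar rounds each member's data up to a whole number of 512-byte blocks
    return payload_size + (-payload_size) % TAR_BLOCK

assert padded_size(0) == 0
assert padded_size(1) == 512
assert padded_size(512) == 512
assert padded_size(513) == 1024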

nucliadb/backups/tasks.py CHANGED
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Awaitable, Callable
+from collections.abc import Awaitable, Callable
 
 from nucliadb.backups.const import BackupsNatsConfig
 from nucliadb.backups.create import backup_kb_task
@@ -38,6 +38,7 @@ def creator_consumer() -> NatsTaskConsumer[CreateBackupRequest]:
         callback=backup_kb_task,
         msg_type=CreateBackupRequest,
         max_concurrent_messages=10,
+        max_retries=100,
     )
     return consumer
 
@@ -64,6 +65,7 @@ def restorer_consumer() -> NatsTaskConsumer[RestoreBackupRequest]:
         callback=restore_kb_task,
         msg_type=RestoreBackupRequest,
         max_concurrent_messages=10,
+        max_retries=100,
     )
     return consumer
 
@@ -90,6 +92,7 @@ def deleter_consumer() -> NatsTaskConsumer[DeleteBackupRequest]:
         callback=delete_backup_task,
         msg_type=DeleteBackupRequest,
         max_concurrent_messages=2,
+        max_retries=100,
     )
     return consumer
 

nucliadb/common/back_pressure/cache.py CHANGED
@@ -21,7 +21,6 @@ import contextlib
 import logging
 import threading
 from datetime import datetime, timezone
-from typing import Optional
 
 from cachetools import TTLCache
 
@@ -47,7 +46,7 @@ class BackPressureCache:
         self._cache = TTLCache(maxsize=1024, ttl=5 * 60)
         self._lock = threading.Lock()
 
-    def get(self, key: str) -> Optional[BackPressureData]:
+    def get(self, key: str) -> BackPressureData | None:
         with self._lock:
             data = self._cache.get(key, None)
             if data is None:
@@ -72,7 +71,7 @@ def cached_back_pressure(cache_key: str):
     Context manager that handles the caching of the try again in time so that
     we don't recompute try again times if we have already applied back pressure.
     """
-    data: Optional[BackPressureData] = _cache.get(cache_key)
+    data: BackPressureData | None = _cache.get(cache_key)
     if data is not None:
         back_pressure_type = data.type
         RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})
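
BackPressureCache pairs cachetools.TTLCache with a lock because TTLCache itself is not thread-safe; expired entries simply vanish on lookup, so a miss means the try-again time must be recomputed. The core pattern, as a minimal standalone sketch:

import threading
from cachetools import TTLCache

class GuardedTTLCache:
    def __init__(self) -> None:
        # Entries expire 5 minutes after insertion, matching the diff above.
        self._cache: TTLCache = TTLCache(maxsize=1024, ttl=5 * 60)
        self._lock = threading.Lock()

    def get(self, key: str):
        with self._lock:
            return self._cache.get(key, None)

    def set(self, key: str, value) -> None:
        with self._lock:
            self._cache[key] = value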

nucliadb/common/back_pressure/materializer.py CHANGED
@@ -20,7 +20,6 @@
 import asyncio
 import logging
 import threading
-from typing import Optional
 
 from cachetools import TTLCache
 from fastapi import HTTPException
@@ -118,12 +117,6 @@ class BackPressureMaterializer:
                 extra={"kbid": kbid},
             )
            return 0
-
-        if pending > 0:
-            logger.info(
-                f"Processing returned {pending} pending messages for KB",
-                extra={"kbid": kbid},
-            )
        self.processing_pending_cache[kbid] = pending
        return pending
 
@@ -184,7 +177,7 @@ class BackPressureMaterializer:
            pending=pending,
            max_wait=settings.max_wait_time,
        )
-        data = BackPressureData(type="indexing", try_after=try_after)
+        data = BackPressureData(type="indexing", try_after=try_after, pending=pending)
        raise BackPressureException(data)
 
    def check_ingest(self):
@@ -199,7 +192,7 @@ class BackPressureMaterializer:
            pending=ingest_pending,
            max_wait=settings.max_wait_time,
        )
-        data = BackPressureData(type="ingest", try_after=try_after)
+        data = BackPressureData(type="ingest", try_after=try_after, pending=ingest_pending)
        raise BackPressureException(data)
 
    async def check_processing(self, kbid: str):
@@ -215,11 +208,11 @@ class BackPressureMaterializer:
            pending=kb_pending,
            max_wait=settings.max_wait_time,
        )
-        data = BackPressureData(type="processing", try_after=try_after)
+        data = BackPressureData(type="processing", try_after=try_after, pending=kb_pending)
        raise BackPressureException(data)
 
 
-MATERIALIZER: Optional[BackPressureMaterializer] = None
+MATERIALIZER: BackPressureMaterializer | None = None
 materializer_lock = threading.Lock()
 
 
@@ -268,7 +261,7 @@ def get_materializer() -> BackPressureMaterializer:
     return MATERIALIZER
 
 
-async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) -> None:
+async def maybe_back_pressure(kbid: str, resource_uuid: str | None = None) -> None:
     """
     This function does system checks to see if we need to put back pressure on writes.
     In that case, a HTTP 429 will be raised with the estimated time to try again.
@@ -278,7 +271,7 @@ async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) ->
     await back_pressure_checks(kbid, resource_uuid)
 
 
-async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
+async def back_pressure_checks(kbid: str, resource_uuid: str | None = None):
     """
     Will raise a 429 if back pressure is needed:
     - If the processing engine is behind.
@@ -299,6 +292,7 @@ async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
                 "resource_uuid": resource_uuid,
                 "try_after": exc.data.try_after,
                 "back_pressure_type": exc.data.type,
+                "pending": exc.data.pending,
             },
         )
        raise HTTPException(

nucliadb/common/back_pressure/settings.py CHANGED
@@ -29,30 +29,30 @@ class BackPressureSettings(BaseSettings):
     )
     indexing_rate: float = Field(
         default=10,
-        description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",
     )
     ingest_rate: float = Field(
         default=4,
-        description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",
     )
     processing_rate: float = Field(
         default=1,
-        description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",
     )
     max_indexing_pending: int = Field(
         default=1000,
-        description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",  # noqa
+        description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",
         alias="back_pressure_max_indexing_pending",
     )
     max_ingest_pending: int = Field(
         # Disabled by default
         default=0,
-        description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",  # noqa
+        description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",
         alias="back_pressure_max_ingest_pending",
     )
     max_processing_pending: int = Field(
         default=1000,
-        description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",  # noqa
+        description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",
         alias="back_pressure_max_processing_pending",
     )
     indexing_check_interval: int = Field(
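
These rate and max-wait settings feed the try-after computation visible in the materializer hunks above, where a helper is called with pending= and max_wait= keyword arguments and its result becomes BackPressureData.try_after. That helper's name and body are not shown in this diff, so the following reconstruction is purely an assumption:

from datetime import datetime, timedelta, timezone

def estimate_try_after(rate: float, pending: int, max_wait: float) -> datetime:
    # Assumed behavior: wait pending/rate seconds, capped at max_wait.
    delay = min(pending / rate, max_wait)
    return datetime.now(timezone.utc) + timedelta(seconds=delay)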

nucliadb/common/back_pressure/utils.py CHANGED
@@ -28,6 +28,7 @@ from nucliadb_utils.nats import NatsConnectionManager
 class BackPressureData:
     type: str
     try_after: datetime
+    pending: int = 0
 
 
 class BackPressureException(Exception):