nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/common/cluster/rebalance.py
@@ -18,162 +18,534 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
  import asyncio
+ import dataclasses
  import logging
+ import math
+ import random
+ from typing import cast

+ from grpc import StatusCode
+ from grpc.aio import AioRpcError
  from nidx_protos import nodereader_pb2, noderesources_pb2

  from nucliadb.common import datamanagers, locking
  from nucliadb.common.cluster.utils import get_shard_manager
  from nucliadb.common.context import ApplicationContext
+ from nucliadb.common.maindb.driver import Driver
+ from nucliadb.common.maindb.pg import PGDriver
  from nucliadb.common.nidx import get_nidx_api_client, get_nidx_searcher_client
+ from nucliadb_protos import writer_pb2
  from nucliadb_telemetry import errors
  from nucliadb_telemetry.logs import setup_logging
  from nucliadb_telemetry.utils import setup_telemetry
  from nucliadb_utils.fastapi.run import serve_metrics

  from .settings import settings
- from .utils import delete_resource_from_shard, index_resource_to_shard
+ from .utils import delete_resource_from_shard, index_resource_to_shard, wait_for_nidx

  logger = logging.getLogger(__name__)

  REBALANCE_LOCK = "rebalance"

+ MAX_MOVES_PER_SHARD = 100
+
+
+ @dataclasses.dataclass
+ class RebalanceShard:
+     id: str
+     nidx_id: str
+     paragraphs: int
+     active: bool
+
+     def to_dict(self):
+         return self.__dict__
+
+
+ class Rebalancer:
+     def __init__(self, context: ApplicationContext, kbid: str):
+         self.context = context
+         self.kbid = kbid
+         self.kb_shards: writer_pb2.Shards | None = None
+
+     async def get_rebalance_shards(self, estimate: bool = False) -> list[RebalanceShard]:
+         """
+         Return the sorted list of shards by increasing paragraph count.
+
+         If estimate is True, it will fetch the paragraph count from nidx shard metadata, which is lighter
+         but deletions are not guaranteed to be reflected. Otherwise, it will get the paragraph counts
+         by querying nidx paragraph index for each shard.
+         """
+         result = []
+         self.kb_shards = await datamanagers.atomic.cluster.get_kb_shards(kbid=self.kbid)
+         if self.kb_shards is not None:
+             for idx, shard in enumerate(self.kb_shards.shards):
+                 if estimate:
+                     shard_metadata = await get_shard_metadata(shard.nidx_shard_id)
+                     paragraphs = shard_metadata.paragraphs
+                 else:
+                     paragraphs = await get_shard_paragraph_count(shard.nidx_shard_id)
+                 result.append(
+                     RebalanceShard(
+                         id=shard.shard,
+                         nidx_id=shard.nidx_shard_id,
+                         paragraphs=paragraphs,
+                         active=(idx == self.kb_shards.actual),
+                     )
+                 )
+         return list(sorted(result, key=lambda x: x.paragraphs))
+
+     async def move_paragraphs(
+         self, from_shard: RebalanceShard, to_shard: RebalanceShard, max_paragraphs: int
+     ) -> int:
+         """
+         Takes random resources from the source shard and tries to move at most max_paragraphs.
+         It stops moving paragraphs until the are no more resources to move.
+         """
+         moved_paragraphs = 0
+
+         resources_batch: list[str] = []
+
+         while moved_paragraphs < max_paragraphs:
+             if len(resources_batch) == 0:
+                 resources_batch = await get_resources_from_shard(
+                     self.context.kv_driver, self.kbid, from_shard.id, n=100
+                 )
+                 if len(resources_batch) == 0:
+                     # No more resources to move or shard not found
+                     break
+
+             # Take a random resource to move
+             resource_id = random.choice(resources_batch)
+
+             assert self.kb_shards is not None
+             from_shard_obj = next(s for s in self.kb_shards.shards if s.shard == from_shard.id)
+             to_shard_obj = next(s for s in self.kb_shards.shards if s.shard == to_shard.id)
+             paragraphs_count = await get_resource_paragraphs_count(resource_id, from_shard.nidx_id)
+             moved = await move_resource_to_shard(
+                 self.context, self.kbid, resource_id, from_shard_obj, to_shard_obj
+             )
+             if moved:
+                 resources_batch.remove(resource_id)
+                 moved_paragraphs += paragraphs_count

- async def get_shards_paragraphs(kbid: str) -> list[tuple[str, int]]:
-     """
-     Ordered shard -> num paragraph by number of paragraphs
-     """
-     async with datamanagers.with_ro_transaction() as txn:
-         kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
-         if kb_shards is None:
-             return []
-
-         results = {}
-         for shard_meta in kb_shards.shards:
-             # Rebalance using node as source of truth. But it will rebalance nidx
-             shard_data: nodereader_pb2.Shard = await get_nidx_api_client().GetShard(
-                 nodereader_pb2.GetShardRequest(
-                     shard_id=noderesources_pb2.ShardId(id=shard_meta.nidx_shard_id)
-                 )  # type: ignore
+         return moved_paragraphs
+
+     async def wait_for_indexing(self):
+         try:
+             self.context.nats_manager
+         except AssertionError:  # pragma: no cover
+             logger.warning(f"Nats manager not initialized. Cannot wait for indexing")
+             return
+         while True:
+             try:
+                 await wait_for_nidx(self.context.nats_manager, max_wait_seconds=60, max_pending=1000)
+                 return
+             except asyncio.TimeoutError:
+                 logger.warning("Nidx is behind. Backing off rebalancing.", extra={"kbid": self.kbid})
+                 await asyncio.sleep(30)
+
+     async def required(self) -> bool:
+         """
+         Return true if any shard needs rebalancing.
+         """
+         shards = await self.get_rebalance_shards(estimate=True)
+         return any(needs_split(shard) or needs_merge(shard, shards) for shard in shards)
+
+     async def rebalance_shards(self):
+         """
+         Iterate over shards until none of them need more rebalancing.
+
+         Will move excess of paragraphs to other shards (potentially creating new ones), and
+         merge small shards together when possible (potentially deleting empty ones.)
+
+
+         Merge chooses a <90% filled shard and fills it to almost 100%
+         Split chooses a >110% filled shard and reduces it to 100%
+         If the shard is between 90% and 110% full, nobody touches it
+         """
+         while True:
+             await self.wait_for_indexing()
+             shards = await self.get_rebalance_shards()
+
+             # Any shards to split?
+             shard_to_split = next((s for s in shards[::-1] if needs_split(s)), None)
+             if shard_to_split is not None:
+                 await self.split_shard(shard_to_split, shards)
+                 continue
+
+             # Any shards to merge?
+             shard_to_merge = next((s for s in shards if needs_merge(s, shards)), None)
+             if shard_to_merge is not None:
+                 await self.merge_shard(shard_to_merge, shards)
+             else:
+                 break
+
+     async def split_shard(self, shard_to_split: RebalanceShard, shards: list[RebalanceShard]):
+         logger.info(
+             "Splitting excess of paragraphs to other shards",
+             extra={
+                 "kbid": self.kbid,
+                 "shard": shard_to_split.to_dict(),
+             },
          )
-             results[shard_meta.shard] = shard_data.paragraphs

-     return [(shard, paragraphs) for shard, paragraphs in sorted(results.items(), key=lambda x: x[1])]
+         # First off, calculate if the excess fits in the other shards or we need to add a new shard.
+         # Note that we don't filter out the active shard on purpose.
+         excess = shard_to_split.paragraphs - settings.max_shard_paragraphs
+         other_shards = [s for s in shards if s.id != shard_to_split.id]
+         other_shards_capacity = sum(
+             [max(0, (settings.max_shard_paragraphs - s.paragraphs)) for s in other_shards]
+         )
+         if excess > other_shards_capacity:
+             shards_to_add = math.ceil((excess - other_shards_capacity) / settings.max_shard_paragraphs)
+             logger.info(
+                 "More shards needed",
+                 extra={
+                     "kbid": self.kbid,
+                     "shards_to_add": shards_to_add,
+                     "all_shards": [s.to_dict() for s in shards],
+                 },
+             )
+             # Add new shards where to rebalance the excess of paragraphs
+             async with (
+                 locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=self.kbid)),
+                 datamanagers.with_rw_transaction() as txn,
+             ):
+                 kb_config = await datamanagers.kb.get_config(txn, kbid=self.kbid)
+                 prewarm = kb_config is not None and kb_config.prewarm_enabled
+                 sm = get_shard_manager()
+                 for _ in range(shards_to_add):
+                     await sm.create_shard_by_kbid(txn, self.kbid, prewarm_enabled=prewarm)
+                 await txn.commit()

+             # Recalculate after having created shards, the active shard is a different one
+             shards = await self.get_rebalance_shards()
+
+         # Now, move resources to other shards as long as we are still over the max
+         for _ in range(MAX_MOVES_PER_SHARD):
+             shard_paragraphs = next(s.paragraphs for s in shards if s.id == shard_to_split.id)
+             excess = shard_paragraphs - settings.max_shard_paragraphs
+             if excess <= 0:
+                 logger.info(
+                     "Shard rebalanced successfuly",
+                     extra={"kbid": self.kbid, "shard": shard_to_split.to_dict()},
+                 )
+                 break

- async def maybe_add_shard(kbid: str) -> None:
-     async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
-         async with datamanagers.with_ro_transaction() as txn:
-             kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
-             if kb_shards is None:
-                 return
+             target_shard, target_capacity = get_target_shard(shards, shard_to_split, skip_active=False)
+             if target_shard is None:
+                 logger.warning("No target shard found for splitting", extra={"kbid": self.kbid})
+                 break

-         shard_paragraphs = await get_shards_paragraphs(kbid)
-         total_paragraphs = sum([c for _, c in shard_paragraphs])
+             moved_paragraphs = await self.move_paragraphs(
+                 from_shard=shard_to_split,
+                 to_shard=target_shard,
+                 max_paragraphs=min(excess, target_capacity),
+             )

-         if (total_paragraphs / len(kb_shards.shards)) > (
-             settings.max_shard_paragraphs * 0.9  # 90% of the max
-         ):
-             # create new shard
-             async with datamanagers.with_transaction() as txn:
-                 sm = get_shard_manager()
-                 await sm.create_shard_by_kbid(txn, kbid)
-                 await txn.commit()
+             # Update shard paragraph counts
+             shard_to_split.paragraphs -= moved_paragraphs
+             target_shard.paragraphs += moved_paragraphs
+             shards.sort(key=lambda x: x.paragraphs)

+             await self.wait_for_indexing()

- async def move_set_of_kb_resources(
-     context: ApplicationContext,
-     kbid: str,
-     from_shard_id: str,
-     to_shard_id: str,
-     count: int = 20,
- ) -> None:
-     async with datamanagers.with_ro_transaction() as txn:
-         kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
-         if kb_shards is None:  # pragma: no cover
-             logger.warning("No shards found for kb. This should not happen.", extra={"kbid": kbid})
-             return
+     async def merge_shard(self, shard_to_merge: RebalanceShard, shards: list[RebalanceShard]):
+         logger.info(
+             "Merging shard",
+             extra={
+                 "kbid": self.kbid,
+                 "shard": shard_to_merge.to_dict(),
+             },
+         )
+         empty_shard = False

-     logger.info(
-         "Rebalancing kb shards",
-         extra={"kbid": kbid, "from": from_shard_id, "to": to_shard_id, "count": count},
-     )
+         for _ in range(MAX_MOVES_PER_SHARD):
+             resources_count = await count_resources_in_shard(
+                 self.context.kv_driver, self.kbid, shard_to_merge.id
+             )
+             if resources_count == 0:
+                 logger.info(
+                     "Shard is now empty",
+                     extra={
+                         "kbid": self.kbid,
+                         "shard": shard_to_merge.to_dict(),
+                     },
+                 )
+                 empty_shard = True
+                 break
+
+             logger.info(
+                 "Shard not yet empty",
+                 extra={
+                     "kbid": self.kbid,
+                     "shard": shard_to_merge.to_dict(),
+                     "remaining": resources_count,
+                 },
+             )
+
+             target_shard, target_capacity = get_target_shard(shards, shard_to_merge, skip_active=True)
+             if target_shard is None:
+                 logger.warning(
+                     "No target shard could be found for merging. Moving on",
+                     extra={"kbid": self.kbid, "shard": shard_to_merge.to_dict()},
+                 )
+                 break

-     from_shard = [s for s in kb_shards.shards if s.shard == from_shard_id][0]
-     to_shard = [s for s in kb_shards.shards if s.shard == to_shard_id][0]
+             moved_paragraphs = await self.move_paragraphs(
+                 from_shard=shard_to_merge,
+                 to_shard=target_shard,
+                 max_paragraphs=target_capacity,
+             )
+
+             # Update shard paragraph counts
+             shard_to_merge.paragraphs -= moved_paragraphs
+             target_shard.paragraphs += moved_paragraphs
+             shards.sort(key=lambda x: x.paragraphs)
+
+             await self.wait_for_indexing()
+
+         if empty_shard:
+             # If shard was emptied, delete it
+             async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=self.kbid)):
+                 async with datamanagers.with_rw_transaction() as txn:
+                     kb_shards = await datamanagers.cluster.get_kb_shards(
+                         txn, kbid=self.kbid, for_update=True
+                     )
+                     if kb_shards is not None:
+                         logger.info(
+                             "Deleting empty shard",
+                             extra={
+                                 "kbid": self.kbid,
+                                 "shard_id": shard_to_merge.id,
+                                 "nidx_shard_id": shard_to_merge.nidx_id,
+                             },
+                         )
+
+                         # Delete shards from kb shards in maindb
+                         to_delete, to_delete_idx = next(
+                             (s, idx)
+                             for idx, s in enumerate(kb_shards.shards)
+                             if s.shard == shard_to_merge.id
+                         )
+                         kb_shards.shards.remove(to_delete)
+                         if to_delete_idx <= kb_shards.actual:
+                             # Only decrement the actual pointer if we remove before the pointer.
+                             kb_shards.actual -= 1
+                             assert kb_shards.actual >= 0
+                         await datamanagers.cluster.update_kb_shards(
+                             txn, kbid=self.kbid, shards=kb_shards
+                         )
+                         await txn.commit()
+
+                 # Delete shard from nidx
+                 if to_delete:
+                     await get_nidx_api_client().DeleteShard(
+                         noderesources_pb2.ShardId(id=to_delete.nidx_shard_id)
+                     )
+
+
+ async def get_resources_from_shard(driver: Driver, kbid: str, shard_id: str, n: int) -> list[str]:
+     driver = cast(PGDriver, driver)
+     async with driver._get_connection() as conn:
+         cur = conn.cursor("")
+         await cur.execute(
+             """
+             SELECT split_part(key, '/', 5) FROM resources WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$' AND key ~ %s AND value = %s LIMIT %s;
+             """,
+             (f"/kbs/{kbid}/r/[^/]*/shard$", shard_id, n),
+         )
+         records = await cur.fetchall()
+         rids: list[str] = [r[0] for r in records]
+         return rids
+
+
+ async def get_resource_paragraphs_count(resource_id: str, nidx_shard_id: str) -> int:
+     # Do a search on the fields (paragraph) index and return the number of paragraphs this resource has
+     try:
+         request = nodereader_pb2.SearchRequest(
+             shard=nidx_shard_id,
+             paragraph=True,
+             document=False,
+             result_per_page=0,
+             field_filter=nodereader_pb2.FilterExpression(
+                 resource=nodereader_pb2.FilterExpression.ResourceFilter(resource_id=resource_id)
+             ),
+         )
+         search_response: nodereader_pb2.SearchResponse = await get_nidx_searcher_client().Search(request)
+         return search_response.paragraph.total
+     except AioRpcError as exc:  # pragma: no cover
+         if exc.code() == StatusCode.NOT_FOUND:
+             logger.warning(f"Shard not found in nidx", extra={"nidx_shard_id": nidx_shard_id})
+             return 0
+         raise

-     request = nodereader_pb2.SearchRequest(
-         shard=from_shard.nidx_shard_id,
-         paragraph=False,
-         document=True,
-         result_per_page=count,
+
+ def get_target_shard(
+     shards: list[RebalanceShard], rebalanced_shard: RebalanceShard, skip_active: bool = True
+ ) -> tuple[RebalanceShard | None, int]:
+     """
+     Return the biggest shard with capacity (< 90% of the max paragraphs per shard).
+     """
+     target_shard = next(
+         reversed(
+             [
+                 s
+                 for s in shards
+                 if s.id != rebalanced_shard.id
+                 and s.paragraphs < settings.max_shard_paragraphs * 0.9
+                 and (not skip_active or (skip_active and not s.active))
+             ]
+         ),
+         None,
      )
-     request.field_filter.field.field_type = "a"
-     request.field_filter.field.field_id = "title"
-     search_response: nodereader_pb2.SearchResponse = await get_nidx_searcher_client().Search(request)
+     if target_shard is None:  # pragma: no cover
+         return None, 0
+
+     # Aim to fill target shards up to 100% of max
+     capacity = int(max(0, settings.max_shard_paragraphs - target_shard.paragraphs))
+     return target_shard, capacity
+
+
+ async def count_resources_in_shard(driver: Driver, kbid: str, shard_id: str) -> int:
+     driver = cast(PGDriver, driver)
+     async with driver._get_connection() as conn:
+         cur = conn.cursor("")
+         await cur.execute(
+             """
+             SELECT COUNT(*) FROM resources WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$' AND key ~ %s AND value = %s;
+             """,
+             (f"/kbs/{kbid}/r/[^/]*/shard$", shard_id),
+         )
+         record = await cur.fetchone()
+         if record is None:  # pragma: no cover
+             return 0
+         return record[0]
+
+
+ async def get_shard_paragraph_count(nidx_shard_id: str) -> int:
+     # Do a search on the fields (paragraph) index
+     try:
+         request = nodereader_pb2.SearchRequest(
+             shard=nidx_shard_id,
+             paragraph=True,
+             document=False,
+             result_per_page=0,
+         )
+         search_response: nodereader_pb2.SearchResponse = await get_nidx_searcher_client().Search(request)
+         return search_response.paragraph.total
+     except AioRpcError as exc:  # pragma: no cover
+         if exc.code() == StatusCode.NOT_FOUND:
+             logger.warning(f"Shard not found in nidx", extra={"nidx_shard_id": nidx_shard_id})
+             return 0
+         raise
+
+
+ async def get_shard_metadata(nidx_shard_id: str) -> nodereader_pb2.Shard:
+     try:
+         shard_metadata: nodereader_pb2.Shard = await get_nidx_api_client().GetShard(
+             nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=nidx_shard_id))
+         )
+         return shard_metadata
+     except AioRpcError as exc:  # pragma: no cover
+         if exc.code() == StatusCode.NOT_FOUND:
+             logger.warning(f"Shard not found in nidx", extra={"nidx_shard_id": nidx_shard_id})
+             return nodereader_pb2.Shard()
+         raise
+

-     for result in search_response.document.results:
-         resource_id = result.uuid
+ async def move_resource_to_shard(
+     context: ApplicationContext,
+     kbid: str,
+     resource_id: str,
+     from_shard: writer_pb2.ShardObject,
+     to_shard: writer_pb2.ShardObject,
+ ) -> bool:
+     indexed_to_new = False
+     deleted_from_old = False
+     try:
+         async with (
+             datamanagers.with_transaction() as txn,
+             locking.distributed_lock(
+                 locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=resource_id)
+             ),
+         ):
+             found_shard_id = await datamanagers.resources.get_resource_shard_id(
+                 txn, kbid=kbid, rid=resource_id, for_update=True
+             )
+             if found_shard_id is None:  # pragma: no cover
+                 # resource deleted
+                 return False
+             if found_shard_id != from_shard.shard:  # pragma: no cover
+                 # resource could have already been moved
+                 return False
+
+             await datamanagers.resources.set_resource_shard_id(
+                 txn, kbid=kbid, rid=resource_id, shard=to_shard.shard
+             )
+             await index_resource_to_shard(context, kbid, resource_id, to_shard)
+             indexed_to_new = True
+             await delete_resource_from_shard(context, kbid, resource_id, from_shard)
+             deleted_from_old = True
+             await txn.commit()
+         return True
+     except Exception:
+         logger.exception(
+             "Failed to move resource",
+             extra={"kbid": kbid, "resource_id": resource_id},
+         )
+         # XXX Not ideal failure situation here. Try reverting the whole move even though it could be redundant
          try:
-             async with (
-                 datamanagers.with_transaction() as txn,
-                 locking.distributed_lock(
-                     locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=resource_id)
-                 ),
-             ):
-                 found_shard_id = await datamanagers.resources.get_resource_shard_id(
-                     txn, kbid=kbid, rid=resource_id, for_update=True
-                 )
-                 if found_shard_id is None:
-                     # resource deleted
-                     continue
-                 if found_shard_id != from_shard_id:
-                     # resource could have already been moved
-                     continue
-
-                 await datamanagers.resources.set_resource_shard_id(
-                     txn, kbid=kbid, rid=resource_id, shard=to_shard_id
-                 )
-                 await index_resource_to_shard(context, kbid, resource_id, to_shard)
-                 await delete_resource_from_shard(context, kbid, resource_id, from_shard)
-                 await txn.commit()
+             if indexed_to_new:
+                 await delete_resource_from_shard(context, kbid, resource_id, to_shard)
+             if deleted_from_old:
+                 await index_resource_to_shard(context, kbid, resource_id, from_shard)
          except Exception:
              logger.exception(
-                 "Failed to move resource",
+                 "Failed to revert move resource. Hopefully you never see this message.",
                  extra={"kbid": kbid, "resource_id": resource_id},
              )
-             # XXX Not ideal failure situation here. Try reverting the whole move even though it could be redundant
-             try:
-                 await index_resource_to_shard(context, kbid, resource_id, from_shard)
-                 await delete_resource_from_shard(context, kbid, resource_id, to_shard)
-             except Exception:
-                 logger.exception(
-                     "Failed to revert move resource. Hopefully you never see this message.",
-                     extra={"kbid": kbid, "resource_id": resource_id},
-                 )
+         return False


- async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
-     await maybe_add_shard(kbid)
512
+ """
513
+ Return true if the shard is more than 110% of the max.
514
+
515
+ Active shards are not considered for splitting: the shard creator subscriber will
516
+ eventually create a new shard, make it the active one and the previous one, if
517
+ too full, will be split.
518
+ """
519
+ return not shard.active and (shard.paragraphs > (settings.max_shard_paragraphs * 1.1))
158
520
 
159
- shard_paragraphs = await get_shards_paragraphs(kbid)
160
- rebalanced_shards = set()
161
- while any(paragraphs > settings.max_shard_paragraphs for _, paragraphs in shard_paragraphs):
162
- # find the shard with the least/most paragraphs
163
- smallest_shard = shard_paragraphs[0][0]
164
- largest_shard = shard_paragraphs[-1][0]
165
- assert smallest_shard != largest_shard
166
521
 
167
- if smallest_shard in rebalanced_shards:
168
- # XXX This is to prevent flapping data between shards on a single pass
169
- # if we already rebalanced this shard, then we can't do anything else
170
- break
522
+ def needs_merge(shard: RebalanceShard, all_shards: list[RebalanceShard]) -> bool:
523
+ """
524
+ Returns true if a shard is less 75% full and there is enough capacity on the other shards to fit it.
171
525
 
172
- await move_set_of_kb_resources(context, kbid, largest_shard, smallest_shard)
526
+ Active shards are not considered for merging. Shards that are more than 75% full are also skipped.
527
+ """
528
+ if shard.active:
529
+ return False
530
+ if shard.paragraphs > (settings.max_shard_paragraphs * 0.75):
531
+ return False
532
+ other_shards = [s for s in all_shards if s.id != shard.id and not s.active]
533
+ other_shards_capacity = sum(
534
+ [max(0, ((settings.max_shard_paragraphs * 0.9) - s.paragraphs)) for s in other_shards]
535
+ )
536
+ return shard.paragraphs < other_shards_capacity
173
537
 
174
- rebalanced_shards.add(largest_shard)
175
538
 
176
- shard_paragraphs = await get_shards_paragraphs(kbid)
539
+ async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
540
+ rebalancer = Rebalancer(context, kbid)
541
+ try:
542
+ logger.info("Starting rebalance for kb", extra={"kbid": kbid})
543
+ if await rebalancer.required():
544
+ await rebalancer.rebalance_shards()
545
+ logger.info("Finished rebalance for kb", extra={"kbid": kbid})
546
+ except Exception as err:
547
+ logger.exception("Rebalance finished with error", extra={"kbid": kbid})
548
+ errors.capture_exception(err)
177
549
 
178
550
 
179
551
  async def run(context: ApplicationContext) -> None:
@@ -182,7 +554,7 @@ async def run(context: ApplicationContext) -> None:
      # get all kb ids
      async with datamanagers.with_ro_transaction() as txn:
          kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
-     # go through each kb and see if shards need to be reduced in size
+     # go through each kb and see if shards need to be rebalanced
      for kbid in kbids:
          async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
              await rebalance_kb(context, kbid)
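
For readers skimming the diff, the following is a minimal, standalone sketch of the split/merge thresholds that the new rebalancer's docstrings describe (needs_split over 110% of the limit, needs_merge under 75% with spare capacity elsewhere). The Shard class, MAX_SHARD_PARAGRAPHS value, and sample shard counts below are made up for illustration; in the package the limit comes from settings.max_shard_paragraphs and the real checks live in the code above.

    from dataclasses import dataclass

    MAX_SHARD_PARAGRAPHS = 1_000_000  # stand-in for settings.max_shard_paragraphs


    @dataclass
    class Shard:
        id: str
        paragraphs: int
        active: bool


    def needs_split(shard: Shard) -> bool:
        # Shards over 110% of the limit get split; the active shard is left alone.
        return not shard.active and shard.paragraphs > MAX_SHARD_PARAGRAPHS * 1.1


    def needs_merge(shard: Shard, all_shards: list[Shard]) -> bool:
        # Shards under 75% of the limit are merged away, but only when the other
        # non-active shards have spare capacity (counted up to 90% of the limit).
        if shard.active or shard.paragraphs > MAX_SHARD_PARAGRAPHS * 0.75:
            return False
        others = [s for s in all_shards if s.id != shard.id and not s.active]
        capacity = sum(max(0, MAX_SHARD_PARAGRAPHS * 0.9 - s.paragraphs) for s in others)
        return shard.paragraphs < capacity


    shards = [
        Shard("a", 1_200_000, active=False),  # over 110% of the limit -> split
        Shard("b", 100_000, active=False),    # under 75% and fits elsewhere -> merge
        Shard("d", 300_000, active=False),    # also under 75% -> merge candidate
        Shard("c", 600_000, active=True),     # active shard -> never touched
    ]
    for s in shards:
        action = "split" if needs_split(s) else "merge" if needs_merge(s, shards) else "leave"
        print(s.id, action)

Shards that land between the two thresholds are left untouched, which matches the "nobody touches it" note in the rebalance_shards docstring.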