PyPI - nucliadb - Versions diffs - 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl - Mend

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

migrations/0028_extracted_vectors_reference.py +61 -0
migrations/0029_backfill_field_status.py +149 -0
migrations/0030_label_deduplication.py +60 -0
nucliadb/common/cluster/manager.py +41 -331
nucliadb/common/cluster/rebalance.py +2 -2
nucliadb/common/cluster/rollover.py +12 -71
nucliadb/common/cluster/settings.py +3 -0
nucliadb/common/cluster/standalone/utils.py +0 -43
nucliadb/common/cluster/utils.py +0 -16
nucliadb/common/counters.py +1 -0
nucliadb/common/datamanagers/fields.py +48 -7
nucliadb/common/datamanagers/vectorsets.py +11 -2
nucliadb/common/external_index_providers/base.py +2 -1
nucliadb/common/external_index_providers/pinecone.py +3 -5
nucliadb/common/ids.py +18 -4
nucliadb/common/models_utils/from_proto.py +479 -0
nucliadb/common/models_utils/to_proto.py +60 -0
nucliadb/common/nidx.py +76 -37
nucliadb/export_import/models.py +3 -3
nucliadb/health.py +0 -7
nucliadb/ingest/app.py +0 -8
nucliadb/ingest/consumer/auditing.py +1 -1
nucliadb/ingest/consumer/shard_creator.py +1 -1
nucliadb/ingest/fields/base.py +83 -21
nucliadb/ingest/orm/brain.py +55 -56
nucliadb/ingest/orm/broker_message.py +12 -2
nucliadb/ingest/orm/entities.py +6 -17
nucliadb/ingest/orm/knowledgebox.py +44 -22
nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
nucliadb/ingest/orm/processor/processor.py +5 -2
nucliadb/ingest/orm/resource.py +222 -413
nucliadb/ingest/processing.py +8 -2
nucliadb/ingest/serialize.py +77 -46
nucliadb/ingest/service/writer.py +2 -56
nucliadb/ingest/settings.py +1 -4
nucliadb/learning_proxy.py +6 -4
nucliadb/purge/__init__.py +102 -12
nucliadb/purge/orphan_shards.py +6 -4
nucliadb/reader/api/models.py +3 -3
nucliadb/reader/api/v1/__init__.py +1 -0
nucliadb/reader/api/v1/download.py +2 -2
nucliadb/reader/api/v1/knowledgebox.py +3 -3
nucliadb/reader/api/v1/resource.py +23 -12
nucliadb/reader/api/v1/services.py +4 -4
nucliadb/reader/api/v1/vectorsets.py +48 -0
nucliadb/search/api/v1/ask.py +11 -1
nucliadb/search/api/v1/feedback.py +3 -3
nucliadb/search/api/v1/knowledgebox.py +8 -13
nucliadb/search/api/v1/search.py +3 -2
nucliadb/search/api/v1/suggest.py +0 -2
nucliadb/search/predict.py +6 -4
nucliadb/search/requesters/utils.py +1 -2
nucliadb/search/search/chat/ask.py +77 -13
nucliadb/search/search/chat/prompt.py +16 -5
nucliadb/search/search/chat/query.py +74 -34
nucliadb/search/search/exceptions.py +2 -7
nucliadb/search/search/find.py +9 -5
nucliadb/search/search/find_merge.py +10 -4
nucliadb/search/search/graph_strategy.py +884 -0
nucliadb/search/search/hydrator.py +6 -0
nucliadb/search/search/merge.py +79 -24
nucliadb/search/search/query.py +74 -245
nucliadb/search/search/query_parser/exceptions.py +11 -1
nucliadb/search/search/query_parser/fetcher.py +405 -0
nucliadb/search/search/query_parser/models.py +0 -3
nucliadb/search/search/query_parser/parser.py +22 -21
nucliadb/search/search/rerankers.py +1 -42
nucliadb/search/search/shards.py +19 -0
nucliadb/standalone/api_router.py +2 -14
nucliadb/standalone/settings.py +4 -0
nucliadb/train/generators/field_streaming.py +7 -3
nucliadb/train/lifecycle.py +3 -6
nucliadb/train/nodes.py +14 -12
nucliadb/train/resource.py +380 -0
nucliadb/writer/api/constants.py +20 -16
nucliadb/writer/api/v1/__init__.py +1 -0
nucliadb/writer/api/v1/export_import.py +1 -1
nucliadb/writer/api/v1/field.py +13 -7
nucliadb/writer/api/v1/knowledgebox.py +3 -46
nucliadb/writer/api/v1/resource.py +20 -13
nucliadb/writer/api/v1/services.py +10 -1
nucliadb/writer/api/v1/upload.py +61 -34
nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
nucliadb/writer/back_pressure.py +17 -46
nucliadb/writer/resource/basic.py +9 -7
nucliadb/writer/resource/field.py +42 -9
nucliadb/writer/settings.py +2 -2
nucliadb/writer/tus/gcs.py +11 -10
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
nucliadb/common/cluster/discovery/base.py +0 -178
nucliadb/common/cluster/discovery/k8s.py +0 -301
nucliadb/common/cluster/discovery/manual.py +0 -57
nucliadb/common/cluster/discovery/single.py +0 -51
nucliadb/common/cluster/discovery/types.py +0 -32
nucliadb/common/cluster/discovery/utils.py +0 -67
nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
nucliadb/common/cluster/standalone/index_node.py +0 -123
nucliadb/common/cluster/standalone/service.py +0 -84
nucliadb/standalone/introspect.py +0 -208
nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
/nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0

migrations/0028_extracted_vectors_reference.py ADDED Viewed

@@ -0,0 +1,61 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+"""Migration #28
+Add a key to each vectorset to know how to build the storage key for extracted vectors
+"""
+import logging
+from nucliadb.common import datamanagers
+from nucliadb.migrator.context import ExecutionContext
+from nucliadb_protos import knowledgebox_pb2
+logger = logging.getLogger(__name__)
+# Intentionally empty (the body is `...`): all of this migration's work is done
+# per knowledge box in migrate_kb below.
+# NOTE(review): presumably the migrator expects both entry points to exist — confirm.
+async def migrate(context: ExecutionContext) -> None: ...
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
+    async with datamanagers.with_rw_transaction() as txn:
+        vectorsets = [vs async for (_vid, vs) in datamanagers.vectorsets.iter(txn, kbid=kbid)]
+        if len(vectorsets) == 0:  # pragma: nocover
+            # should never happen, everyone should have at least one
+            logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
+            return
+        elif len(vectorsets) == 1:
+            logger.info(f"Migrating KB with a single vectorset", extra={"kbid": kbid})
+            vectorset = vectorsets[0]
+            vectorset.storage_key_kind = knowledgebox_pb2.VectorSetConfig.StorageKeyKind.LEGACY
+            await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset)
+        else:
+            logger.info(f"Migrating KB with {len(vectorsets)} vectorsets", extra={"kbid": kbid})
+            for vectorset in vectorsets:
+                vectorset.storage_key_kind = (
+                    knowledgebox_pb2.VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX
+                )
+                await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset)
+        await txn.commit()

migrations/0029_backfill_field_status.py ADDED Viewed

@@ -0,0 +1,149 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+"""Migration #29
+Backfill field status (from error)
+"""
+import logging
+from typing import Optional
+from nucliadb.migrator.context import ExecutionContext
+from nucliadb_protos import resources_pb2, writer_pb2
+logger = logging.getLogger(__name__)
+async def migrate(context: ExecutionContext) -> None:
+    start: Optional[str] = ""
+    while True:
+        if start is None:
+            break
+        start = await do_batch(context, start)
+# Intentionally empty (the body is `...`): this migration does its work in
+# migrate/do_batch rather than per knowledge box.
+# NOTE(review): presumably the migrator expects both entry points to exist — confirm.
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
+async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
+    logger.info(f"Running batch from {start}")
+    async with context.kv_driver.transaction(read_only=False) as txn:
+        async with txn.connection.cursor() as cur:  # type: ignore
+            # Retrieve a batch of fields
+            await cur.execute(
+                """
+                SELECT key FROM resources
+                WHERE key ~ '^/kbs/[^/]*/r/[^/]*/f/[^/]*/[^/]*$'
+                AND key > %s
+                ORDER BY key
+                LIMIT 500""",
+                (start,),
+            )
+            records = await cur.fetchall()
+            if len(records) == 0:
+                return None
+            field_keys = [r[0] for r in records]
+            # Retrieve resources basic (to check status)
+            resource_keys = set(["/".join(f.split("/")[:5]) for f in field_keys])
+            await cur.execute(
+                """
+                SELECT key, value FROM resources
+                WHERE key = ANY (%s)
+                ORDER BY key
+                """,
+                (list(resource_keys),),
+            )
+            records = await cur.fetchall()
+            resources_basic = {}
+            for k, v in records:
+                row_basic = resources_pb2.Basic()
+                row_basic.ParseFromString(v)
+                resources_basic[k] = row_basic
+            # Retrieve field errors
+            await cur.execute(
+                """
+                SELECT key, value FROM resources
+                WHERE key ~ '^/kbs/[^/]*/r/[^/]*/f/[^/]*/[^/]*/error$'
+                AND key > %s AND key <= %s
+                ORDER BY key
+                """,
+                (start, field_keys[-1] + "/error"),
+            )
+            records = await cur.fetchall()
+            errors = {}
+            for k, v in records:
+                row_error = writer_pb2.Error()
+                row_error.ParseFromString(v)
+                errors[k] = row_error
+            # Retrieve existing status keys
+            await cur.execute(
+                """
+                SELECT key FROM resources
+                WHERE key ~ '^/kbs/[^/]*/r/[^/]*/f/[^/]*/[^/]*/status$'
+                AND key > %s AND key <= %s
+                ORDER BY key
+                """,
+                (start, field_keys[-1] + "/status"),
+            )
+            records = await cur.fetchall()
+            has_status = [r[0] for r in records]
+            set_batch = []
+            for field_key in field_keys:
+                if field_key + "/status" in has_status:
+                    # Already has status, skip
+                    continue
+                resource_key = "/".join(field_key.split("/")[:5])
+                basic = resources_basic.get(resource_key, None)
+                if basic is None:
+                    logger.warn(f"{field_key} resource has no basic, skipped")
+                    continue
+                status = writer_pb2.FieldStatus()
+                status.status = writer_pb2.FieldStatus.Status.PROCESSED
+                error = errors.get(field_key + "/error", None)
+                # We only copy errors if they come from data augmentation or if the resource is in error
+                # This way we ensure we do not set an error for resources that were previously not in error
+                # There is no way to do this 100% accurate since the /error key is only cleared on field deletion
+                if error:
+                    if (
+                        error.code == writer_pb2.Error.ErrorCode.DATAAUGMENTATION
+                        or basic.metadata.status == resources_pb2.Metadata.Status.ERROR
+                    ):
+                        field_error = writer_pb2.FieldError(
+                            source_error=error,
+                        )
+                        status.errors.append(field_error)
+                        status.status = writer_pb2.FieldStatus.Status.ERROR
+                set_batch.append((field_key + "/status", status.SerializeToString()))
+            # Write everything to the database in batch
+            async with cur.copy("COPY resources (key, value) FROM STDIN") as copy:
+                for row in set_batch:
+                    await copy.write_row(row)
+            await txn.commit()
+            return field_keys[-1]

migrations/0030_label_deduplication.py ADDED Viewed

@@ -0,0 +1,60 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+"""Migration #30
+We don't want to support labels with the same title anymore. Run a deduplication for
+all labelsets
+"""
+import logging
+from nucliadb.common import datamanagers
+from nucliadb.migrator.context import ExecutionContext
+logger = logging.getLogger(__name__)
+# Intentionally empty (the body is `...`): all of this migration's work is done
+# per knowledge box in migrate_kb below.
+# NOTE(review): presumably the migrator expects both entry points to exist — confirm.
+async def migrate(context: ExecutionContext) -> None: ...
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
+    async with datamanagers.with_rw_transaction() as txn:
+        kb_labels = await datamanagers.labels.get_labels(txn, kbid=kbid)
+        changed = False
+        for labelset in kb_labels.labelset.values():
+            current_labels = labelset.labels
+            labelset.ClearField("labels")
+            deduplicator = set()
+            for label in current_labels:
+                label_id = label.title.lower()
+                if label_id not in deduplicator:
+                    deduplicator.add(label_id)
+                    labelset.labels.append(label)
+            changed = changed or (len(labelset.labels) < len(current_labels))
+        if changed:
+            await datamanagers.labels.set_labels(txn, kbid=kbid, labels=kb_labels)
+        await txn.commit()

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl