PyPI - nucliadb - Versions diffs - 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl - Mend

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

migrations/0028_extracted_vectors_reference.py +61 -0
migrations/0029_backfill_field_status.py +149 -0
migrations/0030_label_deduplication.py +60 -0
nucliadb/common/cluster/manager.py +41 -331
nucliadb/common/cluster/rebalance.py +2 -2
nucliadb/common/cluster/rollover.py +12 -71
nucliadb/common/cluster/settings.py +3 -0
nucliadb/common/cluster/standalone/utils.py +0 -43
nucliadb/common/cluster/utils.py +0 -16
nucliadb/common/counters.py +1 -0
nucliadb/common/datamanagers/fields.py +48 -7
nucliadb/common/datamanagers/vectorsets.py +11 -2
nucliadb/common/external_index_providers/base.py +2 -1
nucliadb/common/external_index_providers/pinecone.py +3 -5
nucliadb/common/ids.py +18 -4
nucliadb/common/models_utils/from_proto.py +479 -0
nucliadb/common/models_utils/to_proto.py +60 -0
nucliadb/common/nidx.py +76 -37
nucliadb/export_import/models.py +3 -3
nucliadb/health.py +0 -7
nucliadb/ingest/app.py +0 -8
nucliadb/ingest/consumer/auditing.py +1 -1
nucliadb/ingest/consumer/shard_creator.py +1 -1
nucliadb/ingest/fields/base.py +83 -21
nucliadb/ingest/orm/brain.py +55 -56
nucliadb/ingest/orm/broker_message.py +12 -2
nucliadb/ingest/orm/entities.py +6 -17
nucliadb/ingest/orm/knowledgebox.py +44 -22
nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
nucliadb/ingest/orm/processor/processor.py +5 -2
nucliadb/ingest/orm/resource.py +222 -413
nucliadb/ingest/processing.py +8 -2
nucliadb/ingest/serialize.py +77 -46
nucliadb/ingest/service/writer.py +2 -56
nucliadb/ingest/settings.py +1 -4
nucliadb/learning_proxy.py +6 -4
nucliadb/purge/__init__.py +102 -12
nucliadb/purge/orphan_shards.py +6 -4
nucliadb/reader/api/models.py +3 -3
nucliadb/reader/api/v1/__init__.py +1 -0
nucliadb/reader/api/v1/download.py +2 -2
nucliadb/reader/api/v1/knowledgebox.py +3 -3
nucliadb/reader/api/v1/resource.py +23 -12
nucliadb/reader/api/v1/services.py +4 -4
nucliadb/reader/api/v1/vectorsets.py +48 -0
nucliadb/search/api/v1/ask.py +11 -1
nucliadb/search/api/v1/feedback.py +3 -3
nucliadb/search/api/v1/knowledgebox.py +8 -13
nucliadb/search/api/v1/search.py +3 -2
nucliadb/search/api/v1/suggest.py +0 -2
nucliadb/search/predict.py +6 -4
nucliadb/search/requesters/utils.py +1 -2
nucliadb/search/search/chat/ask.py +77 -13
nucliadb/search/search/chat/prompt.py +16 -5
nucliadb/search/search/chat/query.py +74 -34
nucliadb/search/search/exceptions.py +2 -7
nucliadb/search/search/find.py +9 -5
nucliadb/search/search/find_merge.py +10 -4
nucliadb/search/search/graph_strategy.py +884 -0
nucliadb/search/search/hydrator.py +6 -0
nucliadb/search/search/merge.py +79 -24
nucliadb/search/search/query.py +74 -245
nucliadb/search/search/query_parser/exceptions.py +11 -1
nucliadb/search/search/query_parser/fetcher.py +405 -0
nucliadb/search/search/query_parser/models.py +0 -3
nucliadb/search/search/query_parser/parser.py +22 -21
nucliadb/search/search/rerankers.py +1 -42
nucliadb/search/search/shards.py +19 -0
nucliadb/standalone/api_router.py +2 -14
nucliadb/standalone/settings.py +4 -0
nucliadb/train/generators/field_streaming.py +7 -3
nucliadb/train/lifecycle.py +3 -6
nucliadb/train/nodes.py +14 -12
nucliadb/train/resource.py +380 -0
nucliadb/writer/api/constants.py +20 -16
nucliadb/writer/api/v1/__init__.py +1 -0
nucliadb/writer/api/v1/export_import.py +1 -1
nucliadb/writer/api/v1/field.py +13 -7
nucliadb/writer/api/v1/knowledgebox.py +3 -46
nucliadb/writer/api/v1/resource.py +20 -13
nucliadb/writer/api/v1/services.py +10 -1
nucliadb/writer/api/v1/upload.py +61 -34
nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
nucliadb/writer/back_pressure.py +17 -46
nucliadb/writer/resource/basic.py +9 -7
nucliadb/writer/resource/field.py +42 -9
nucliadb/writer/settings.py +2 -2
nucliadb/writer/tus/gcs.py +11 -10
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
nucliadb/common/cluster/discovery/base.py +0 -178
nucliadb/common/cluster/discovery/k8s.py +0 -301
nucliadb/common/cluster/discovery/manual.py +0 -57
nucliadb/common/cluster/discovery/single.py +0 -51
nucliadb/common/cluster/discovery/types.py +0 -32
nucliadb/common/cluster/discovery/utils.py +0 -67
nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
nucliadb/common/cluster/standalone/index_node.py +0 -123
nucliadb/common/cluster/standalone/service.py +0 -84
nucliadb/standalone/introspect.py +0 -208
nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
/nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
{nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0

nucliadb/common/cluster/standalone/utils.py CHANGED Viewed

@@ -19,13 +19,10 @@
 import logging
 import os
-import shutil
 import uuid
-from socket import gethostname
 from nucliadb.common.cluster.settings import StandaloneNodeRole
 from nucliadb.common.cluster.settings import settings as cluster_settings
-from nucliadb.common.cluster.standalone.index_node import StandaloneIndexNode
 logger = logging.getLogger(__name__)
@@ -46,46 +43,6 @@ def get_standalone_node_id() -> str:
         return str(uuid.UUID(bytes=f.read()))
-_SELF_INDEX_NODE = None
-def get_self() -> StandaloneIndexNode:
-    """
-    This returns an instance of the standalone index node
-    so when API requests come into this mode, we don't
-    make another grpc request since this node can service it directly.
-    """
-    if not is_index_node():
-        raise Exception("This node is not an Index Node. You should not reach this code path.")
-    global _SELF_INDEX_NODE
-    node_id = get_standalone_node_id()
-    if _SELF_INDEX_NODE is None or node_id != _SELF_INDEX_NODE.id:
-        if "NUCLIADB_SERVICE_HOST" in os.environ:
-            hn = os.environ["HOSTNAME"]
-            ns = os.environ.get("NAMESPACE", "nucliadb")
-            host = f"{hn}.{ns}"
-        else:
-            host = gethostname()
-        _SELF_INDEX_NODE = StandaloneIndexNode(id=node_id, address=host, shard_count=0, available_disk=0)
-    try:
-        _, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
-        _SELF_INDEX_NODE.available_disk = available_disk
-    except FileNotFoundError:  # pragma: no cover
-        ...
-    try:
-        _shards_dir = os.path.join(cluster_settings.data_path, "shards")
-        _SELF_INDEX_NODE.shard_count = len(
-            [
-                shard_dir
-                for shard_dir in os.listdir(_shards_dir)
-                if os.path.isdir(os.path.join(_shards_dir, shard_dir))
-            ]
-        )
-    except FileNotFoundError:  # pragma: no cover
-        ...
-    return _SELF_INDEX_NODE
 def is_index_node() -> bool:
     return cluster_settings.standalone_node_role in (
         StandaloneNodeRole.ALL,

nucliadb/common/cluster/utils.py CHANGED Viewed

@@ -23,20 +23,11 @@ from typing import TYPE_CHECKING, Optional, Union
 import backoff
 from nucliadb.common import datamanagers
-from nucliadb.common.cluster.discovery.utils import (
-    setup_cluster_discovery,
-    teardown_cluster_discovery,
-)
 from nucliadb.common.cluster.manager import (
     KBShardManager,
     StandaloneKBShardManager,
-    clear_index_nodes,
 )
 from nucliadb.common.cluster.settings import settings
-from nucliadb.common.cluster.standalone.service import (
-    start_grpc as start_standalone_grpc,
-)
-from nucliadb.common.cluster.standalone.utils import is_index_node
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb_protos import nodereader_pb2, writer_pb2
 from nucliadb_utils import const
@@ -62,12 +53,8 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
             # already setup
             return get_utility(Utility.SHARD_MANAGER)
-        await setup_cluster_discovery()
         mng: Union[KBShardManager, StandaloneKBShardManager]
         if settings.standalone_mode:
-            if is_index_node():
-                server = await start_standalone_grpc()
-                set_utility(_STANDALONE_SERVER, server)
             mng = StandaloneKBShardManager()
         else:
             mng = KBShardManager()
@@ -76,7 +63,6 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
 async def teardown_cluster():
-    await teardown_cluster_discovery()
     if get_utility(Utility.SHARD_MANAGER):
         clean_utility(Utility.SHARD_MANAGER)
@@ -85,8 +71,6 @@ async def teardown_cluster():
         await std_server.stop(None)
         clean_utility(_STANDALONE_SERVER)
-    clear_index_nodes()
 def get_shard_manager() -> KBShardManager:
     return get_utility(Utility.SHARD_MANAGER)  # type: ignore

nucliadb/common/counters.py CHANGED Viewed

@@ -26,3 +26,4 @@ class IndexCounts:
     fields: int
     paragraphs: int
     sentences: int
+    size_bytes: int

nucliadb/common/datamanagers/fields.py CHANGED Viewed

@@ -23,11 +23,13 @@ from typing import Optional
 from google.protobuf.message import Message
 from nucliadb.common.datamanagers.utils import get_kv_pb
+from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb_protos import writer_pb2
 KB_RESOURCE_FIELD = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
 KB_RESOURCE_FIELD_ERROR = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/error"
+KB_RESOURCE_FIELD_STATUS = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/status"
 async def get_raw(
@@ -52,13 +54,7 @@ async def set(
 async def delete(txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str):
     base_key = KB_RESOURCE_FIELD.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
-    # Make sure we explicitly delete the field and any nested key
-    keys_to_delete = []
-    async for key in txn.keys(base_key):
-        keys_to_delete.append(key)
-    for key in keys_to_delete:
-        await txn.delete(key)
+    await txn.delete_by_prefix(base_key)
 # Error
@@ -82,3 +78,48 @@ async def set_error(
 ):
     key = KB_RESOURCE_FIELD_ERROR.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
     await txn.set(key, error.SerializeToString())
+# Status, replaces error
+async def get_status(
+    txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str
+) -> Optional[writer_pb2.FieldStatus]:
+    key = KB_RESOURCE_FIELD_STATUS.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
+    return await get_kv_pb(txn, key, writer_pb2.FieldStatus)
+async def get_statuses(
+    txn: Transaction, *, kbid: str, rid: str, fields: list[writer_pb2.FieldID]
+) -> list[writer_pb2.FieldStatus]:
+    keys = [
+        KB_RESOURCE_FIELD_STATUS.format(
+            kbid=kbid, uuid=rid, type=FIELD_TYPE_PB_TO_STR[fid.field_type], field=fid.field
+        )
+        for fid in fields
+    ]
+    serialized = await txn.batch_get(keys, for_update=False)
+    statuses = []
+    for serialized_status in serialized:
+        pb = writer_pb2.FieldStatus()
+        if serialized_status is not None:
+            pb.ParseFromString(serialized_status)
+        else:
+            pb = writer_pb2.FieldStatus()
+        statuses.append(pb)
+    return statuses
+async def set_status(
+    txn: Transaction,
+    *,
+    kbid: str,
+    rid: str,
+    field_type: str,
+    field_id: str,
+    status: writer_pb2.FieldStatus,
+):
+    key = KB_RESOURCE_FIELD_STATUS.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
+    await txn.set(key, status.SerializeToString())

nucliadb/common/datamanagers/vectorsets.py CHANGED Viewed

@@ -58,6 +58,11 @@ async def iter(
         yield config.vectorset_id, config
+async def count(txn: Transaction, *, kbid: str) -> int:
+    kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
+    return len(kb_vectorsets.vectorsets)
 async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
     """Create or update a vectorset configuration"""
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
@@ -73,16 +78,20 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
     await txn.set(key, kb_vectorsets.SerializeToString())
-async def delete(txn: Transaction, *, kbid: str, vectorset_id: str):
+async def delete(
+    txn: Transaction, *, kbid: str, vectorset_id: str
+) -> Optional[knowledgebox_pb2.VectorSetConfig]:
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
     index = _find_vectorset(kb_vectorsets, vectorset_id)
     if index is None:
         # already deleted
-        return
+        return None
+    deleted = kb_vectorsets.vectorsets[index]
     del kb_vectorsets.vectorsets[index]
     key = KB_VECTORSETS.format(kbid=kbid)
     await txn.set(key, kb_vectorsets.SerializeToString())
+    return deleted
 # XXX At some point in the vectorset epic, we should make this key mandatory and

nucliadb/common/external_index_providers/base.py CHANGED Viewed

@@ -28,7 +28,7 @@ from nucliadb.common.counters import IndexCounts
 from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
 from nucliadb.common.ids import ParagraphId
 from nucliadb_models.external_index_providers import ExternalIndexProviderType
-from nucliadb_models.search import SCORE_TYPE, TextPosition
+from nucliadb_models.search import SCORE_TYPE, Relations, TextPosition
 from nucliadb_protos.knowledgebox_pb2 import (
     CreateExternalIndexProviderMetadata,
     StoredExternalIndexProviderMetadata,
@@ -73,6 +73,7 @@ class TextBlockMatch(BaseModel):
     paragraph_labels: list[str] = []
     field_labels: list[str] = []
     text: Optional[str] = None
+    relevant_relations: Optional[Relations] = None
 class QueryResults(BaseModel):

nucliadb/common/external_index_providers/pinecone.py CHANGED Viewed

@@ -441,6 +441,7 @@ class PineconeIndexManager(ExternalIndexManager):
     def get_prefixes_to_delete(self, index_data: Resource) -> set[str]:
         prefixes_to_delete = set()
+        # TODO: migrate to vector_prefixes_to_delete
         for field_id in index_data.sentences_to_delete:
             try:
                 delete_vid = VectorId.from_string(field_id)
@@ -706,11 +707,7 @@ class PineconeIndexManager(ExternalIndexManager):
         if self.kbid in COUNTERS_CACHE:
             # Cache hit
             return COUNTERS_CACHE[self.kbid]
-        total = IndexCounts(
-            fields=0,
-            paragraphs=0,
-            sentences=0,
-        )
+        total = IndexCounts(fields=0, paragraphs=0, sentences=0, size_bytes=0)
         tasks = []
         vectorset_results: dict[str, IndexCounts] = {}
@@ -738,6 +735,7 @@ class PineconeIndexManager(ExternalIndexManager):
                 fields=0,
                 paragraphs=index_stats.totalVectorCount,
                 sentences=index_stats.totalVectorCount,
+                size_bytes=0,
             )
         except Exception:
             logger.exception(

nucliadb/common/ids.py CHANGED Viewed

@@ -111,13 +111,11 @@ class FieldId:
         parts = value.split("/")
         if len(parts) == 3:
             rid, _type, key = parts
-            if _type not in FIELD_TYPE_STR_TO_PB:
-                raise ValueError(f"Invalid FieldId: {value}")
+            _type = cls.parse_field_type(_type)
             return cls(rid=rid, type=_type, key=key)
         elif len(parts) == 4:
             rid, _type, key, subfield_id = parts
-            if _type not in FIELD_TYPE_STR_TO_PB:
-                raise ValueError(f"Invalid FieldId: {value}")
+            _type = cls.parse_field_type(_type)
             return cls(
                 rid=rid,
                 type=_type,
@@ -127,6 +125,22 @@ class FieldId:
         else:
             raise ValueError(f"Invalid FieldId: {value}")
+    @classmethod
+    def parse_field_type(cls, _type: str) -> str:
+        if _type not in FIELD_TYPE_STR_TO_PB:
+            # Try to parse the enum value
+            # XXX: This is to support field types that are integer values of FieldType
+            # Which is how legacy processor relations reported the paragraph_id
+            try:
+                type_pb = FieldType.ValueType(int(_type))
+            except ValueError:
+                raise ValueError(f"Invalid FieldId: {_type}")
+            if type_pb in FIELD_TYPE_PB_TO_STR:
+                return FIELD_TYPE_PB_TO_STR[type_pb]
+            else:
+                raise ValueError(f"Invalid FieldId: {_type}")
+        return _type
 @dataclass
 class ParagraphId:

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl