PyPI - nucliadb - Versions diffs - 6.2.1.post2971__py3-none-any.whl → 6.2.1.post2972__py3-none-any.whl - Mend

nucliadb 6.2.1.post2971__py3-none-any.whl → 6.2.1.post2972__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

nucliadb/common/cluster/manager.py +33 -331
nucliadb/common/cluster/rebalance.py +2 -2
nucliadb/common/cluster/rollover.py +12 -71
nucliadb/common/cluster/standalone/utils.py +0 -43
nucliadb/common/cluster/utils.py +0 -16
nucliadb/common/nidx.py +21 -23
nucliadb/health.py +0 -7
nucliadb/ingest/app.py +0 -8
nucliadb/ingest/consumer/auditing.py +1 -1
nucliadb/ingest/consumer/shard_creator.py +1 -1
nucliadb/ingest/orm/entities.py +3 -6
nucliadb/purge/orphan_shards.py +6 -4
nucliadb/search/api/v1/knowledgebox.py +1 -5
nucliadb/search/requesters/utils.py +1 -2
nucliadb/search/search/shards.py +19 -0
nucliadb/standalone/introspect.py +0 -25
nucliadb/train/lifecycle.py +0 -6
nucliadb/train/nodes.py +1 -5
nucliadb/writer/back_pressure.py +17 -46
nucliadb/writer/settings.py +2 -2
{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/METADATA +5 -7
{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/RECORD +26 -36
nucliadb/common/cluster/discovery/__init__.py +0 -19
nucliadb/common/cluster/discovery/base.py +0 -178
nucliadb/common/cluster/discovery/k8s.py +0 -301
nucliadb/common/cluster/discovery/manual.py +0 -57
nucliadb/common/cluster/discovery/single.py +0 -51
nucliadb/common/cluster/discovery/types.py +0 -32
nucliadb/common/cluster/discovery/utils.py +0 -67
nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
nucliadb/common/cluster/standalone/index_node.py +0 -123
nucliadb/common/cluster/standalone/service.py +0 -84
{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/WHEEL +0 -0
{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/entry_points.txt +0 -0
{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/top_level.txt +0 -0
{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/zip-safe +0 -0

nucliadb/common/cluster/standalone/grpc_node_binding.py DELETED Viewed

@@ -1,349 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-from __future__ import annotations
-import asyncio
-import logging
-import os
-import threading
-from concurrent.futures import ThreadPoolExecutor
-from typing import AsyncIterator
-from nucliadb_protos.nodereader_pb2 import (
-    DocumentItem,
-    EdgeList,
-    GetShardRequest,
-    IdCollection,
-    ParagraphItem,
-    ParagraphSearchRequest,
-    ParagraphSearchResponse,
-    RelationSearchRequest,
-    RelationSearchResponse,
-    SearchRequest,
-    SearchResponse,
-    StreamRequest,
-    SuggestRequest,
-    SuggestResponse,
-)
-from nucliadb_protos.noderesources_pb2 import (
-    EmptyQuery,
-    EmptyResponse,
-    Resource,
-    ResourceID,
-    ShardCreated,
-    ShardId,
-    ShardIds,
-    VectorSetID,
-    VectorSetList,
-)
-from nucliadb_protos.noderesources_pb2 import Shard as NodeResourcesShard
-from nucliadb_protos.nodewriter_pb2 import NewShardRequest, OpStatus
-from ..settings import settings
-logger = logging.getLogger(__name__)
-try:
-    from nucliadb_node_binding import IndexNodeException  # type: ignore
-except ImportError:  # pragma: no cover
-    logger.warning("Import error while importing IndexNodeException")
-    IndexNodeException = Exception
-try:
-    from nucliadb_node_binding import NodeReader, NodeWriter
-except ImportError:  # pragma: no cover
-    NodeReader = None
-    NodeWriter = None
-class StandaloneReaderWrapper:
-    reader: NodeReader
-    def __init__(self):
-        if NodeReader is None:
-            raise ImportError("NucliaDB index node bindings are not installed (reader not found)")
-        self.reader = NodeReader()
-        self.executor = ThreadPoolExecutor(settings.local_reader_threads)
-    async def Search(self, request: SearchRequest, retry: bool = False) -> SearchResponse:
-        try:
-            loop = asyncio.get_running_loop()
-            result = await loop.run_in_executor(
-                self.executor, self.reader.search, request.SerializeToString()
-            )
-            pb_bytes = bytes(result)
-            pb = SearchResponse()
-            pb.ParseFromString(pb_bytes)
-            return pb
-        except IndexNodeException as exc:
-            if "IO error" not in str(exc):
-                # ignore any other error
-                raise
-            # try some mitigations...
-            logger.error(f"IndexNodeException in Search: {request}", exc_info=True)
-            if not retry:
-                # reinit?
-                self.reader = NodeReader()
-                return await self.Search(request, retry=True)
-            else:
-                raise
-    async def GetShard(self, request: GetShardRequest) -> NodeResourcesShard:
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            self.executor, self.reader.get_shard, request.SerializeToString()
-        )
-        pb_bytes = bytes(result)
-        shard = NodeResourcesShard()
-        shard.ParseFromString(pb_bytes)
-        return shard
-    async def Suggest(self, request: SuggestRequest) -> SuggestResponse:
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            self.executor, self.reader.suggest, request.SerializeToString()
-        )
-        pb_bytes = bytes(result)
-        pb = SuggestResponse()
-        pb.ParseFromString(pb_bytes)
-        return pb
-    async def Documents(
-        self, stream_request: StreamRequest
-    ) -> AsyncIterator[DocumentItem]:  # pragma: no cover
-        """
-        This is a workaround for the fact that the node binding does not support async generators.
-        Very difficult to write tests for
-        """
-        loop = asyncio.get_running_loop()
-        q: asyncio.Queue[DocumentItem] = asyncio.Queue(1)
-        exception = None
-        _END = object()
-        def thread_generator():
-            nonlocal exception
-            generator = self.reader.documents(stream_request.SerializeToString())
-            try:
-                element = generator.next()
-                while element is not None:
-                    pb_bytes = bytes(element)
-                    pb = DocumentItem()
-                    pb.ParseFromString(pb_bytes)
-                    asyncio.run_coroutine_threadsafe(q.put(pb), loop).result()
-                    element = generator.next()
-            except StopIteration:
-                # this is the end
-                pass
-            except Exception as e:
-                exception = e
-            finally:
-                asyncio.run_coroutine_threadsafe(q.put(_END), loop).result()
-        t1 = threading.Thread(target=thread_generator)
-        t1.start()
-        while True:
-            next_item = await q.get()
-            if next_item is _END:
-                break
-            yield next_item
-        if exception is not None:
-            raise exception
-        await loop.run_in_executor(self.executor, t1.join)
-    async def Paragraphs(self, stream_request: StreamRequest) -> AsyncIterator[ParagraphItem]:
-        loop = asyncio.get_running_loop()
-        q: asyncio.Queue[ParagraphItem] = asyncio.Queue(1)
-        exception = None
-        _END = object()
-        def thread_generator():
-            nonlocal exception
-            generator = self.reader.paragraphs(stream_request.SerializeToString())
-            try:
-                element = generator.next()
-                while element is not None:
-                    pb_bytes = bytes(element)
-                    pb = ParagraphItem()
-                    pb.ParseFromString(pb_bytes)
-                    asyncio.run_coroutine_threadsafe(q.put(pb), loop).result()
-                    element = generator.next()
-            except StopIteration:
-                # this is the end
-                pass
-            except Exception as e:
-                exception = e
-            finally:
-                asyncio.run_coroutine_threadsafe(q.put(_END), loop).result()
-        t1 = threading.Thread(target=thread_generator)
-        t1.start()
-        while True:
-            next_item = await q.get()
-            if next_item is _END:
-                break
-            yield next_item
-        if exception is not None:
-            raise exception
-        await loop.run_in_executor(self.executor, t1.join)
-    async def RelationEdges(self, request: ShardId):
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            self.executor, self.reader.relation_edges, request.SerializeToString()
-        )
-        pb_bytes = bytes(result)
-        edge_list = EdgeList()
-        edge_list.ParseFromString(pb_bytes)
-        return edge_list
-    async def VectorIds(self, request: VectorSetID) -> IdCollection:
-        loop = asyncio.get_running_loop()
-        result = await loop.run_in_executor(
-            self.executor, self.reader.vector_ids, request.SerializeToString()
-        )
-        pb_bytes = bytes(result)
-        ids = IdCollection()
-        ids.ParseFromString(pb_bytes)
-        return ids
-class StandaloneWriterWrapper:
-    writer: NodeWriter
-    def __init__(self):
-        os.makedirs(settings.data_path, exist_ok=True)
-        if NodeWriter is None:
-            raise ImportError("NucliaDB index node bindings are not installed (writer not found)")
-        self.writer = NodeWriter()
-        self.executor = ThreadPoolExecutor(settings.local_writer_threads)
-    async def NewShard(self, request: NewShardRequest) -> ShardCreated:
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor, self.writer.new_shard, request.SerializeToString()
-        )
-        pb_bytes = bytes(resp)
-        shard_created = ShardCreated()
-        shard_created.ParseFromString(pb_bytes)
-        return shard_created
-    async def DeleteShard(self, request: ShardId) -> ShardId:
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor, self.writer.delete_shard, request.SerializeToString()
-        )
-        pb_bytes = bytes(resp)
-        shard_id = ShardId()
-        shard_id.ParseFromString(pb_bytes)
-        return shard_id
-    async def ListShards(self, request: EmptyQuery) -> ShardIds:
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor,
-            self.writer.list_shards,
-        )
-        pb_bytes = bytes(resp)
-        shard_ids = ShardIds()
-        shard_ids.ParseFromString(pb_bytes)
-        return shard_ids
-    async def AddVectorSet(self, request: VectorSetID):
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor, self.writer.add_vectorset, request.SerializeToString()
-        )
-        pb_bytes = bytes(resp)
-        resp = OpStatus()
-        resp.ParseFromString(pb_bytes)
-        return resp
-    async def ListVectorSets(self, request: ShardId):
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor, self.writer.list_vectorsets, request.SerializeToString()
-        )
-        pb_bytes = bytes(resp)
-        resp = VectorSetList()
-        resp.ParseFromString(pb_bytes)
-        return resp
-    async def RemoveVectorSet(self, request: VectorSetID):
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor, self.writer.remove_vectorset, request.SerializeToString()
-        )
-        pb_bytes = bytes(resp)
-        resp = OpStatus()
-        resp.ParseFromString(pb_bytes)
-        return resp
-    async def SetResource(self, request: Resource) -> OpStatus:
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor, self.writer.set_resource, request.SerializeToString()
-        )
-        pb_bytes = bytes(resp)
-        op_status = OpStatus()
-        op_status.ParseFromString(pb_bytes)
-        return op_status
-    async def RemoveResource(self, request: ResourceID) -> OpStatus:
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(
-            self.executor, self.writer.remove_resource, request.SerializeToString()
-        )
-        pb_bytes = bytes(resp)
-        op_status = OpStatus()
-        op_status.ParseFromString(pb_bytes)
-        return op_status
-    async def GC(self, request: ShardId) -> EmptyResponse:
-        loop = asyncio.get_running_loop()
-        resp = await loop.run_in_executor(self.executor, self.writer.gc, request.SerializeToString())
-        pb_bytes = bytes(resp)
-        op_status = EmptyResponse()
-        op_status.ParseFromString(pb_bytes)
-        return op_status
-# supported marshalled reader methods for standalone node support
-READER_METHODS = {
-    "Search": (SearchRequest, SearchResponse),
-    "ParagraphSearch": (ParagraphSearchRequest, ParagraphSearchResponse),
-    "RelationSearch": (RelationSearchRequest, RelationSearchResponse),
-    "GetShard": (GetShardRequest, NodeResourcesShard),
-    "Suggest": (SuggestRequest, SuggestResponse),
-    "RelationEdges": (ShardId, EdgeList),
-}
-WRITER_METHODS = {
-    "NewShard": (NewShardRequest, ShardCreated),
-    "DeleteShard": (ShardId, ShardId),
-    "ListShards": (EmptyQuery, ShardIds),
-    "RemoveVectorSet": (VectorSetID, OpStatus),
-    "AddVectorSet": (VectorSetID, OpStatus),
-    "ListVectorSets": (ShardId, VectorSetList),
-    "SetResource": (Resource, OpStatus),
-    "RemoveResource": (ResourceID, OpStatus),
-    "GC": (ShardId, EmptyResponse),
-}

nucliadb/common/cluster/standalone/index_node.py DELETED Viewed

@@ -1,123 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-from typing import Any, Optional
-from nucliadb.common.cluster.base import AbstractIndexNode
-from nucliadb.common.cluster.grpc_node_dummy import DummyReaderStub, DummyWriterStub
-from nucliadb.common.cluster.settings import settings as cluster_settings
-from nucliadb.common.cluster.standalone import grpc_node_binding
-from nucliadb_protos import standalone_pb2, standalone_pb2_grpc
-from nucliadb_utils.grpc import get_traced_grpc_channel
-class StandaloneIndexNode(AbstractIndexNode):
-    _writer: grpc_node_binding.StandaloneWriterWrapper
-    _reader: grpc_node_binding.StandaloneReaderWrapper
-    label: str = "standalone"
-    def __init__(
-        self,
-        id: str,
-        address: str,
-        shard_count: int,
-        available_disk: int,
-        dummy: bool = False,
-        primary_id: Optional[str] = None,
-    ):
-        super().__init__(
-            id=id,
-            address=address,
-            shard_count=shard_count,
-            available_disk=available_disk,
-            dummy=dummy,
-            # standalone does not support read replicas
-            primary_id=None,
-        )
-        if dummy:
-            self._writer = DummyWriterStub()  # type: ignore
-            self._reader = DummyReaderStub()  # type: ignore
-        else:
-            self._writer = grpc_node_binding.StandaloneWriterWrapper()
-            self._reader = grpc_node_binding.StandaloneReaderWrapper()
-    @property
-    def reader(self) -> grpc_node_binding.StandaloneReaderWrapper:  # type: ignore
-        return self._reader
-    @property
-    def writer(self) -> grpc_node_binding.StandaloneWriterWrapper:  # type: ignore
-        return self._writer
-class ProxyCallerWrapper:
-    def __init__(self, address: str, type: str, original_type: Any):
-        self._address = address
-        self._type = type
-        self._original_type = original_type
-        if ":" not in address:
-            grpc_address = f"{address}:{cluster_settings.standalone_node_port}"
-        else:
-            grpc_address = address
-        self._channel = get_traced_grpc_channel(grpc_address, "standalone_proxy")
-        self._stub = standalone_pb2_grpc.StandaloneClusterServiceStub(self._channel)
-    def __getattr__(self, name):
-        async def call(request):
-            req = standalone_pb2.NodeActionRequest(
-                service=self._type, action=name, payload=request.SerializeToString()
-            )
-            resp = await self._stub.NodeAction(req)
-            try:
-                if self._type == "reader":
-                    _, return_type = grpc_node_binding.READER_METHODS[name]
-                elif self._type == "writer":
-                    _, return_type = grpc_node_binding.WRITER_METHODS[name]
-                else:
-                    raise NotImplementedError(f"Unknown type {self._type}")
-            except KeyError:
-                raise NotImplementedError(f"Unknown method for type {self._type}: {name}")
-            return_value = return_type()
-            return_value.ParseFromString(resp.payload)
-            return return_value
-        return call
-class ProxyStandaloneIndexNode(StandaloneIndexNode):
-    label: str = "proxy_standalone"
-    def __init__(
-        self,
-        id: str,
-        address: str,
-        shard_count: int,
-        available_disk: int,
-        dummy: bool = False,
-    ):
-        super().__init__(id, address, shard_count, available_disk=available_disk, dummy=dummy)
-        if dummy:
-            return
-        self._writer = ProxyCallerWrapper(  # type: ignore
-            address, "writer", grpc_node_binding.StandaloneWriterWrapper
-        )
-        self._reader = ProxyCallerWrapper(  # type: ignore
-            address, "reader", grpc_node_binding.StandaloneReaderWrapper
-        )

nucliadb/common/cluster/standalone/service.py DELETED Viewed

@@ -1,84 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-import os
-import shutil
-import backoff
-from grpc import aio
-from grpc.aio import AioRpcError
-from nucliadb.common.cluster.settings import settings
-from nucliadb.common.cluster.settings import settings as cluster_settings
-from nucliadb.common.cluster.standalone import grpc_node_binding
-from nucliadb.common.cluster.standalone.utils import get_self
-from nucliadb_protos import standalone_pb2, standalone_pb2_grpc
-from nucliadb_utils.grpc import get_traced_grpc_server
-class StandaloneClusterServiceServicer(standalone_pb2_grpc.StandaloneClusterServiceServicer):
-    @backoff.on_exception(backoff.expo, (AioRpcError,), max_time=60)
-    async def NodeAction(  # type: ignore
-        self, request: standalone_pb2.NodeActionRequest, context
-    ) -> standalone_pb2.NodeActionResponse:
-        service = request.service
-        action = request.action
-        try:
-            if service == "reader":
-                request_type, _ = grpc_node_binding.READER_METHODS[action]
-            elif service == "writer":
-                request_type, _ = grpc_node_binding.WRITER_METHODS[action]
-            else:
-                raise NotImplementedError(f"Unknown type {service}")
-        except KeyError:
-            raise NotImplementedError(f"Unknown method for type {service}: {action}")
-        index_node_action = getattr(getattr(get_self(), service), action)
-        action_request = request_type()
-        action_request.ParseFromString(request.payload)
-        response = await index_node_action(action_request)
-        return standalone_pb2.NodeActionResponse(payload=response.SerializeToString())
-    async def NodeInfo(  # type: ignore
-        self, request: standalone_pb2.NodeInfoRequest, context
-    ) -> standalone_pb2.NodeInfoResponse:
-        index_node = get_self()
-        index_node.shard_count = len(os.listdir(os.path.join(cluster_settings.data_path, "shards")))
-        total_disk, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
-        return standalone_pb2.NodeInfoResponse(
-            id=index_node.id,
-            address=index_node.address,
-            shard_count=index_node.shard_count,
-            available_disk=available_disk,
-            total_disk=total_disk,
-        )
-async def start_grpc():
-    aio.init_grpc_aio()
-    server = get_traced_grpc_server("standalone")
-    servicer = StandaloneClusterServiceServicer()
-    server.add_insecure_port(f"0.0.0.0:{settings.standalone_node_port}")
-    standalone_pb2_grpc.add_StandaloneClusterServiceServicer_to_server(servicer, server)
-    await server.start()
-    return server

{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/WHEEL RENAMED Viewed

File without changes

{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/top_level.txt RENAMED Viewed

File without changes

{nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/zip-safe RENAMED Viewed

File without changes

nucliadb 6.2.1.post2971__py3-none-any.whl → 6.2.1.post2972__py3-none-any.whl

nucliadb 6.2.1.post2971__py3-none-any.whl → 6.2.1.post2972__py3-none-any.whl