nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/service/writer.py CHANGED
@@ -18,12 +18,11 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
  import uuid
- from typing import AsyncIterator
+ from collections.abc import AsyncIterator

  from nucliadb.backups import tasks as backup_tasks
  from nucliadb.backups import utils as backup_utils
  from nucliadb.common import datamanagers
- from nucliadb.common.cluster.exceptions import AlreadyExists, EntitiesGroupNotFound
  from nucliadb.common.cluster.utils import get_shard_manager
  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
  from nucliadb.common.external_index_providers.exceptions import ExternalIndexCreationError
@@ -49,7 +48,6 @@ from nucliadb_protos.knowledgebox_pb2 import (
  )
  from nucliadb_protos.writer_pb2 import (
  BrokerMessage,
- DelEntitiesRequest,
  GetEntitiesGroupRequest,
  GetEntitiesGroupResponse,
  GetEntitiesRequest,
@@ -58,12 +56,7 @@ from nucliadb_protos.writer_pb2 import (
  IndexStatus,
  ListEntitiesGroupsRequest,
  ListEntitiesGroupsResponse,
- NewEntitiesGroupRequest,
- NewEntitiesGroupResponse,
  OpStatusWriter,
- SetEntitiesRequest,
- UpdateEntitiesGroupRequest,
- UpdateEntitiesGroupResponse,
  WriterStatusRequest,
  WriterStatusResponse,
  )
@@ -224,29 +217,6 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
  logger.info(f"Processed {message.uuid}")
  return response

- async def NewEntitiesGroup( # type: ignore
- self, request: NewEntitiesGroupRequest, context=None
- ) -> NewEntitiesGroupResponse:
- response = NewEntitiesGroupResponse()
- async with self.driver.ro_transaction() as ro_txn:
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
- if kbobj is None:
- response.status = NewEntitiesGroupResponse.Status.KB_NOT_FOUND
- return response
-
- async with self.driver.rw_transaction() as txn:
- kbobj.txn = txn
- entities_manager = EntitiesManager(kbobj, txn)
- try:
- await entities_manager.create_entities_group(request.group, request.entities)
- except AlreadyExists:
- response.status = NewEntitiesGroupResponse.Status.ALREADY_EXISTS
- return response
-
- await txn.commit()
- response.status = NewEntitiesGroupResponse.Status.OK
- return response
-
  async def GetEntities( # type: ignore
  self, request: GetEntitiesRequest, context=None
  ) -> GetEntitiesResponse:
@@ -320,81 +290,6 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):

  return response

- async def SetEntities(self, request: SetEntitiesRequest, context=None) -> OpStatusWriter: # type: ignore
- response = OpStatusWriter()
- async with self.driver.ro_transaction() as ro_txn:
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
- if kbobj is None:
- response.status = OpStatusWriter.Status.NOTFOUND
- return response
-
- async with self.driver.rw_transaction() as txn:
- kbobj.txn = txn
- entities_manager = EntitiesManager(kbobj, txn)
- try:
- await entities_manager.set_entities_group(request.group, request.entities)
- except Exception as e:
- errors.capture_exception(e)
- logger.error("Error in ingest gRPC servicer", exc_info=True)
- response.status = OpStatusWriter.Status.ERROR
- else:
- response.status = OpStatusWriter.Status.OK
- await txn.commit()
- return response
-
- async def UpdateEntitiesGroup( # type: ignore
- self, request: UpdateEntitiesGroupRequest, context=None
- ) -> UpdateEntitiesGroupResponse:
- response = UpdateEntitiesGroupResponse()
- async with self.driver.ro_transaction() as ro_txn:
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
- if kbobj is None:
- response.status = UpdateEntitiesGroupResponse.Status.KB_NOT_FOUND
- return response
-
- async with self.driver.rw_transaction() as txn:
- kbobj.txn = txn
- entities_manager = EntitiesManager(kbobj, txn)
- try:
- await entities_manager.set_entities_group_metadata(
- request.group,
- title=request.title,
- color=request.color,
- )
- updates = {**request.add, **request.update}
- await entities_manager.update_entities(request.group, updates)
- await entities_manager.delete_entities(request.group, request.delete) # type: ignore
- except EntitiesGroupNotFound:
- response.status = UpdateEntitiesGroupResponse.Status.ENTITIES_GROUP_NOT_FOUND
- return response
-
- await txn.commit()
- response.status = UpdateEntitiesGroupResponse.Status.OK
- return response
-
- async def DelEntities(self, request: DelEntitiesRequest, context=None) -> OpStatusWriter: # type: ignore
- response = OpStatusWriter()
-
- async with self.driver.ro_transaction() as ro_txn:
- kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
- if kbobj is None:
- response.status = OpStatusWriter.Status.NOTFOUND
- return response
-
- async with self.driver.rw_transaction() as txn:
- kbobj.txn = txn
- entities_manager = EntitiesManager(kbobj, txn)
- try:
- await entities_manager.delete_entities_group(request.group)
- except Exception as e:
- errors.capture_exception(e)
- logger.error("Error in ingest gRPC servicer", exc_info=True)
- response.status = OpStatusWriter.Status.ERROR
- else:
- await txn.commit()
- response.status = OpStatusWriter.Status.OK
- return response
-
  async def Status( # type: ignore
  self, request: WriterStatusRequest, context=None
  ) -> WriterStatusResponse:
@@ -413,8 +308,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):

  async def Index(self, request: IndexResource, context=None) -> IndexStatus: # type: ignore
  async with self.driver.ro_transaction() as txn:
- kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
- resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
+ resobj = ResourceORM(txn, self.storage, request.kbid, request.rid)
  bm = await generate_broker_message(resobj)
  transaction = get_transaction_utility()
  partitioning = get_partitioning()
@@ -428,7 +322,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
  try:
  async with self.driver.rw_transaction() as txn:
  kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
- resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
+ resobj = ResourceORM(txn, self.storage, request.kbid, request.rid)
  resobj.disable_vectors = not request.reindex_vectors
  index_message = await get_resource_index_message(resobj, reindex=True)
  shard = await self.proc.get_or_assign_resource_shard(txn, kbobj, request.rid)
nucliadb/ingest/settings.py CHANGED
@@ -18,7 +18,6 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
  from enum import Enum
- from typing import Optional

  from pydantic import Field
  from pydantic_settings import BaseSettings
@@ -41,13 +40,13 @@ class DriverConfig(Enum):

  class DriverSettings(BaseSettings):
  driver: DriverConfig = Field(default=DriverConfig.PG, description="K/V storage driver")
- driver_local_url: Optional[str] = Field(
+ driver_local_url: str | None = Field(
  default=None,
  description="Local path to store data on file system. Example: /nucliadb/data/main",
  )
- driver_pg_url: Optional[str] = Field(
+ driver_pg_url: str | None = Field(
  default=None,
- description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.", # noqa
+ description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.",
  )
  driver_pg_connection_pool_min_size: int = Field(
  default=10,
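The Optional[...] to "X | None" rewrites in this file are purely syntactic and do not change how the settings are populated. A minimal sketch of the updated fields in use, assuming the default pydantic-settings behaviour (environment variables matched case-insensitively to field names, no env_prefix) and the assumed nucliadb.ingest.settings import path:

    import os
    from nucliadb.ingest.settings import DriverConfig, DriverSettings  # assumed import path

    os.environ["DRIVER_PG_URL"] = "postgres://username:password@postgres:5432/nucliadb"
    settings = DriverSettings()
    assert settings.driver is DriverConfig.PG      # default driver
    assert settings.driver_local_url is None       # "str | None" accepts None exactly like Optional[str]
    assert settings.driver_pg_url is not None and settings.driver_pg_url.startswith("postgres://")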
nucliadb/ingest/utils.py CHANGED
@@ -17,7 +17,6 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
- from typing import Optional

  from nucliadb.common.maindb.utils import setup_driver
  from nucliadb_protos.writer_pb2_grpc import WriterStub
@@ -26,7 +25,7 @@ from nucliadb_utils.settings import nucliadb_settings
  from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility


- async def start_ingest(service_name: Optional[str] = None):
+ async def start_ingest(service_name: str | None = None):
  await setup_driver()

  actual_service = get_utility(Utility.INGEST)
nucliadb/learning_proxy.py CHANGED
@@ -24,7 +24,7 @@ import os
  from abc import ABC, abstractmethod
  from collections.abc import AsyncIterator
  from enum import Enum, IntEnum
- from typing import Any, Optional, Union
+ from typing import Any

  import backoff
  import httpx
@@ -89,12 +89,12 @@ class LearningConfiguration(BaseModel):
  # aka similarity function
  semantic_vector_similarity: str
  # aka vector_dimension
- semantic_vector_size: Optional[int] = None
+ semantic_vector_size: int | None = None
  # aka min_score
- semantic_threshold: Optional[float] = None
+ semantic_threshold: float | None = None
  # List of possible subdivisions of the matryoshka embeddings (if the model
  # supports it)
- semantic_matryoshka_dimensions: Optional[list[int]] = Field(
+ semantic_matryoshka_dimensions: list[int] | None = Field(
  default=None, alias="semantic_matryoshka_dims"
  )

@@ -154,7 +154,7 @@


  class ProxiedLearningConfigError(Exception):
- def __init__(self, status_code: int, content: Union[str, dict[str, Any]]):
+ def __init__(self, status_code: int, content: str | dict[str, Any]):
  self.status_code = status_code
  self.content = content

@@ -176,7 +176,7 @@ def raise_for_status(response: httpx.Response) -> None:

  async def get_configuration(
  kbid: str,
- ) -> Optional[LearningConfiguration]:
+ ) -> LearningConfiguration | None:
  return await learning_config_service().get_configuration(kbid)


@@ -205,7 +205,7 @@ async def learning_config_proxy(
  method: str,
  url: str,
  headers: dict[str, str] = {},
- ) -> Union[Response, StreamingResponse]:
+ ) -> Response | StreamingResponse:
  return await proxy(
  service=LearningService.CONFIG,
  request=request,
@@ -245,7 +245,7 @@ async def proxy(
  method: str,
  url: str,
  headers: dict[str, str] = {},
- ) -> Union[Response, StreamingResponse]:
+ ) -> Response | StreamingResponse:
  """
  Proxy the request to a learning API.

@@ -426,7 +426,7 @@ class DummyClient(httpx.AsyncClient):

  class LearningConfigService(ABC):
  @abstractmethod
- async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]: ...
+ async def get_configuration(self, kbid: str) -> LearningConfiguration | None: ...

  @abstractmethod
  async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration: ...
@@ -439,7 +439,7 @@ class LearningConfigService(ABC):


  class ProxiedLearningConfig(LearningConfigService):
- async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
+ async def get_configuration(self, kbid: str) -> LearningConfiguration | None:
  async with self._client() as client:
  resp = await client.get(f"config/{kbid}")
  try:
@@ -483,7 +483,7 @@ class InMemoryLearningConfig(LearningConfigService):
  def __init__(self):
  self.in_memory_configs = {}

- async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
+ async def get_configuration(self, kbid: str) -> LearningConfiguration | None:
  return _IN_MEMORY_CONFIGS.get(kbid, None)

  async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
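One detail worth calling out in the LearningConfiguration hunk above is the semantic_matryoshka_dims alias on the renamed union-typed field. A minimal, self-contained sketch of that Pydantic pattern (hypothetical model, not the real class):

    from pydantic import BaseModel, Field

    class MatryoshkaConfig(BaseModel):
        # Same pattern as LearningConfiguration.semantic_matryoshka_dimensions above
        semantic_matryoshka_dimensions: list[int] | None = Field(
            default=None, alias="semantic_matryoshka_dims"
        )

    # Input payloads use the alias; the attribute keeps the long name.
    cfg = MatryoshkaConfig.model_validate({"semantic_matryoshka_dims": [2048, 1024, 512]})
    assert cfg.semantic_matryoshka_dimensions == [2048, 1024, 512]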
nucliadb/metrics_exporter.py CHANGED
@@ -20,7 +20,8 @@
  from __future__ import annotations

  import asyncio
- from typing import AsyncGenerator, Callable, Tuple, cast
+ from collections.abc import AsyncGenerator, Callable
+ from typing import cast

  from nucliadb import logger
  from nucliadb.common import datamanagers
@@ -41,7 +42,7 @@ KBS_COUNT = metrics.Gauge("nucliadb_kbs_count")
  RESOURCES_COUNT = metrics.Gauge("nucliadb_resources_count")


- async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
+ async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str]:
  """
  Return a list of all KB ids.
  """
@@ -94,11 +95,11 @@ async def update_resource_metrics(context: ApplicationContext):
  "WHERE labels @> '{/n/s/PENDING}' "
  "AND COALESCE(modified_at, created_at) BETWEEN NOW() - INTERVAL '1 month' AND NOW() - INTERVAL '6 hours'"
  )
- count = cast(Tuple[int], await cur.fetchone())[0]
+ count = cast(tuple[int], await cur.fetchone())[0]
  PENDING_RESOURCE_COUNT.set(count)

  await cur.execute("SELECT COUNT(*) FROM catalog")
- count = cast(Tuple[int], await cur.fetchone())[0]
+ count = cast(tuple[int], await cur.fetchone())[0]
  RESOURCES_COUNT.set(count)


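The metrics hunks are a mechanical move from typing.Tuple/typing.AsyncGenerator to the builtin and collections.abc generics; the single-argument AsyncGenerator[str] form relies on the send type defaulting to None, which recent type checkers accept. A minimal sketch of the cast pattern, assuming any async DB-API-style cursor:

    from typing import cast

    async def count_catalog_rows(cur) -> int:
        await cur.execute("SELECT COUNT(*) FROM catalog")
        # builtin tuple[...] replaces the deprecated typing.Tuple[...]
        row = cast(tuple[int], await cur.fetchone())
        return row[0]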
nucliadb/middleware/__init__.py CHANGED
@@ -17,7 +17,10 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.

+ import logging
  import time
+ from collections import deque
+ from typing import ClassVar

  from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
  from starlette.requests import Request
@@ -27,6 +30,9 @@ PROCESS_TIME_HEADER = "X-PROCESS-TIME"
  ACCESS_CONTROL_EXPOSE_HEADER = "Access-Control-Expose-Headers"


+ logger = logging.getLogger("nucliadb.middleware")
+
+
  class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
  def capture_process_time(self, response, duration: float):
  response.headers[PROCESS_TIME_HEADER] = str(duration)
@@ -44,9 +50,84 @@ class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
  start = time.perf_counter()
  try:
  response = await call_next(request)
+ return response
  finally:
  if response is not None:
  duration = time.perf_counter() - start
  self.capture_process_time(response, duration)
  self.expose_process_time_header(response)
- return response
+
+
+ class ClientErrorPayloadLoggerMiddleware(BaseHTTPMiddleware):
+ """
+ Middleware that logs the payload of client error responses (HTTP 412 and 422).
+ This helps supporting clients by providing more context about the errors they
+ encounter which otherwise we don't have much visibility on.
+
+ There is a limit of logs per IP to avoid flooding the logs in case of
+ misbehaving clients.
+ """
+
+ log_counters: ClassVar[dict[str, "HourlyLogCounter"]] = {}
+ max_logs: int = 200
+
+ def get_request_host(self, request: Request) -> str:
+ return request.client.host if request.client else "unknown"
+
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
+ response = await call_next(request)
+
+ host = self.get_request_host(request)
+ counter = self.log_counters.setdefault(host, HourlyLogCounter())
+ if response.status_code in (412, 422) and counter.get_count() < self.max_logs:
+ counter.log_event()
+
+ response_body = b""
+ chunk: bytes
+ async for chunk in response.body_iterator: # type: ignore
+ response_body += chunk
+
+ logger.info(
+ f"Client payload validation error",
+ extra={
+ "request_method": request.method,
+ "request_path": request.url.path,
+ "response_status_code": response.status_code,
+ "response_payload": response_body.decode("utf-8", errors="replace"),
+ },
+ )
+ # Recreate the response body iterator since it has been consumed
+ response = Response(
+ content=response_body,
+ status_code=response.status_code,
+ headers=dict(response.headers),
+ media_type=response.media_type,
+ background=response.background,
+ )
+ return response
+
+
+ class EventCounter:
+ def __init__(self, window_seconds: int = 3600):
+ self.window_seconds = window_seconds
+ self.events: deque[float] = deque()
+
+ def log_event(self):
+ current_time = time.time()
+ # Remove events older than the window
+ while self.events and self.events[0] < current_time - self.window_seconds:
+ self.events.popleft()
+ # Add current event
+ self.events.append(current_time)
+
+ def get_count(self) -> int:
+ current_time = time.time()
+ # Remove old events and return count
+ while self.events and self.events[0] < current_time - self.window_seconds:
+ self.events.popleft()
+ return len(self.events)
+
+
+ class HourlyLogCounter(EventCounter):
+ def __init__(self):
+ super().__init__(window_seconds=3600)
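The new ClientErrorPayloadLoggerMiddleware buffers the 412/422 response body, logs it with request metadata, rebuilds the response (the body iterator is consumed by reading it), and caps itself at 200 log entries per client IP per rolling hour via HourlyLogCounter. A hypothetical wiring sketch, not part of this diff, showing how such Starlette middleware is typically installed:

    from fastapi import FastAPI

    from nucliadb.middleware import (
        ClientErrorPayloadLoggerMiddleware,
        ProcessTimeHeaderMiddleware,
    )

    app = FastAPI()
    app.add_middleware(ProcessTimeHeaderMiddleware)         # adds the X-PROCESS-TIME header
    app.add_middleware(ClientErrorPayloadLoggerMiddleware)   # logs 412/422 payloads, capped per client IP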
nucliadb/migrator/datamanager.py CHANGED
@@ -17,7 +17,6 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
- from typing import Optional, Union

  from nucliadb.common import datamanagers
  from nucliadb.common.maindb.driver import Driver
@@ -69,7 +68,7 @@ class MigrationsDataManager:
  await txn.delete(MIGRATIONS_KEY.format(kbid=kbid))
  await txn.commit()

- async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
+ async def get_kb_info(self, kbid: str) -> KnowledgeBoxInfo | None:
  async with self.driver.ro_transaction() as txn:
  kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
  if kb_config is None:
@@ -91,8 +90,8 @@
  async def update_global_info(
  self,
  *,
- current_version: Union[int, _Unset] = _UNSET,
- target_version: Union[int, None, _Unset] = _UNSET,
+ current_version: int | _Unset = _UNSET,
+ target_version: int | None | _Unset = _UNSET,
  ) -> None:
  async with self.driver.rw_transaction() as txn:
  raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
nucliadb/migrator/migrator.py CHANGED
@@ -19,7 +19,6 @@
  #
  import asyncio
  import logging
- from typing import Optional

  from nucliadb.common import locking
  from nucliadb.common.cluster.rollover import rollover_kb_index
@@ -233,7 +232,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
  await tx.commit()


- async def run(context: ExecutionContext, target_version: Optional[int] = None) -> None:
+ async def run(context: ExecutionContext, target_version: int | None = None) -> None:
  # Run schema migrations first, since they create the `resources` table needed for the lock below
  # Schema migrations use their own locking system
  if isinstance(context.kv_driver, PGDriver):
nucliadb/migrator/models.py CHANGED
@@ -19,7 +19,6 @@
  #
  import types
  from dataclasses import dataclass
- from typing import Optional


  @dataclass
@@ -36,4 +35,4 @@ class KnowledgeBoxInfo:
  @dataclass
  class GlobalInfo:
  current_version: int
- target_version: Optional[int] = None
+ target_version: int | None = None
nucliadb/migrator/settings.py CHANGED
@@ -17,14 +17,13 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
- from typing import Optional

  import pydantic
  import pydantic_settings


  class Settings(pydantic_settings.BaseSettings):
- redis_url: Optional[str] = None
+ redis_url: str | None = None
  max_concurrent_migrations: int = pydantic.Field(
  default=5,
  description="Maximum number of concurrent KB migrations allowed.",