nucliadb 6.7.2.post4862-py3-none-any.whl → 6.9.2.post5282-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/ingest/service/writer.py CHANGED

```diff
@@ -126,6 +126,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
                 external_index_provider=request.external_index_provider,
                 hidden_resources_enabled=request.hidden_resources_enabled,
                 hidden_resources_hide_on_creation=request.hidden_resources_hide_on_creation,
+                prewarm_enabled=request.prewarm_enabled,
             )
 
         except KnowledgeBoxConflict:
@@ -167,11 +168,17 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
             )
 
         try:
-
-
-
-
-
+            kbid = await KnowledgeBoxORM.update(
+                self.driver,
+                kbid=request.uuid,
+                slug=request.slug,
+                title=request.config.title or None,
+                description=request.config.description or None,
+                external_index_provider=request.config.external_index_provider or None,
+                hidden_resources_enabled=request.config.hidden_resources_enabled,
+                hidden_resources_hide_on_creation=request.config.hidden_resources_hide_on_creation,
+                prewarm_enabled=request.config.prewarm_enabled,
+            )
         except KnowledgeBoxNotFound:
             return UpdateKnowledgeBoxResponse(status=KnowledgeBoxResponseStatus.NOTFOUND)
         except Exception:
@@ -221,13 +228,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: NewEntitiesGroupRequest, context=None
     ) -> NewEntitiesGroupResponse:
         response = NewEntitiesGroupResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = NewEntitiesGroupResponse.Status.KB_NOT_FOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -244,7 +251,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: GetEntitiesRequest, context=None
     ) -> GetEntitiesResponse:
         response = GetEntitiesResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = GetEntitiesResponse.Status.NOTFOUND
@@ -266,7 +273,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: ListEntitiesGroupsRequest, context=None
     ) -> ListEntitiesGroupsResponse:
         response = ListEntitiesGroupsResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = ListEntitiesGroupsResponse.Status.NOTFOUND
@@ -290,7 +297,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: GetEntitiesGroupRequest, context=None
     ) -> GetEntitiesGroupResponse:
         response = GetEntitiesGroupResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = GetEntitiesGroupResponse.Status.KB_NOT_FOUND
@@ -315,13 +322,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
 
     async def SetEntities(self, request: SetEntitiesRequest, context=None) -> OpStatusWriter:  # type: ignore
         response = OpStatusWriter()
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = OpStatusWriter.Status.NOTFOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -339,13 +346,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: UpdateEntitiesGroupRequest, context=None
     ) -> UpdateEntitiesGroupResponse:
         response = UpdateEntitiesGroupResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = UpdateEntitiesGroupResponse.Status.KB_NOT_FOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -368,13 +375,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
     async def DelEntities(self, request: DelEntitiesRequest, context=None) -> OpStatusWriter:  # type: ignore
         response = OpStatusWriter()
 
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = OpStatusWriter.Status.NOTFOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -393,7 +400,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
     ) -> WriterStatusResponse:
         logger.info("Status Call")
         response = WriterStatusResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             async for _, slug in datamanagers.kb.get_kbs(txn):
                 response.knowledgeboxes.append(slug)
 
@@ -405,7 +412,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         return response
 
     async def Index(self, request: IndexResource, context=None) -> IndexStatus:  # type: ignore
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
            kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
            resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
            bm = await generate_broker_message(resobj)
@@ -419,7 +426,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
 
     async def ReIndex(self, request: IndexResource, context=None) -> IndexStatus:  # type: ignore
         try:
-            async with self.driver.
+            async with self.driver.rw_transaction() as txn:
                kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
                resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
                resobj.disable_vectors = not request.reindex_vectors
```
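The dominant change in this file is mechanical: every transaction the servicer opens is now explicitly read-only (`ro_transaction()`) for lookups or read-write (`rw_transaction()`) for mutations. The deleted lines end abruptly at `async with self.driver.` because the diff viewer truncates them; they were presumably calls to a single generic transaction helper. A minimal sketch of what the split might look like on the `Driver` base class (everything beyond the `ro_transaction`/`rw_transaction` names is an assumption, not taken from this diff):

```python
from contextlib import asynccontextmanager

class Driver:
    """Hypothetical sketch of nucliadb/common/maindb/driver.py (+11 -1 in the file list)."""

    @asynccontextmanager
    async def transaction(self, read_only: bool = False):
        txn = object()  # stand-in for a real maindb transaction
        try:
            yield txn
        finally:
            pass  # a real driver would abort here if the txn was not committed

    def ro_transaction(self):
        # Reads only: no commit required, safe to serve with cheaper settings.
        return self.transaction(read_only=True)

    def rw_transaction(self):
        # Writes: the caller is responsible for `await txn.commit()`.
        return self.transaction(read_only=False)
```

Making the intent explicit at every call site also lets a PG-backed driver pick read-only connection behavior without inspecting what the caller does.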
nucliadb/ingest/settings.py CHANGED

```diff
@@ -67,6 +67,11 @@ class DriverSettings(BaseSettings):
     )
 
 
+class CatalogConfig(Enum):
+    UNSET = "unset"
+    PG = "pg"
+
+
 # For use during migration from pull v1 to pull v2
 class ProcessingPullMode(Enum):
     OFF = "off"
@@ -75,26 +80,43 @@ class ProcessingPullMode(Enum):
 
 
 class Settings(DriverSettings):
-
-
-    partitions: list[str] = ["1"]
+    # Catalog settings
+    catalog: CatalogConfig = Field(default=CatalogConfig.PG, description="Catalog backend")
 
+    # Pull worker settings
     pull_time_error_backoff: int = 30
     pull_api_timeout: int = 60
-    disable_pull_worker: bool =
+    disable_pull_worker: bool = Field(
+        default=False, description="Set to true to disable the pull worker task"
+    )
 
-    #
-    replica_number: int =
-
-
+    # Ingest consumer sts replica settings
+    replica_number: int = Field(
+        default=-1,
+        description="The replica number of this ingest statefulset instance. Leave to -1 to auto-assign based on hostname.",
+    )
+    total_replicas: int = Field(default=1, description="Number of ingest statefulset replicas deployed")
+    nuclia_partitions: int = Field(
+        default=50, description="Total number of partitions of the nats stream."
+    )
+    partitions: list[str] = Field(
+        default=["1"],
+        description="List of partitions assigned to this ingest statefulset instance. This is automatically assigned based on the replica number and total replicas.",
+    )
+    max_concurrent_ingest_processing: int = Field(
+        default=5,
+        description="Controls the number of concurrent messages from different partitions that can be processed at the same time by ingest statefulset consumers.",
+    )
 
-
+    # Grpc server settings
+    grpc_port: int = 8030
+    max_receive_message_length: int = Field(
+        default=500, description="Maximum receive grpc message length in MB."
+    )
 
     # Search query timeouts
     relation_search_timeout: float = 10.0
     relation_types_timeout: float = 10.0
 
-    max_concurrent_ingest_processing: int = 5
-
 
 settings = Settings()
```
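The settings class is regrouped under commented sections and every field gains a `Field(...)` with a description. A hedged example of overriding the new fields through the environment, assuming pydantic's standard `BaseSettings` mapping of field names to case-insensitive environment variables (the values are illustrative):

```python
import os

os.environ["CATALOG"] = "pg"                      # CatalogConfig.PG (the default)
os.environ["DISABLE_PULL_WORKER"] = "true"
os.environ["REPLICA_NUMBER"] = "2"                # -1 would auto-assign from the hostname
os.environ["MAX_RECEIVE_MESSAGE_LENGTH"] = "250"  # in MB, per the field description

from nucliadb.ingest.settings import Settings

settings = Settings()  # env vars are read at instantiation time
assert settings.replica_number == 2
assert settings.disable_pull_worker is True
```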
nucliadb/learning_proxy.py CHANGED

```diff
@@ -43,8 +43,8 @@ logger = logging.getLogger(SERVICE_NAME)
 WHITELISTED_HEADERS = {
     "x-nucliadb-user",
     "x-nucliadb-roles",
-    "x-
-    "x-
+    "x-nucliadb-account",
+    "x-nucliadb-account-type",
     "x-forwarded-for",
     "x-forwarded-host",
     "x-forwarded-proto",
@@ -204,14 +204,14 @@ async def learning_config_proxy(
     request: Request,
     method: str,
     url: str,
-
+    headers: dict[str, str] = {},
 ) -> Union[Response, StreamingResponse]:
     return await proxy(
         service=LearningService.CONFIG,
         request=request,
         method=method,
         url=url,
-
+        headers=headers,
     )
 
 
@@ -244,24 +244,21 @@ async def proxy(
     request: Request,
     method: str,
     url: str,
-
+    headers: dict[str, str] = {},
 ) -> Union[Response, StreamingResponse]:
     """
     Proxy the request to a learning API.
 
-    service:
-    request:
-    method:
-    url:
-
+    service: The learning service to proxy the request to.
+    request: The incoming request.
+    method: The HTTP method to use.
+    url: The URL to proxy the request to.
+    headers: Extra headers to include in the proxied request.
 
     Returns: Response. The response from the learning API. If the response is chunked, a StreamingResponse is returned.
     """
-
-    proxied_headers
-    proxied_headers.update(
-        {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
-    )
+    proxied_headers = {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
+    proxied_headers.update(**headers)
 
     async with service_client(
         base_url=get_base_url(service=service),
```
nucliadb/metrics_exporter.py CHANGED

```diff
@@ -37,19 +37,24 @@ MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "versi
 
 PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
 
+KBS_COUNT = metrics.Gauge("nucliadb_kbs_count")
+RESOURCES_COUNT = metrics.Gauge("nucliadb_resources_count")
+
 
 async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
     """
     Return a list of all KB ids.
     """
-    async with context.kv_driver.
+    async with context.kv_driver.ro_transaction() as txn:
         async for kbid, _ in datamanagers.kb.get_kbs(txn):
             yield kbid
 
 
-async def update_migration_metrics(context: ApplicationContext):
+async def update_kb_metrics(context: ApplicationContext):
     """
-    Report
+    Report metrics at the kb level:
+    - total number of KBs
+    - the global migration version and the number of KBs per migration version.
     """
     # Clear previoulsy set values so that we report only the current state
     MIGRATION_COUNT.gauge.clear()
@@ -60,12 +65,16 @@ async def update_migration_metrics(context: ApplicationContext):
     MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
 
     version_count: dict[str, int] = {}
+    n_kbs = 0
     async for kbid in iter_kbids(context):
         kb_info = await mdm.get_kb_info(kbid)
         if kb_info is not None:
             current_version = str(kb_info.current_version)
             version_count.setdefault(current_version, 0)
             version_count[current_version] += 1
+        n_kbs += 1
+
+    KBS_COUNT.set(n_kbs)
 
     for version, count in version_count.items():
         MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
@@ -88,6 +97,10 @@ async def update_resource_metrics(context: ApplicationContext):
         count = cast(Tuple[int], await cur.fetchone())[0]
         PENDING_RESOURCE_COUNT.set(count)
 
+        await cur.execute("SELECT COUNT(*) FROM catalog")
+        count = cast(Tuple[int], await cur.fetchone())[0]
+        RESOURCES_COUNT.set(count)
+
 
 async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
     """
@@ -109,7 +122,7 @@ async def run_exporter(context: ApplicationContext):
     # Schedule exporter tasks
     tasks = []
     for export_task, interval in [
-        (
+        (update_kb_metrics, 60 * 3),
         (update_resource_metrics, 60 * 5),
     ]:
         tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
```
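`update_migration_metrics` becomes `update_kb_metrics` and now also reports the total KB count, while the resource updater adds a catalog row count; each updater runs on its own cadence (every 3 and 5 minutes respectively). A sketch of the loop that `run_exporter_task` presumably wraps around each updater (error handling details are assumptions, not shown in this diff):

```python
import asyncio
from typing import Awaitable, Callable

async def run_forever(task: Callable[[], Awaitable[None]], interval: int) -> None:
    # Run one metrics updater on a fixed cadence; a single failed iteration
    # should not kill the exporter, so exceptions are swallowed and retried.
    while True:
        try:
            await task()
        except Exception as exc:
            print(f"exporter task failed, will retry: {exc!r}")
        await asyncio.sleep(interval)
```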
nucliadb/migrator/datamanager.py CHANGED

```diff
@@ -48,45 +48,39 @@ class MigrationsDataManager:
 
     async def schedule_all_kbs(self, target_version: int) -> None:
         # Get all kb ids
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
         # Schedule the migrations
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             for kbid in kbids:
                 await txn.set(MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode())
             await txn.commit()
 
     async def get_kb_migrations(self) -> list[str]:
         keys = []
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             async for key in txn.keys(MIGRATIONS_CONTAINER_KEY):
                 keys.append(key.split("/")[-1])
 
         return keys
 
     async def delete_kb_migration(self, *, kbid: str) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(MIGRATIONS_KEY.format(kbid=kbid))
             await txn.commit()
 
     async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
             if kb_config is None:
                 return None
             return KnowledgeBoxInfo(current_version=kb_config.migration_version)
 
     async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
-
-            kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
-            if kb_config is None:
-                raise Exception(f"KB {kbid} does not exist")
-            kb_config.migration_version = current_version
-            await KnowledgeBoxORM.update(txn, kbid, config=kb_config)
-            await txn.commit()
+        await KnowledgeBoxORM.update(self.driver, kbid, migration_version=current_version)
 
     async def get_global_info(self) -> GlobalInfo:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             raw_pb = await txn.get(MIGRATION_INFO_KEY)
             if raw_pb is None:
                 return GlobalInfo(current_version=0, target_version=None)
@@ -100,7 +94,7 @@ class MigrationsDataManager:
         current_version: Union[int, _Unset] = _UNSET,
         target_version: Union[int, None, _Unset] = _UNSET,
     ) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
             pb = migrations_pb2.MigrationInfo()
             if raw_pb is not None:
@@ -116,18 +110,18 @@ class MigrationsDataManager:
 
     async def get_kbs_to_rollover(self) -> list[str]:
         keys = []
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             async for key in txn.keys(ROLLOVER_CONTAINER_KEY):
                 keys.append(key.split("/")[-1])
 
         return keys
 
     async def add_kb_rollover(self, kbid: str) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.set(ROLLOVER_KEY.format(kbid=kbid), b"")
             await txn.commit()
 
     async def delete_kb_rollover(self, kbid: str) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(ROLLOVER_KEY.format(kbid=kbid))
             await txn.commit()
```
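Two patterns repeat through this file: reads move to `ro_transaction()` and writes to `rw_transaction()`, and `update_kb_info` collapses from a manual get-config/modify/commit sequence into a single `KnowledgeBoxORM.update(self.driver, kbid, migration_version=...)` call that owns its transaction. The kbid extraction from stored keys is plain string slicing; a worked example with illustrative key names (the real prefixes are the internal `MIGRATIONS_CONTAINER_KEY` / `ROLLOVER_CONTAINER_KEY` constants):

```python
# Keys are "<container>/<kbid>" shaped, so the kbid is the last path segment.
keys = ["migration/to_run/kb-uuid-1", "migration/to_run/kb-uuid-2"]
kbids = [key.split("/")[-1] for key in keys]
assert kbids == ["kb-uuid-1", "kb-uuid-2"]

# Versions are stored as ASCII-encoded integers, mirroring schedule_all_kbs():
value = str(42).encode()
assert int(value.decode()) == 42
```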
nucliadb/migrator/migrator.py CHANGED

```diff
@@ -211,7 +211,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
 
     # The migration uses two transactions. The former is only used to get a lock (pg_advisory_lock)
     # without having to worry about correctly unlocking it (postgres unlocks it when the transaction ends)
-    async with driver.
+    async with driver.rw_transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock:  # type: ignore[attr-defined]
         await cur_lock.execute(
             "CREATE TABLE IF NOT EXISTS migrations (version INT PRIMARY KEY, migrated_at TIMESTAMP NOT NULL DEFAULT NOW())"
         )
@@ -227,7 +227,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
 
         # Gets a new transaction for each migration, so if they get interrupted we at least
         # save the state of the last finished transaction
-        async with driver.
+        async with driver.rw_transaction() as tx, tx.connection.cursor() as cur:  # type: ignore[attr-defined]
             await migration.migrate(tx)
             await cur.execute("INSERT INTO migrations (version) VALUES (%s)", (version,))
             await tx.commit()
```
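The comments describe the scheme: one transaction exists only to hold an advisory lock that Postgres releases when the transaction ends, and each schema migration commits in its own transaction so an interruption keeps all previously recorded versions. The comment names `pg_advisory_lock`, but the transaction-end unlocking it describes is exactly what the transaction-scoped `pg_advisory_xact_lock` provides, so this hedged sketch uses that variant; psycopg 3 is assumed as the client, and the lock id and table layout are illustrative:

```python
import psycopg

LOCK_ID = 5432_0001  # any stable bigint shared by all migrator instances

async def run_schema_migrations(conninfo: str, migrations: dict[int, str]) -> None:
    async with await psycopg.AsyncConnection.connect(conninfo) as lock_conn:
        async with lock_conn.cursor() as cur:
            # Blocks until no other migrator holds the lock; released
            # automatically when lock_conn's transaction ends, even on error.
            await cur.execute("SELECT pg_advisory_xact_lock(%s)", (LOCK_ID,))
            for version, ddl in sorted(migrations.items()):
                # One transaction per migration: progress survives interruption.
                async with await psycopg.AsyncConnection.connect(conninfo) as conn:
                    async with conn.cursor() as mcur:
                        await mcur.execute(ddl)
                        await mcur.execute(
                            "INSERT INTO migrations (version) VALUES (%s)", (version,)
                        )
                    await conn.commit()
```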
nucliadb/purge/__init__.py CHANGED

```diff
@@ -19,6 +19,7 @@
 #
 import asyncio
 import importlib.metadata
+from itertools import batched  # type: ignore
 from typing import AsyncGenerator
 
 from nucliadb.common import datamanagers
@@ -47,7 +48,7 @@ from nucliadb_utils.utilities import get_storage
 
 
 async def _iter_keys(driver: Driver, match: str) -> AsyncGenerator[str, None]:
-    async with driver.
+    async with driver.ro_transaction() as keys_txn:
         async for key in keys_txn.keys(match=match):
             yield key
 
@@ -81,9 +82,9 @@ async def purge_kb(driver: Driver):
             )
             continue
 
-        # Now delete the
+        # Now delete the delete mark
         try:
-            async with driver.
+            async with driver.rw_transaction() as txn:
                 key_to_purge = KB_TO_DELETE.format(kbid=kbid)
                 await txn.delete(key_to_purge)
                 await txn.commit()
@@ -126,7 +127,7 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
 
         if delete_marker:
             try:
-                async with driver.
+                async with driver.rw_transaction() as txn:
                     await txn.delete(key)
                     await txn.commit()
                     logger.info(f"  √ Deleted storage deletion marker {key}")
@@ -163,7 +164,7 @@ async def _count_resources_storage_to_purge(driver: Driver) -> int:
     """
     Count the number of resources marked as deleted in storage.
     """
-    async with driver.
+    async with driver.ro_transaction() as txn:
         return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
 
 
@@ -174,7 +175,7 @@ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch
     """
     # Get the keys of the resources to delete in batches of 100
    to_delete_batch = []
-    async with driver.
+    async with driver.ro_transaction() as txn:
         async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
             to_delete_batch.append(key)
 
@@ -194,7 +195,7 @@ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch
     await asyncio.gather(*tasks)
 
     # Delete the schedule-to-delete keys
-    async with driver.
+    async with driver.rw_transaction() as txn:
         for key in to_delete_batch:
             await txn.delete(key)
         await txn.commit()
@@ -220,20 +221,20 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             continue
 
         try:
-            async with driver.
+            async with driver.ro_transaction() as txn:
                 value = await txn.get(key)
                 assert value is not None, "Key must exist or we wouldn't had fetch it iterating keys"
                 purge_payload = VectorSetPurge()
                 purge_payload.ParseFromString(value)
 
             fields: list[Field] = []
-            async with driver.
+            async with driver.ro_transaction() as txn:
                 kb = KnowledgeBox(txn, storage, kbid)
                 async for resource in kb.iterate_resources():
                     fields.extend((await resource.get_fields(force=True)).values())
 
             logger.info(f"Purging {len(fields)} fields for vectorset {vectorset}", extra={"kbid": kbid})
-            for fields_batch in
+            for fields_batch in batched(fields, n=20):
                 tasks = []
                 for field in fields_batch:
                     if purge_payload.storage_key_kind == VectorSetConfig.StorageKeyKind.UNSET:
@@ -259,7 +260,7 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             await asyncio.gather(*tasks)
 
             # Finally, delete the key
-            async with driver.
+            async with driver.rw_transaction() as txn:
                 await txn.delete(key)
                 await txn.commit()
 
@@ -317,9 +318,3 @@ def run() -> int:  # pragma: no cover
     setup_logging()
     errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
     return asyncio.run(main())
-
-
-def batchify(iterable, n=1):
-    """Yield successive n-sized chunks from iterable."""
-    for i in range(0, len(iterable), n):
-        yield iterable[i : i + n]
```
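The hand-rolled `batchify()` helper at the end of the module is dropped in favor of `itertools.batched`, standard library from Python 3.12 onward (hence the `# type: ignore` on the import for older type stubs). The semantics match the removed helper, except that `batched` yields tuples and works on arbitrary iterables rather than only sliceable sequences:

```python
from itertools import batched  # Python 3.12+

fields = [f"field-{i}" for i in range(45)]  # illustrative items
for batch in batched(fields, n=20):
    # tuples of up to 20 items; the final batch holds the remainder (here, 5)
    assert 1 <= len(batch) <= 20
```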
nucliadb/purge/orphan_shards.py CHANGED

```diff
@@ -100,7 +100,7 @@ async def _get_indexed_shards() -> dict[str, ShardKb]:
 async def _get_stored_shards(driver: Driver) -> dict[str, ShardKb]:
     stored_shards: dict[str, ShardKb] = {}
 
-    async with driver.
+    async with driver.ro_transaction() as txn:
         async for kbid, _ in datamanagers.kb.get_kbs(txn):
             kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
             if kb_shards is None:
@@ -135,7 +135,7 @@ async def _get_kbid(shard_id: str) -> Optional[str]:
 async def report_orphan_shards(driver: Driver):
     orphan_shards = await detect_orphan_shards(driver)
     logger.info(f"Found {len(orphan_shards)} orphan shards")
-    async with driver.
+    async with driver.ro_transaction() as txn:
         for shard_id, kbid in orphan_shards.items():
             if kbid == UNKNOWN_KB:
                 msg = "Found orphan shard but could not get KB info"
```
nucliadb/reader/api/v1/knowledgebox.py CHANGED

```diff
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from fastapi import HTTPException
+from fastapi import Header, HTTPException
 from fastapi_versioning import version
 from starlette.requests import Request
 
@@ -44,12 +44,20 @@ from nucliadb_utils.authentication import requires, requires_one
 )
 @requires(NucliaDBRoles.MANAGER)
 @version(1)
-async def get_kbs(request: Request, prefix: str = "") -> KnowledgeBoxList:
+async def get_kbs(
+    request: Request,
+    prefix: str = "",
+    x_nucliadb_account: str = Header(default="", include_in_schema=False),
+) -> KnowledgeBoxList:
     driver = get_driver()
-    async with driver.
+    async with driver.ro_transaction() as txn:
         response = KnowledgeBoxList()
         async for kbid, slug in datamanagers.kb.get_kbs(txn, prefix=prefix):
-            response.kbs.append(
+            response.kbs.append(
+                KnowledgeBoxObjSummary(
+                    slug=user_kb_slug(slug, account_id=x_nucliadb_account) or None, uuid=kbid
+                )
+            )
     return response
 
 
@@ -62,16 +70,18 @@ async def get_kbs(request: Request, prefix: str = "") -> KnowledgeBoxList:
 )
 @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.READER])
 @version(1)
-async def get_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
+async def get_kb(
+    request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
+) -> KnowledgeBoxObj:
     driver = get_driver()
-    async with driver.
+    async with driver.ro_transaction() as txn:
         kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
         if kb_config is None:
             raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
 
         return KnowledgeBoxObj(
             uuid=kbid,
-            slug=kb_config.slug,
+            slug=user_kb_slug(kb_config.slug, account_id=x_nucliadb_account),
             config=from_proto.knowledgebox_config(kb_config),
         )
 
@@ -85,12 +95,18 @@ async def get_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
 )
 @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.READER])
 @version(1)
-async def get_kb_by_slug(request: Request, slug: str) -> KnowledgeBoxObj:
+async def get_kb_by_slug(
+    request: Request, slug: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
+) -> KnowledgeBoxObj:
     driver = get_driver()
-    async with driver.
-
+    async with driver.ro_transaction() as txn:
+        # For cloud, the account id is prepended in order to be able to reuse the same slug in different accounts.
+        kbid = await datamanagers.kb.get_kb_uuid(txn, slug=f"{x_nucliadb_account}:{slug}")
         if kbid is None:
-
+            # For onprem, the slug is fully controlled by the user
+            kbid = await datamanagers.kb.get_kb_uuid(txn, slug=slug)
+            if kbid is None:
+                raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
 
         kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
         if kb_config is None:
@@ -98,6 +114,18 @@ async def get_kb_by_slug(request: Request, slug: str) -> KnowledgeBoxObj:
 
         return KnowledgeBoxObj(
             uuid=kbid,
-            slug=kb_config.slug,
+            slug=user_kb_slug(kb_config.slug, account_id=x_nucliadb_account),
             config=from_proto.knowledgebox_config(kb_config),
         )
+
+
+def user_kb_slug(stored_slug: str, account_id: str) -> str:
+    if account_id != "":
+        # On cloud deployments, backend prepends the account id to the user-defined slug.
+        # This is required to make kb slugs reused across different accounts using the same nucliadb.
+        # We strip it so the user does not see it.
+        return stored_slug.split(f"{account_id}:")[-1]
+    else:
+        # On on-prem deployments, the account_id is set to "" by default and we don't need to strip
+        # anything as the backend is not invovled in the kb creation process.
+        return stored_slug
```
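The new `user_kb_slug` helper is what lets all three endpoints return clean slugs: on cloud deployments the backend stores slugs as `<account_id>:<user_slug>` (with the account id arriving in the `x-nucliadb-account` header), while on-prem slugs are stored verbatim and the header defaults to `""`. A worked rerun of the function from the diff:

```python
def user_kb_slug(stored_slug: str, account_id: str) -> str:
    # Same logic as the diff above: strip the "<account_id>:" prefix on cloud,
    # return the slug untouched on-prem (where account_id defaults to "").
    if account_id != "":
        return stored_slug.split(f"{account_id}:")[-1]
    return stored_slug

assert user_kb_slug("acc-123:my-kb", account_id="acc-123") == "my-kb"
assert user_kb_slug("my-kb", account_id="") == "my-kb"
assert user_kb_slug("my-kb", account_id="acc-123") == "my-kb"  # no prefix: unchanged
```

Note that `get_kb_by_slug` tries the account-prefixed slug first and only falls back to the raw slug, so the same endpoint serves both deployment modes.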