nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1.post2701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,7 +49,6 @@ from nucliadb.ingest.orm.exceptions import (
49
49
  from nucliadb.ingest.orm.metrics import processor_observer
50
50
  from nucliadb.ingest.orm.resource import Resource
51
51
  from nucliadb.ingest.orm.utils import choose_matryoshka_dimension, compute_paragraph_key
52
- from nucliadb.ingest.settings import settings
53
52
  from nucliadb.migrator.utils import get_latest_version
54
53
  from nucliadb_protos import knowledgebox_pb2, noderesources_pb2, nodewriter_pb2, writer_pb2
55
54
  from nucliadb_protos.knowledgebox_pb2 import (
@@ -60,6 +59,7 @@ from nucliadb_protos.knowledgebox_pb2 import (
60
59
  StoredExternalIndexProviderMetadata,
61
60
  )
62
61
  from nucliadb_protos.resources_pb2 import Basic
62
+ from nucliadb_utils.settings import is_onprem_nucliadb
63
63
  from nucliadb_utils.storages.storage import Storage
64
64
  from nucliadb_utils.utilities import (
65
65
  get_audit,
@@ -74,6 +74,9 @@ KB_KEYS = "/kbs/{kbid}/"
74
74
  KB_TO_DELETE_BASE = "/kbtodelete/"
75
75
  KB_TO_DELETE_STORAGE_BASE = "/storagetodelete/"
76
76
 
77
+ RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
78
+ RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"
79
+
77
80
  KB_TO_DELETE = f"{KB_TO_DELETE_BASE}{{kbid}}"
78
81
  KB_TO_DELETE_STORAGE = f"{KB_TO_DELETE_STORAGE_BASE}{{kbid}}"
79
82
 
@@ -415,9 +418,16 @@ class KnowledgeBox:
415
418
  logger.exception("Error deleting slug")
416
419
 
417
420
  async def storage_delete_resource(self, uuid: str):
418
- await self.storage.delete_resource(
419
- self.kbid, uuid, max_parallel=settings.ingest_delete_resource_storage_max_parallel
420
- )
421
+ if is_onprem_nucliadb():
422
+ await self.storage.delete_resource(self.kbid, uuid)
423
+ else:
424
+ # Deleting from storage can be slow, so we schedule its deletion and the purge cronjob
425
+ # will take care of it
426
+ await self.schedule_delete_resource(self.kbid, uuid)
427
+
428
+ async def schedule_delete_resource(self, kbid: str, uuid: str):
429
+ key = RESOURCE_TO_DELETE_STORAGE.format(kbid=kbid, uuid=uuid)
430
+ await self.txn.set(key, b"")
421
431
 
422
432
  async def delete_resource(self, uuid: str):
423
433
  with processor_observer({"type": "delete_resource_maindb"}):
@@ -85,8 +85,5 @@ class Settings(DriverSettings):
85
85
 
86
86
  max_concurrent_ingest_processing: int = 5
87
87
 
88
- # Ingest processor settings
89
- ingest_delete_resource_storage_max_parallel: int = 20
90
-
91
88
 
92
89
  settings = Settings()
@@ -32,6 +32,7 @@ from nucliadb.ingest.orm.knowledgebox import (
32
32
  KB_TO_DELETE_STORAGE_BASE,
33
33
  KB_VECTORSET_TO_DELETE,
34
34
  KB_VECTORSET_TO_DELETE_BASE,
35
+ RESOURCE_TO_DELETE_STORAGE_BASE,
35
36
  KnowledgeBox,
36
37
  )
37
38
  from nucliadb_telemetry import errors
@@ -131,6 +132,67 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
131
132
  logger.info("FINISH PURGING KB STORAGE")
132
133
 
133
134
 
135
+ async def purge_deleted_resource_storage(driver: Driver, storage: Storage) -> None:
136
+ """
137
+ Remove from storage all resources marked as deleted.
138
+
139
+ Returns the number of resources purged.
140
+ """
141
+ logger.info("Starting purge of deleted resource storage")
142
+ to_purge = await _count_resources_storage_to_purge(driver)
143
+ logger.info(f"Found {to_purge} resources to purge")
144
+ while True:
145
+ try:
146
+ purged = await _purge_resources_storage_batch(driver, storage, batch_size=100)
147
+ if not purged:
148
+ logger.info("No more resources to purge found")
149
+ return
150
+ logger.info(f"Purged {purged} resources")
151
+
152
+ except asyncio.CancelledError:
153
+ logger.info("Purge of deleted resource storage was cancelled")
154
+ return
155
+
156
+
157
+ async def _count_resources_storage_to_purge(driver: Driver) -> int:
158
+ """
159
+ Count the number of resources marked as deleted in storage.
160
+ """
161
+ async with driver.transaction(read_only=True) as txn:
162
+ return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
163
+
164
+
165
+ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch_size: int = 100) -> int:
166
+ """
167
+ Remove from storage a batch of resources marked as deleted. Returns the
168
+ number of resources purged.
169
+ """
170
+ # Get the keys of the resources to delete in batches of 100
171
+ to_delete_batch = []
172
+ async with driver.transaction(read_only=True) as txn:
173
+ async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
174
+ to_delete_batch.append(key)
175
+
176
+ if not to_delete_batch:
177
+ return 0
178
+
179
+ # Delete the resources blobs from storage
180
+ logger.info(f"Purging {len(to_delete_batch)} deleted resources")
181
+ tasks = []
182
+ for key in to_delete_batch:
183
+ kbid, resource_id = key.split("/")[-2:]
184
+ tasks.append(asyncio.create_task(storage.delete_resource(kbid, resource_id)))
185
+ await asyncio.gather(*tasks)
186
+
187
+ # Delete the schedule-to-delete keys
188
+ async with driver.transaction() as txn:
189
+ for key in to_delete_batch:
190
+ await txn.delete(key)
191
+ await txn.commit()
192
+
193
+ return len(to_delete_batch)
194
+
195
+
134
196
  async def purge_kb_vectorsets(driver: Driver, storage: Storage):
135
197
  """Vectors for a vectorset are stored in a key inside each resource. Iterate
136
198
  through all resources of the KB and remove any storage object containing
@@ -186,14 +248,19 @@ async def main():
186
248
  service_name=SERVICE_NAME,
187
249
  )
188
250
  try:
251
+ purge_resources_storage_task = asyncio.create_task(
252
+ purge_deleted_resource_storage(driver, storage)
253
+ )
189
254
  await purge_kb(driver)
190
255
  await purge_kb_storage(driver, storage)
191
256
  await purge_kb_vectorsets(driver, storage)
257
+ await purge_resources_storage_task
192
258
  except Exception as ex: # pragma: no cover
193
259
  logger.exception("Unhandled exception on purge command")
194
260
  errors.capture_exception(ex)
195
261
  finally:
196
262
  try:
263
+ purge_resources_storage_task.cancel()
197
264
  await storage.finalize()
198
265
  await teardown_driver()
199
266
  await teardown_cluster()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nucliadb
3
- Version: 6.2.0.post2679
3
+ Version: 6.2.1.post2701
4
4
  Home-page: https://docs.nuclia.dev/docs/management/nucliadb/intro
5
5
  Author: NucliaDB Community
6
6
  Author-email: nucliadb@nuclia.com
@@ -22,10 +22,10 @@ Classifier: Programming Language :: Python :: 3.12
22
22
  Classifier: Programming Language :: Python :: 3 :: Only
23
23
  Requires-Python: >=3.9, <4
24
24
  Description-Content-Type: text/markdown
25
- Requires-Dist: nucliadb-telemetry[all]>=6.2.0.post2679
26
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.0.post2679
27
- Requires-Dist: nucliadb-protos>=6.2.0.post2679
28
- Requires-Dist: nucliadb-models>=6.2.0.post2679
25
+ Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post2701
26
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post2701
27
+ Requires-Dist: nucliadb-protos>=6.2.1.post2701
28
+ Requires-Dist: nucliadb-models>=6.2.1.post2701
29
29
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
30
30
  Requires-Dist: nucliadb-node-binding>=2.26.0
31
31
  Requires-Dist: nuclia-models>=0.24.2
@@ -111,7 +111,7 @@ nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE
111
111
  nucliadb/ingest/processing.py,sha256=x8FGnq2epsGl0QEzdYlgCys9MpxtV5_WO09hc7Wy150,20254
112
112
  nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
113
  nucliadb/ingest/serialize.py,sha256=GSDfrO4JLm-QLKw8LJ7TD1JFcXXvwm-ugXzbCfGh3Fk,15492
114
- nucliadb/ingest/settings.py,sha256=SDQpMRsTsNyi6IDxCJy6BZVUSKUzwAMuxf6ktp31VMM,3130
114
+ nucliadb/ingest/settings.py,sha256=AFeqM2Sq2lHCrW745S4VjUmxfwkB2H9T_bncVRvLNRA,3039
115
115
  nucliadb/ingest/utils.py,sha256=l1myURu3r8oA11dx3GpHw-gNTUc1AFX8xdPm9Lgl2rA,2275
116
116
  nucliadb/ingest/consumer/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
117
117
  nucliadb/ingest/consumer/auditing.py,sha256=EJoqRRr4dk2eUMK0GOY6b9xHO0YLQ0LjoP_xZBLACZo,7280
@@ -135,7 +135,7 @@ nucliadb/ingest/orm/brain.py,sha256=TtHPKZzv_Yz-tYkB1QjzPWUxA7Z_naMVXAfo8iov9Gw,
135
135
  nucliadb/ingest/orm/broker_message.py,sha256=JYYUJIZEL_EqovQuw6u-FmEkjyoYlxIXJq9hFekOiks,6441
136
136
  nucliadb/ingest/orm/entities.py,sha256=2PslT1FZ6yCvJtjR0UpKTSzxJrtS-C_gZx4ZTWHunTc,15759
137
137
  nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
138
- nucliadb/ingest/orm/knowledgebox.py,sha256=loH4PBxL2aJYMpbumrYpWq1kzqt-HNUeAok2-kOdqz8,22736
138
+ nucliadb/ingest/orm/knowledgebox.py,sha256=dBetjoJBYT6JuGmMHiqjcfJeD8qJrK3MQt9X03IrHRA,23228
139
139
  nucliadb/ingest/orm/metrics.py,sha256=OkwMSPKLZcKba0ZTwtTiIxwBgaLMX5ydhGieKvi2y7E,1096
140
140
  nucliadb/ingest/orm/resource.py,sha256=_m4B14dSpO-lszpoqlhXYL3LrplB9p3NrDZC5kQbXHs,53860
141
141
  nucliadb/ingest/orm/utils.py,sha256=vCe_9UxHu26JDFGLwQ0wH-XyzJIpQCTK-Ow9dtZR5Vg,2716
@@ -160,7 +160,7 @@ nucliadb/migrator/settings.py,sha256=jOUX0ZMunCXN8HpF9xXN0aunJYRhu4Vdr_ffjRIqwtw
160
160
  nucliadb/migrator/utils.py,sha256=NgUreUvON8_nWEzTxELBMWlfV7E6-6qi-g0DMEbVEz4,2885
161
161
  nucliadb/models/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
162
162
  nucliadb/models/responses.py,sha256=qnuOoc7TrVSUnpikfTwHLKez47_DE4mSFzpxrwtqijA,1599
163
- nucliadb/purge/__init__.py,sha256=Y_PcRfvqccUJwXSFZO4Q9uogBGe1_pH4MyS8RvyCPgA,7941
163
+ nucliadb/purge/__init__.py,sha256=tcXwO99714cqflLVJyZzOv6_64H9pt7r6V0UogDd4oA,10389
164
164
  nucliadb/purge/orphan_shards.py,sha256=fA5yqRRN-M50OIk8dkAi1_ShFVjwDYEYqzMA9dYP0eU,9227
165
165
  nucliadb/reader/__init__.py,sha256=C5Efic7WlGm2U2C5WOyquMFbIj2Pojwe_8mwzVYnOzE,1304
166
166
  nucliadb/reader/app.py,sha256=Se-BFTE6d1v1msLzQn4q5XIhjnSxa2ckDSHdvm7NRf8,3096
@@ -332,9 +332,9 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
332
332
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
333
333
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
334
334
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
335
- nucliadb-6.2.0.post2679.dist-info/METADATA,sha256=Ys9PJ1dTfL7VBWnMzfhtgnsPA9GbhaJUTi5EcSwivnU,4429
336
- nucliadb-6.2.0.post2679.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
337
- nucliadb-6.2.0.post2679.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
338
- nucliadb-6.2.0.post2679.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
339
- nucliadb-6.2.0.post2679.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
340
- nucliadb-6.2.0.post2679.dist-info/RECORD,,
335
+ nucliadb-6.2.1.post2701.dist-info/METADATA,sha256=hdFV2cB9AAEk1AfUWZeiT_VI91OuRsNXvEe0fSMVN5M,4429
336
+ nucliadb-6.2.1.post2701.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
337
+ nucliadb-6.2.1.post2701.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
338
+ nucliadb-6.2.1.post2701.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
339
+ nucliadb-6.2.1.post2701.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
340
+ nucliadb-6.2.1.post2701.dist-info/RECORD,,