nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1.post2701__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/ingest/orm/knowledgebox.py +14 -4
- nucliadb/ingest/settings.py +0 -3
- nucliadb/purge/__init__.py +67 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.post2701.dist-info}/METADATA +5 -5
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.post2701.dist-info}/RECORD +9 -9
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.post2701.dist-info}/WHEEL +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.post2701.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.post2701.dist-info}/top_level.txt +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.post2701.dist-info}/zip-safe +0 -0
@@ -49,7 +49,6 @@ from nucliadb.ingest.orm.exceptions import (
|
|
49
49
|
from nucliadb.ingest.orm.metrics import processor_observer
|
50
50
|
from nucliadb.ingest.orm.resource import Resource
|
51
51
|
from nucliadb.ingest.orm.utils import choose_matryoshka_dimension, compute_paragraph_key
|
52
|
-
from nucliadb.ingest.settings import settings
|
53
52
|
from nucliadb.migrator.utils import get_latest_version
|
54
53
|
from nucliadb_protos import knowledgebox_pb2, noderesources_pb2, nodewriter_pb2, writer_pb2
|
55
54
|
from nucliadb_protos.knowledgebox_pb2 import (
|
@@ -60,6 +59,7 @@ from nucliadb_protos.knowledgebox_pb2 import (
|
|
60
59
|
StoredExternalIndexProviderMetadata,
|
61
60
|
)
|
62
61
|
from nucliadb_protos.resources_pb2 import Basic
|
62
|
+
from nucliadb_utils.settings import is_onprem_nucliadb
|
63
63
|
from nucliadb_utils.storages.storage import Storage
|
64
64
|
from nucliadb_utils.utilities import (
|
65
65
|
get_audit,
|
@@ -74,6 +74,9 @@ KB_KEYS = "/kbs/{kbid}/"
|
|
74
74
|
KB_TO_DELETE_BASE = "/kbtodelete/"
|
75
75
|
KB_TO_DELETE_STORAGE_BASE = "/storagetodelete/"
|
76
76
|
|
77
|
+
RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
|
78
|
+
RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"
|
79
|
+
|
77
80
|
KB_TO_DELETE = f"{KB_TO_DELETE_BASE}{{kbid}}"
|
78
81
|
KB_TO_DELETE_STORAGE = f"{KB_TO_DELETE_STORAGE_BASE}{{kbid}}"
|
79
82
|
|
@@ -415,9 +418,16 @@ class KnowledgeBox:
|
|
415
418
|
logger.exception("Error deleting slug")
|
416
419
|
|
417
420
|
async def storage_delete_resource(self, uuid: str):
|
418
|
-
|
419
|
-
self.kbid, uuid
|
420
|
-
|
421
|
+
if is_onprem_nucliadb():
|
422
|
+
await self.storage.delete_resource(self.kbid, uuid)
|
423
|
+
else:
|
424
|
+
# Deleting from storage can be slow, so we schedule its deletion and the purge cronjob
|
425
|
+
# will take care of it
|
426
|
+
await self.schedule_delete_resource(self.kbid, uuid)
|
427
|
+
|
428
|
+
async def schedule_delete_resource(self, kbid: str, uuid: str):
|
429
|
+
key = RESOURCE_TO_DELETE_STORAGE.format(kbid=kbid, uuid=uuid)
|
430
|
+
await self.txn.set(key, b"")
|
421
431
|
|
422
432
|
async def delete_resource(self, uuid: str):
|
423
433
|
with processor_observer({"type": "delete_resource_maindb"}):
|
nucliadb/ingest/settings.py
CHANGED
nucliadb/purge/__init__.py
CHANGED
@@ -32,6 +32,7 @@ from nucliadb.ingest.orm.knowledgebox import (
|
|
32
32
|
KB_TO_DELETE_STORAGE_BASE,
|
33
33
|
KB_VECTORSET_TO_DELETE,
|
34
34
|
KB_VECTORSET_TO_DELETE_BASE,
|
35
|
+
RESOURCE_TO_DELETE_STORAGE_BASE,
|
35
36
|
KnowledgeBox,
|
36
37
|
)
|
37
38
|
from nucliadb_telemetry import errors
|
@@ -131,6 +132,67 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
|
|
131
132
|
logger.info("FINISH PURGING KB STORAGE")
|
132
133
|
|
133
134
|
|
135
|
+
async def purge_deleted_resource_storage(driver: Driver, storage: Storage) -> None:
|
136
|
+
"""
|
137
|
+
Remove from storage all resources marked as deleted.
|
138
|
+
|
139
|
+
Returns the number of resources purged.
|
140
|
+
"""
|
141
|
+
logger.info("Starting purge of deleted resource storage")
|
142
|
+
to_purge = await _count_resources_storage_to_purge(driver)
|
143
|
+
logger.info(f"Found {to_purge} resources to purge")
|
144
|
+
while True:
|
145
|
+
try:
|
146
|
+
purged = await _purge_resources_storage_batch(driver, storage, batch_size=100)
|
147
|
+
if not purged:
|
148
|
+
logger.info("No more resources to purge found")
|
149
|
+
return
|
150
|
+
logger.info(f"Purged {purged} resources")
|
151
|
+
|
152
|
+
except asyncio.CancelledError:
|
153
|
+
logger.info("Purge of deleted resource storage was cancelled")
|
154
|
+
return
|
155
|
+
|
156
|
+
|
157
|
+
async def _count_resources_storage_to_purge(driver: Driver) -> int:
|
158
|
+
"""
|
159
|
+
Count the number of resources marked as deleted in storage.
|
160
|
+
"""
|
161
|
+
async with driver.transaction(read_only=True) as txn:
|
162
|
+
return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
|
163
|
+
|
164
|
+
|
165
|
+
async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch_size: int = 100) -> int:
|
166
|
+
"""
|
167
|
+
Remove from storage a batch of resources marked as deleted. Returns the
|
168
|
+
number of resources purged.
|
169
|
+
"""
|
170
|
+
# Get the keys of the resources to delete in batches of 100
|
171
|
+
to_delete_batch = []
|
172
|
+
async with driver.transaction(read_only=True) as txn:
|
173
|
+
async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
|
174
|
+
to_delete_batch.append(key)
|
175
|
+
|
176
|
+
if not to_delete_batch:
|
177
|
+
return 0
|
178
|
+
|
179
|
+
# Delete the resources blobs from storage
|
180
|
+
logger.info(f"Purging {len(to_delete_batch)} deleted resources")
|
181
|
+
tasks = []
|
182
|
+
for key in to_delete_batch:
|
183
|
+
kbid, resource_id = key.split("/")[-2:]
|
184
|
+
tasks.append(asyncio.create_task(storage.delete_resource(kbid, resource_id)))
|
185
|
+
await asyncio.gather(*tasks)
|
186
|
+
|
187
|
+
# Delete the schedule-to-delete keys
|
188
|
+
async with driver.transaction() as txn:
|
189
|
+
for key in to_delete_batch:
|
190
|
+
await txn.delete(key)
|
191
|
+
await txn.commit()
|
192
|
+
|
193
|
+
return len(to_delete_batch)
|
194
|
+
|
195
|
+
|
134
196
|
async def purge_kb_vectorsets(driver: Driver, storage: Storage):
|
135
197
|
"""Vectors for a vectorset are stored in a key inside each resource. Iterate
|
136
198
|
through all resources of the KB and remove any storage object containing
|
@@ -186,14 +248,19 @@ async def main():
|
|
186
248
|
service_name=SERVICE_NAME,
|
187
249
|
)
|
188
250
|
try:
|
251
|
+
purge_resources_storage_task = asyncio.create_task(
|
252
|
+
purge_deleted_resource_storage(driver, storage)
|
253
|
+
)
|
189
254
|
await purge_kb(driver)
|
190
255
|
await purge_kb_storage(driver, storage)
|
191
256
|
await purge_kb_vectorsets(driver, storage)
|
257
|
+
await purge_resources_storage_task
|
192
258
|
except Exception as ex: # pragma: no cover
|
193
259
|
logger.exception("Unhandled exception on purge command")
|
194
260
|
errors.capture_exception(ex)
|
195
261
|
finally:
|
196
262
|
try:
|
263
|
+
purge_resources_storage_task.cancel()
|
197
264
|
await storage.finalize()
|
198
265
|
await teardown_driver()
|
199
266
|
await teardown_cluster()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.2.0.post2679
|
3
|
+
Version: 6.2.1.post2701
|
4
4
|
Home-page: https://docs.nuclia.dev/docs/management/nucliadb/intro
|
5
5
|
Author: NucliaDB Community
|
6
6
|
Author-email: nucliadb@nuclia.com
|
@@ -22,10 +22,10 @@ Classifier: Programming Language :: Python :: 3.12
|
|
22
22
|
Classifier: Programming Language :: Python :: 3 :: Only
|
23
23
|
Requires-Python: >=3.9, <4
|
24
24
|
Description-Content-Type: text/markdown
|
25
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.2.0.post2679
|
26
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.0.post2679
|
27
|
-
Requires-Dist: nucliadb-protos>=6.2.0.post2679
|
28
|
-
Requires-Dist: nucliadb-models>=6.2.0.post2679
|
25
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post2701
|
26
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post2701
|
27
|
+
Requires-Dist: nucliadb-protos>=6.2.1.post2701
|
28
|
+
Requires-Dist: nucliadb-models>=6.2.1.post2701
|
29
29
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
30
30
|
Requires-Dist: nucliadb-node-binding>=2.26.0
|
31
31
|
Requires-Dist: nuclia-models>=0.24.2
|
@@ -111,7 +111,7 @@ nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE
|
|
111
111
|
nucliadb/ingest/processing.py,sha256=x8FGnq2epsGl0QEzdYlgCys9MpxtV5_WO09hc7Wy150,20254
|
112
112
|
nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
113
113
|
nucliadb/ingest/serialize.py,sha256=GSDfrO4JLm-QLKw8LJ7TD1JFcXXvwm-ugXzbCfGh3Fk,15492
|
114
|
-
nucliadb/ingest/settings.py,sha256=
|
114
|
+
nucliadb/ingest/settings.py,sha256=AFeqM2Sq2lHCrW745S4VjUmxfwkB2H9T_bncVRvLNRA,3039
|
115
115
|
nucliadb/ingest/utils.py,sha256=l1myURu3r8oA11dx3GpHw-gNTUc1AFX8xdPm9Lgl2rA,2275
|
116
116
|
nucliadb/ingest/consumer/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
117
117
|
nucliadb/ingest/consumer/auditing.py,sha256=EJoqRRr4dk2eUMK0GOY6b9xHO0YLQ0LjoP_xZBLACZo,7280
|
@@ -135,7 +135,7 @@ nucliadb/ingest/orm/brain.py,sha256=TtHPKZzv_Yz-tYkB1QjzPWUxA7Z_naMVXAfo8iov9Gw,
|
|
135
135
|
nucliadb/ingest/orm/broker_message.py,sha256=JYYUJIZEL_EqovQuw6u-FmEkjyoYlxIXJq9hFekOiks,6441
|
136
136
|
nucliadb/ingest/orm/entities.py,sha256=2PslT1FZ6yCvJtjR0UpKTSzxJrtS-C_gZx4ZTWHunTc,15759
|
137
137
|
nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
|
138
|
-
nucliadb/ingest/orm/knowledgebox.py,sha256=
|
138
|
+
nucliadb/ingest/orm/knowledgebox.py,sha256=dBetjoJBYT6JuGmMHiqjcfJeD8qJrK3MQt9X03IrHRA,23228
|
139
139
|
nucliadb/ingest/orm/metrics.py,sha256=OkwMSPKLZcKba0ZTwtTiIxwBgaLMX5ydhGieKvi2y7E,1096
|
140
140
|
nucliadb/ingest/orm/resource.py,sha256=_m4B14dSpO-lszpoqlhXYL3LrplB9p3NrDZC5kQbXHs,53860
|
141
141
|
nucliadb/ingest/orm/utils.py,sha256=vCe_9UxHu26JDFGLwQ0wH-XyzJIpQCTK-Ow9dtZR5Vg,2716
|
@@ -160,7 +160,7 @@ nucliadb/migrator/settings.py,sha256=jOUX0ZMunCXN8HpF9xXN0aunJYRhu4Vdr_ffjRIqwtw
|
|
160
160
|
nucliadb/migrator/utils.py,sha256=NgUreUvON8_nWEzTxELBMWlfV7E6-6qi-g0DMEbVEz4,2885
|
161
161
|
nucliadb/models/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
162
162
|
nucliadb/models/responses.py,sha256=qnuOoc7TrVSUnpikfTwHLKez47_DE4mSFzpxrwtqijA,1599
|
163
|
-
nucliadb/purge/__init__.py,sha256=
|
163
|
+
nucliadb/purge/__init__.py,sha256=tcXwO99714cqflLVJyZzOv6_64H9pt7r6V0UogDd4oA,10389
|
164
164
|
nucliadb/purge/orphan_shards.py,sha256=fA5yqRRN-M50OIk8dkAi1_ShFVjwDYEYqzMA9dYP0eU,9227
|
165
165
|
nucliadb/reader/__init__.py,sha256=C5Efic7WlGm2U2C5WOyquMFbIj2Pojwe_8mwzVYnOzE,1304
|
166
166
|
nucliadb/reader/app.py,sha256=Se-BFTE6d1v1msLzQn4q5XIhjnSxa2ckDSHdvm7NRf8,3096
|
@@ -332,9 +332,9 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
332
332
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
333
333
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
334
334
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
335
|
-
nucliadb-6.2.
|
336
|
-
nucliadb-6.2.
|
337
|
-
nucliadb-6.2.
|
338
|
-
nucliadb-6.2.
|
339
|
-
nucliadb-6.2.
|
340
|
-
nucliadb-6.2.
|
335
|
+
nucliadb-6.2.1.post2701.dist-info/METADATA,sha256=hdFV2cB9AAEk1AfUWZeiT_VI91OuRsNXvEe0fSMVN5M,4429
|
336
|
+
nucliadb-6.2.1.post2701.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
337
|
+
nucliadb-6.2.1.post2701.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
338
|
+
nucliadb-6.2.1.post2701.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
339
|
+
nucliadb-6.2.1.post2701.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
340
|
+
nucliadb-6.2.1.post2701.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|