nucliadb 6.3.1.post3526__py3-none-any.whl → 6.3.1.post3544__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
@@ -0,0 +1,49 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
class MaindbKeys:
    """
    Key templates for backup-related entries in the main key-value store.
    """

    # Serialized BackupMetadata (progress/checkpoint state) of a backup.
    METADATA = "kbs/{kbid}/backups/{backup_id}"
    # Storage key of the last restored resource, used to resume interrupted restores.
    # NOTE(review): uses "backup/" (singular) while METADATA uses "backups/" — confirm intentional.
    LAST_RESTORED = "kbs/{kbid}/backup/{backup_id}/last_restored"
25
+
26
+
27
class StorageKeys:
    """
    Defines the key templates used to store backup files in the backups bucket of the storage.
    """

    # Prefix common to every object of a backup (used for listing/deleting the whole backup).
    BACKUP_PREFIX = "backups/{backup_id}/"
    # Prefix under which every resource tar of a backup is stored.
    RESOURCES_PREFIX = "backups/{backup_id}/resources/"
    # Tar file containing one resource (broker message + cloud files + binaries).
    RESOURCE = "backups/{backup_id}/resources/{resource_id}.tar"
    # Serialized kb_pb2.EntitiesGroups of the KB.
    ENTITIES = "backups/{backup_id}/entities.pb"
    # Serialized kb_pb2.Labels of the KB.
    LABELS = "backups/{backup_id}/labels.pb"
37
+
38
+
39
class BackupFinishedStream:
    """
    NATS stream/subject where backup-completion notifications are published.
    """

    name = "backups"
    subject = "backups.creation_finished"
42
+
43
+
44
class BackupsNatsStream:
    """
    NATS stream and subjects used by the backup task consumers/producers.
    """

    name = "ndb-backups"
    # Wildcard covering all the task subjects below.
    stream_subjects = ["ndb-backups.>"]
    create_subject = "ndb-backups.create"
    delete_subject = "ndb-backups.delete"
    restore_subject = "ndb-backups.restore"
@@ -0,0 +1,277 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ import tarfile
22
+ from datetime import datetime, timezone
23
+ from typing import AsyncIterator, Optional
24
+
25
+ from nucliadb.backups.const import (
26
+ BackupFinishedStream,
27
+ MaindbKeys,
28
+ StorageKeys,
29
+ )
30
+ from nucliadb.backups.models import BackupMetadata, CreateBackupRequest
31
+ from nucliadb.backups.settings import settings
32
+ from nucliadb.common import datamanagers
33
+ from nucliadb.common.context import ApplicationContext
34
+ from nucliadb.export_import.utils import (
35
+ download_binary,
36
+ get_broker_message,
37
+ get_cloud_files,
38
+ get_entities,
39
+ get_labels,
40
+ )
41
+ from nucliadb.tasks.retries import TaskRetryHandler
42
+ from nucliadb_protos import backups_pb2, resources_pb2, writer_pb2
43
+ from nucliadb_utils.audit.stream import StreamAuditStorage
44
+ from nucliadb_utils.storages.storage import StorageField
45
+ from nucliadb_utils.utilities import get_audit
46
+
47
+
48
async def backup_kb_task(context: ApplicationContext, msg: CreateBackupRequest):
    """
    Entry point for the backup creation consumer: runs the backup of the
    requested KB, retrying up to 5 times on failure via TaskRetryHandler.
    """
    retrier = TaskRetryHandler(
        kbid=msg.kbid,
        task_type="backup",
        task_id=msg.backup_id,
        context=context,
        max_retries=5,
    )

    @retrier.wrap
    async def _wrapped(context: ApplicationContext, kbid: str, backup_id: str):
        await backup_kb(context, kbid, backup_id)

    await _wrapped(context, msg.kbid, msg.backup_id)
65
+
66
+
67
async def backup_kb(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Backs up a KB to the cloud storage.

    Steps run sequentially: all resources first, then labels and entities.
    Once everything is uploaded, a completion notification is published and
    the backup progress metadata is removed from the KV store.
    """
    await backup_resources(context, kbid, backup_id)
    await backup_labels(context, kbid, backup_id)
    await backup_entities(context, kbid, backup_id)
    await notify_backup_completed(context, kbid, backup_id)
    await delete_metadata(context, kbid, backup_id)
76
+
77
+
78
async def backup_resources(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Back up all resources of the KB, resuming from persisted metadata if a
    previous attempt was interrupted.

    Resources are backed up concurrently in batches of
    `settings.backup_resources_concurrency`; after each batch the progress
    metadata (remaining resource ids and accumulated size) is persisted so
    a retried task does not redo completed work.
    """
    metadata = await get_metadata(context, kbid, backup_id)
    if metadata is None:
        # First attempt: enumerate all resources of the KB and persist the
        # initial checkpoint before starting any upload.
        metadata = BackupMetadata(
            kbid=kbid,
            requested_at=datetime.now(tz=timezone.utc),
        )
        async for rid in datamanagers.resources.iterate_resource_ids(kbid=kbid):
            metadata.total_resources += 1
            metadata.missing_resources.append(rid)
        metadata.missing_resources.sort()
        await set_metadata(context, kbid, backup_id, metadata)

    async def _flush(tasks: list, backing_up: list[str]) -> None:
        # Wait for the in-flight resource backups, then persist progress so
        # the task can resume from where it left off if it is retried.
        resources_bytes = await asyncio.gather(*tasks)
        metadata.total_size += sum(resources_bytes)
        done = set(backing_up)  # set: O(1) membership instead of O(n) list scans
        metadata.missing_resources = [rid for rid in metadata.missing_resources if rid not in done]
        await set_metadata(context, kbid, backup_id, metadata)

    tasks = []
    backing_up = []
    # Iterate over a snapshot: _flush rebinds metadata.missing_resources.
    for rid in list(metadata.missing_resources):
        tasks.append(asyncio.create_task(backup_resource(context, backup_id, kbid, rid)))
        backing_up.append(rid)
        if len(tasks) >= settings.backup_resources_concurrency:
            await _flush(tasks, backing_up)
            tasks = []
            backing_up = []
    if len(tasks) > 0:
        await _flush(tasks, backing_up)
111
+
112
+
113
async def backup_resource(context: ApplicationContext, backup_id: str, kbid: str, rid: str) -> int:
    """
    Backs up a single resource to the blob storage service.
    Returns the size of the resource in bytes (0 if the resource no longer exists).
    """
    broker_message = await get_broker_message(context, kbid, rid)
    if broker_message is None:
        # The resource may have been deleted while the backup was running.
        return 0
    return await backup_resource_with_binaries(context, backup_id, kbid, rid, broker_message)
123
+
124
+
125
async def to_tar(name: str, size: int, chunks: AsyncIterator[bytes]) -> AsyncIterator[bytes]:
    """
    Wrap a stream of chunks as a single tar entry: yields a 512-byte GNU tar
    header, then the chunks unchanged, then zero-padding up to the next
    512-byte boundary (tar entries are block-aligned).
    """
    info = tarfile.TarInfo(name)
    info.size = size
    info.mtime = int(datetime.now().timestamp())
    info.mode = 0o644
    info.type = tarfile.REGTYPE
    yield info.tobuf(format=tarfile.GNU_FORMAT)
    async for piece in chunks:
        yield piece
    padding = (512 - size % 512) % 512
    if padding:
        yield b"\x00" * padding
141
+
142
+
143
async def backup_resource_with_binaries(
    context, backup_id: str, kbid: str, rid: str, bm: writer_pb2.BrokerMessage
) -> int:
    """
    Generate a tar file dynamically with the resource broker message and all its binary files,
    and stream it to the blob storage service. Returns the total size of the tar file in bytes.
    """
    total_size = 0

    async def resource_data_iterator():
        """
        Each tar file will have the following structure:

        - cloud-files/{cloud_file.uri} (serialized resources_pb2.CloudFile)
        - binaries/{cloud_file.uri} (the actual binary content of the cloud file)
        - broker-message.pb

        The order is important because the restore process depends on it (needs to import
        the cloud files and its binaries first before the broker message).
        """
        # Accumulated here and read by the enclosing function once the upload
        # has fully consumed this iterator.
        nonlocal total_size

        for cloud_file in get_cloud_files(bm):
            serialized_cf = cloud_file.SerializeToString()

            async def cf_iterator():
                # Binds the current serialized_cf; safe because the iterator is
                # consumed within this same loop iteration.
                yield serialized_cf

            async for chunk in to_tar(
                name=f"cloud-files/{cloud_file.uri}", size=len(serialized_cf), chunks=cf_iterator()
            ):
                yield chunk
                total_size += len(chunk)

            # Stream the binary content straight from storage into the tar,
            # never holding the whole file in memory.
            async for chunk in to_tar(
                name=f"binaries/{cloud_file.uri}",
                size=cloud_file.size,
                chunks=download_binary(context, cloud_file),
            ):
                yield chunk
                total_size += len(chunk)

        bm_serialized = bm.SerializeToString()

        async def bm_iterator():
            yield bm_serialized

        # The broker message goes last — see the ordering note above.
        async for chunk in to_tar(
            name="broker-message.pb", size=len(bm_serialized), chunks=bm_iterator()
        ):
            yield chunk
            total_size += len(chunk)

    await upload_to_bucket(
        context,
        resource_data_iterator(),
        key=StorageKeys.RESOURCE.format(kbid=kbid, backup_id=backup_id, resource_id=rid),
    )
    return total_size
202
+
203
+
204
async def backup_labels(context: ApplicationContext, kbid: str, backup_id: str):
    """Serialize the KB's labels and upload them to the backups bucket."""
    labels = await get_labels(context, kbid)
    storage_key = StorageKeys.LABELS.format(kbid=kbid, backup_id=backup_id)
    await context.blob_storage.upload_object(
        bucket=settings.backups_bucket,
        key=storage_key,
        data=labels.SerializeToString(),
    )
211
+
212
+
213
async def backup_entities(context: ApplicationContext, kbid: str, backup_id: str):
    """Serialize the KB's entities groups and upload them to the backups bucket."""
    entities = await get_entities(context, kbid)
    storage_key = StorageKeys.ENTITIES.format(kbid=kbid, backup_id=backup_id)
    await context.blob_storage.upload_object(
        bucket=settings.backups_bucket,
        key=storage_key,
        data=entities.SerializeToString(),
    )
220
+
221
+
222
async def get_metadata(
    context: ApplicationContext, kbid: str, backup_id: str
) -> Optional[BackupMetadata]:
    """
    Fetch the persisted backup metadata from the KV store.

    Returns None if no metadata exists (the backup has not started yet, or
    it already finished and its metadata was deleted).
    """
    async with context.kv_driver.transaction(read_only=True) as txn:
        metadata_raw = await txn.get(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
        if metadata_raw is None:
            return None
        return BackupMetadata.model_validate_json(metadata_raw)
230
+
231
+
232
async def set_metadata(context: ApplicationContext, kbid: str, backup_id: str, metadata: BackupMetadata):
    """Persist the backup progress metadata to the KV store."""
    key = MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id)
    payload = metadata.model_dump_json().encode()
    async with context.kv_driver.transaction() as txn:
        await txn.set(key, payload)
        await txn.commit()
239
+
240
+
241
async def delete_metadata(context: ApplicationContext, kbid: str, backup_id: str):
    """Remove the backup progress metadata from the KV store."""
    key = MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id)
    async with context.kv_driver.transaction() as txn:
        await txn.delete(key)
        await txn.commit()
245
+
246
+
247
async def upload_to_bucket(context: ApplicationContext, bytes_iterator: AsyncIterator[bytes], key: str):
    """
    Stream the given bytes iterator to the backups bucket under `key`.
    """
    storage = context.blob_storage
    bucket = settings.backups_bucket
    cf = resources_pb2.CloudFile()
    cf.bucket_name = bucket
    cf.content_type = "binary/octet-stream"
    # NOTE(review): tagged with the EXPORT source — presumably so the storage
    # layer treats backup uploads like export uploads; confirm.
    cf.source = resources_pb2.CloudFile.Source.EXPORT
    field: StorageField = storage.field_klass(storage=storage, bucket=bucket, fullkey=key, field=cf)
    await storage.uploaditerator(bytes_iterator, field, cf)
256
+
257
+
258
async def notify_backup_completed(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Publish a BackupCreatedNotification to the backups NATS stream.

    Best-effort: when the stream audit utility is not available the
    notification is silently skipped.

    Raises:
        ValueError: if the backup metadata is missing (backup never ran).
    """
    audit = get_audit()
    if audit is None or not isinstance(audit, StreamAuditStorage):
        # We rely on the stream audit utility as it already holds a connection
        # to the idp nats server. If it's not available, we can't send the notification.
        return
    metadata = await get_metadata(context, kbid, backup_id)
    if metadata is None:  # pragma: no cover
        raise ValueError("Backup metadata not found")
    notification = backups_pb2.BackupCreatedNotification()
    notification.finished_at.FromDatetime(datetime.now(tz=timezone.utc))
    notification.kb_id = kbid
    notification.backup_id = backup_id
    notification.size = metadata.total_size
    notification.resources = metadata.total_resources
    # Publish on the audit utility's shared JetStream connection.
    await audit.js.publish(
        BackupFinishedStream.subject,
        notification.SerializeToString(),
        stream=BackupFinishedStream.name,
    )
@@ -0,0 +1,72 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
+ import asyncio
23
+
24
+ from nucliadb.backups.const import StorageKeys
25
+ from nucliadb.backups.models import DeleteBackupRequest
26
+ from nucliadb.backups.settings import settings
27
+ from nucliadb.common.context import ApplicationContext
28
+
29
+
30
async def delete_backup_task(context: ApplicationContext, msg: DeleteBackupRequest):
    """
    Consumer entry point: deletes the backup files from the cloud storage.
    """
    backup_id = msg.backup_id
    await delete_backup(context, backup_id)
35
+
36
+
37
async def delete_backup(context: ApplicationContext, backup_id: str):
    """
    Delete all objects of a backup in batches of 1000 until none are left,
    pausing one second between batches to throttle storage calls.
    """
    while await delete_n(context, backup_id, n=1000) > 0:
        await asyncio.sleep(1)
44
+
45
+
46
async def delete_n(context: ApplicationContext, backup_id: str, n: int) -> int:
    """
    Delete up to n objects under the backup's storage prefix.

    Deletions are scheduled concurrently and awaited in batches of ~50.
    Returns the number of objects scheduled for deletion (0 means there is
    nothing left under the prefix).
    """
    concurrent_batch_size = 50
    deleted = 0
    tasks = []
    async for object_info in context.blob_storage.iterate_objects(
        bucket=settings.backups_bucket,
        prefix=StorageKeys.BACKUP_PREFIX.format(backup_id=backup_id),
    ):
        if deleted >= n:
            # Deleted enough objects
            break
        tasks.append(
            asyncio.create_task(
                context.blob_storage.delete_upload(
                    uri=object_info.name,
                    bucket_name=settings.backups_bucket,
                )
            )
        )
        deleted += 1
        if len(tasks) > concurrent_batch_size:
            # Flush the current batch before scheduling more deletions.
            await asyncio.gather(*tasks)
            tasks = []
    if len(tasks) > 0:
        await asyncio.gather(*tasks)
        tasks = []
    return deleted
@@ -0,0 +1,44 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from datetime import datetime
21
+
22
+ from pydantic import BaseModel
23
+
24
+
25
class CreateBackupRequest(BaseModel):
    """Message consumed by the backup creator task."""

    # Id of the knowledge box to back up.
    kbid: str
    # Id of the backup being created.
    backup_id: str
28
+
29
+
30
class RestoreBackupRequest(BaseModel):
    """Message consumed by the backup restorer task."""

    # Id of the knowledge box to restore into.
    kbid: str
    # Id of the backup to restore.
    backup_id: str
33
+
34
+
35
class DeleteBackupRequest(BaseModel):
    """Message consumed by the backup deleter task."""

    # Id of the backup whose files must be deleted from storage.
    backup_id: str
37
+
38
+
39
class BackupMetadata(BaseModel):
    """Progress/checkpoint state of a backup, persisted in the main KV store."""

    kbid: str
    # When the backup was first requested.
    requested_at: datetime
    # Total number of resources in the KB when the backup started.
    total_resources: int = 0
    # Resource ids not yet backed up; shrinks as batches complete.
    # NOTE: a mutable default is safe here — pydantic copies field defaults per instance.
    missing_resources: list[str] = []
    # Accumulated size in bytes of the uploaded resource tar files.
    total_size: int = 0
@@ -0,0 +1,251 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
+ import asyncio
23
+ import functools
24
+ import tarfile
25
+ from typing import AsyncIterator, Callable, Optional, Union
26
+
27
+ from nucliadb.backups.const import MaindbKeys, StorageKeys
28
+ from nucliadb.backups.models import RestoreBackupRequest
29
+ from nucliadb.backups.settings import settings
30
+ from nucliadb.common.context import ApplicationContext
31
+ from nucliadb.export_import.utils import (
32
+ import_binary,
33
+ import_broker_message,
34
+ set_entities_groups,
35
+ set_labels,
36
+ )
37
+ from nucliadb.tasks.retries import TaskRetryHandler
38
+ from nucliadb_protos import knowledgebox_pb2 as kb_pb2
39
+ from nucliadb_protos.resources_pb2 import CloudFile
40
+ from nucliadb_protos.writer_pb2 import BrokerMessage
41
+
42
+
43
async def restore_kb_task(context: ApplicationContext, msg: RestoreBackupRequest):
    """
    Entry point for the restore consumer: restores the requested backup into
    the KB, retrying up to 3 times on failure via TaskRetryHandler.
    """
    retrier = TaskRetryHandler(
        kbid=msg.kbid,
        task_type="restore",
        task_id=msg.backup_id,
        context=context,
        max_retries=3,
    )

    @retrier.wrap
    async def _wrapped(context: ApplicationContext, kbid: str, backup_id: str):
        await restore_kb(context, kbid, backup_id)

    await _wrapped(context, msg.kbid, msg.backup_id)
60
+
61
+
62
async def restore_kb(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Downloads the backup files from the cloud storage and imports them into the KB.

    Resources first, then labels and entities; finally the restore
    checkpoint key is removed from the KV store.
    """
    await restore_resources(context, kbid, backup_id)
    await restore_labels(context, kbid, backup_id)
    await restore_entities(context, kbid, backup_id)
    await delete_last_restored_resource_key(context, kbid, backup_id)
70
+
71
+
72
async def restore_resources(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Restore all resources of a backup into the KB.

    Resources are restored concurrently in batches; after each batch the key
    of the last restored object is persisted so an interrupted restore can
    resume from that point.
    """
    last_restored = await get_last_restored_resource_key(context, kbid, backup_id)
    tasks = []
    async for object_info in context.blob_storage.iterate_objects(
        bucket=settings.backups_bucket,
        prefix=StorageKeys.RESOURCES_PREFIX.format(kbid=kbid, backup_id=backup_id),
        start=last_restored,
    ):
        key = object_info.name
        # Bug fix: rstrip(".tar") strips *any* trailing characters from the
        # set {'.', 't', 'a', 'r'}, corrupting resource ids that end in those
        # characters (hex ids can end in 'a'). removesuffix removes exactly
        # the ".tar" extension.
        resource_id = key.split("/")[-1].removesuffix(".tar")
        tasks.append(asyncio.create_task(restore_resource(context, kbid, backup_id, resource_id)))
        if len(tasks) > settings.restore_resources_concurrency:
            await asyncio.gather(*tasks)
            tasks = []
            await set_last_restored_resource_key(context, kbid, backup_id, key)
    if len(tasks) > 0:
        await asyncio.gather(*tasks)
        tasks = []
        await set_last_restored_resource_key(context, kbid, backup_id, key)
91
+
92
+
93
async def get_last_restored_resource_key(
    context: ApplicationContext, kbid: str, backup_id: str
) -> Optional[str]:
    """
    Return the storage key of the last restored resource, or None if the
    restore has not checkpointed yet.
    """
    key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
    async with context.kv_driver.transaction(read_only=True) as txn:
        raw = await txn.get(key)
        if raw is None:
            return None
        return raw.decode()
102
+
103
+
104
async def set_last_restored_resource_key(
    context: ApplicationContext, kbid: str, backup_id: str, resource_id: str
):
    """Persist the restore checkpoint so an interrupted restore can resume."""
    checkpoint_key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
    value = resource_id.encode()
    async with context.kv_driver.transaction() as txn:
        await txn.set(checkpoint_key, value)
        await txn.commit()
111
+
112
+
113
async def delete_last_restored_resource_key(context: ApplicationContext, kbid: str, backup_id: str):
    """Remove the restore checkpoint once the restore has completed."""
    checkpoint_key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
    async with context.kv_driver.transaction() as txn:
        await txn.delete(checkpoint_key)
        await txn.commit()
118
+
119
+
120
class CloudFileBinary:
    """
    Lazy handle to a binary inside a backup tar: holds its uri and a factory
    that yields the binary content in chunks when asked.
    """

    def __init__(self, uri: str, download_stream: Callable[[int], AsyncIterator[bytes]]):
        self.uri = uri
        self.download_stream = download_stream

    async def read(self, chunk_size: int) -> AsyncIterator[bytes]:
        """Stream the binary content in chunks of at most chunk_size bytes."""
        stream = self.download_stream(chunk_size)
        async for part in stream:
            yield part
128
+
129
+
130
+ class ResourceBackupReader:
131
+ def __init__(self, download_stream: AsyncIterator[bytes]):
132
+ self.download_stream = download_stream
133
+ self.buffer = b""
134
+
135
+ async def read(self, size: int) -> bytes:
136
+ while len(self.buffer) < size:
137
+ chunk = await self.download_stream.__anext__()
138
+ if not chunk:
139
+ continue
140
+ self.buffer += chunk
141
+ result = self.buffer[:size]
142
+ self.buffer = self.buffer[size:]
143
+ return result
144
+
145
+ async def iter_data(self, total_bytes: int, chunk_size: int = 1024 * 1024) -> AsyncIterator[bytes]:
146
+ padding_bytes = 0
147
+ if total_bytes % 512 != 0:
148
+ # We need to read the padding bytes and then discard them
149
+ padding_bytes = 512 - (total_bytes % 512)
150
+ read_bytes = 0
151
+ padding_reached = False
152
+ async for chunk in self._iter(total_bytes + padding_bytes, chunk_size):
153
+ if padding_reached:
154
+ # Skip padding bytes. We can't break here because we need
155
+ # to read the padding bytes from the stream
156
+ continue
157
+ padding_reached = read_bytes + len(chunk) >= total_bytes
158
+ if padding_reached:
159
+ chunk = chunk[: total_bytes - read_bytes]
160
+ else:
161
+ read_bytes += len(chunk)
162
+ yield chunk
163
+
164
+ async def _iter(self, total_bytes: int, chunk_size: int = 1024 * 1024) -> AsyncIterator[bytes]:
165
+ remaining_bytes = total_bytes
166
+ while remaining_bytes > 0:
167
+ to_read = min(chunk_size, remaining_bytes)
168
+ chunk = await self.read(to_read)
169
+ yield chunk
170
+ remaining_bytes -= len(chunk)
171
+ assert remaining_bytes == 0
172
+
173
+ async def read_tarinfo(self):
174
+ raw_tar_header = await self.read(512)
175
+ return tarfile.TarInfo.frombuf(raw_tar_header, encoding="utf-8", errors="strict")
176
+
177
+ async def read_data(self, tarinfo: tarfile.TarInfo) -> bytes:
178
+ tarinfo_size = tarinfo.size
179
+ padding_bytes = 0
180
+ if tarinfo_size % 512 != 0:
181
+ # We need to read the padding bytes and then discard them
182
+ padding_bytes = 512 - (tarinfo_size % 512)
183
+ data = await self.read(tarinfo_size + padding_bytes)
184
+ return data[:tarinfo_size]
185
+
186
+ async def read_item(self) -> Union[BrokerMessage, CloudFile, CloudFileBinary]:
187
+ tarinfo = await self.read_tarinfo()
188
+ if tarinfo.name.startswith("broker-message"):
189
+ raw_bm = await self.read_data(tarinfo)
190
+ bm = BrokerMessage()
191
+ bm.ParseFromString(raw_bm)
192
+ return bm
193
+ elif tarinfo.name.startswith("cloud-files"):
194
+ raw_cf = await self.read_data(tarinfo)
195
+ cf = CloudFile()
196
+ cf.FromString(raw_cf)
197
+ return cf
198
+ elif tarinfo.name.startswith("binaries"):
199
+ uri = tarinfo.name.lstrip("binaries/")
200
+ size = tarinfo.size
201
+ download_stream = functools.partial(self.iter_data, size)
202
+ return CloudFileBinary(uri, download_stream)
203
+ else: # pragma: no cover
204
+ raise ValueError(f"Unknown tar entry: {tarinfo.name}")
205
+
206
+
207
async def restore_resource(context: ApplicationContext, kbid: str, backup_id: str, resource_id: str):
    """
    Download a resource's backup tar and import it into the KB.

    The tar contains, in order, (cloud file, binary) pairs followed by the
    broker message. Binaries are imported as they are read; the broker
    message is imported last, once all of its binaries are in place.
    """
    download_stream = context.blob_storage.download(
        bucket=settings.backups_bucket,
        key=StorageKeys.RESOURCE.format(kbid=kbid, backup_id=backup_id, resource_id=resource_id),
    )
    reader = ResourceBackupReader(download_stream)
    bm = None
    while True:
        item = await reader.read_item()
        if isinstance(item, BrokerMessage):
            # When the broker message is read, this means all cloud files
            # and binaries of that resource have been read and imported
            bm = item
            bm.kbid = kbid
            break

        # Bug fix: the item just read *is* the cloud file of the pair. The
        # previous code discarded it and issued another read_item(), which
        # desynced the (cloud file, binary) pairing and tripped the asserts.
        cf = item
        assert isinstance(cf, CloudFile)
        cf_binary = await reader.read_item()
        assert isinstance(cf_binary, CloudFileBinary)
        assert cf.uri == cf_binary.uri
        await import_binary(context, kbid, cf, cf_binary.read)

    await import_broker_message(context, kbid, bm)
232
+
233
+
234
async def restore_labels(context: ApplicationContext, kbid: str, backup_id: str):
    """Download the backed-up labels and apply them to the KB."""
    raw = await context.blob_storage.downloadbytes(
        bucket=settings.backups_bucket,
        key=StorageKeys.LABELS.format(kbid=kbid, backup_id=backup_id),
    )
    labels = kb_pb2.Labels()
    labels.ParseFromString(raw.getvalue())
    await set_labels(context, kbid, labels)
242
+
243
+
244
async def restore_entities(context: ApplicationContext, kbid: str, backup_id: str):
    """Download the backed-up entities groups and apply them to the KB."""
    raw = await context.blob_storage.downloadbytes(
        bucket=settings.backups_bucket,
        key=StorageKeys.ENTITIES.format(kbid=kbid, backup_id=backup_id),
    )
    entities = kb_pb2.EntitiesGroups()
    entities.ParseFromString(raw.getvalue())
    await set_entities_groups(context, kbid, entities)
@@ -0,0 +1,37 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from pydantic import Field
22
+ from pydantic_settings import BaseSettings
23
+
24
+
25
class BackupSettings(BaseSettings):
    """
    Environment-driven configuration for the backups feature.
    """

    backups_bucket: str = Field(
        default="backups", description="The bucket where the backups are stored."
    )
    restore_resources_concurrency: int = Field(
        default=10, description="The number of concurrent resource restores."
    )
    backup_resources_concurrency: int = Field(
        default=10, description="The number of concurrent resource backups."
    )


# Module-level singleton read by the backup/restore/delete tasks.
settings = BackupSettings()
@@ -0,0 +1,127 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
from typing import Awaitable, Callable

from nucliadb.backups.const import BackupsNatsStream
from nucliadb.backups.create import backup_kb_task
from nucliadb.backups.delete import delete_backup, delete_backup_task
from nucliadb.backups.models import CreateBackupRequest, DeleteBackupRequest, RestoreBackupRequest
from nucliadb.backups.restore import restore_kb_task
from nucliadb.common.context import ApplicationContext
from nucliadb.tasks import create_consumer, create_producer
from nucliadb.tasks.consumer import NatsTaskConsumer
from nucliadb.tasks.producer import NatsTaskProducer
31
+
32
+
33
def creator_consumer() -> NatsTaskConsumer[CreateBackupRequest]:
    """Build the NATS consumer that runs backup creation tasks."""
    consumer: NatsTaskConsumer = create_consumer(
        name="backup_creator",
        stream=BackupsNatsStream.name,
        stream_subjects=BackupsNatsStream.stream_subjects,
        consumer_subject=BackupsNatsStream.create_subject,
        callback=backup_kb_task,
        msg_type=CreateBackupRequest,
        # Allow several backups to be created in parallel.
        max_concurrent_messages=10,
    )
    return consumer
44
+
45
+
46
async def create(kbid: str, backup_id: str) -> None:
    """Enqueue a backup creation task for the given KB."""
    producer: NatsTaskProducer[CreateBackupRequest] = create_producer(
        name="backup_creator",
        stream=BackupsNatsStream.name,
        stream_subjects=BackupsNatsStream.stream_subjects,
        producer_subject=BackupsNatsStream.create_subject,
        msg_type=CreateBackupRequest,
    )
    msg = CreateBackupRequest(
        kbid=kbid,
        backup_id=backup_id,
    )
    await producer.send(msg)
59
+
60
+
61
def restorer_consumer() -> NatsTaskConsumer[RestoreBackupRequest]:
    """Build the NATS consumer that runs backup restore tasks."""
    consumer: NatsTaskConsumer = create_consumer(
        name="backup_restorer",
        stream=BackupsNatsStream.name,
        stream_subjects=BackupsNatsStream.stream_subjects,
        consumer_subject=BackupsNatsStream.restore_subject,
        callback=restore_kb_task,
        msg_type=RestoreBackupRequest,
        # Allow several restores to run in parallel.
        max_concurrent_messages=10,
    )
    return consumer
72
+
73
+
74
async def restore(kbid: str, backup_id: str) -> None:
    """Enqueue a restore task for the given backup into the given KB."""
    producer: NatsTaskProducer[RestoreBackupRequest] = create_producer(
        name="backup_restorer",
        stream=BackupsNatsStream.name,
        stream_subjects=BackupsNatsStream.stream_subjects,
        producer_subject=BackupsNatsStream.restore_subject,
        msg_type=RestoreBackupRequest,
    )
    msg = RestoreBackupRequest(
        kbid=kbid,
        backup_id=backup_id,
    )
    await producer.send(msg)
87
+
88
+
89
def deleter_consumer() -> NatsTaskConsumer[DeleteBackupRequest]:
    """Build the NATS consumer that runs backup deletion tasks."""
    consumer: NatsTaskConsumer = create_consumer(
        name="backup_deleter",
        stream=BackupsNatsStream.name,
        stream_subjects=BackupsNatsStream.stream_subjects,
        consumer_subject=BackupsNatsStream.delete_subject,
        # Bug fix: the callback must accept (context, msg: DeleteBackupRequest)
        # like the sibling consumers. delete_backup takes (context,
        # backup_id: str) and would have treated the message object as the
        # backup id, producing a wrong storage prefix.
        callback=delete_backup_task,
        msg_type=DeleteBackupRequest,
        # Deletions are background cleanup; keep concurrency low.
        max_concurrent_messages=2,
    )
    return consumer
100
+
101
+
102
async def delete(backup_id: str) -> None:
    """Enqueue a deletion task for the given backup's storage files."""
    producer: NatsTaskProducer[DeleteBackupRequest] = create_producer(
        name="backup_deleter",
        stream=BackupsNatsStream.name,
        stream_subjects=BackupsNatsStream.stream_subjects,
        producer_subject=BackupsNatsStream.delete_subject,
        msg_type=DeleteBackupRequest,
    )
    msg = DeleteBackupRequest(
        backup_id=backup_id,
    )
    await producer.send(msg)
114
+
115
+
116
async def initialize_consumers(context: ApplicationContext) -> list[Callable[[], Awaitable[None]]]:
    """
    Start the creator, restorer, and deleter consumers and return their
    finalizer callables (to be awaited on shutdown).
    """
    consumers = [creator_consumer(), restorer_consumer(), deleter_consumer()]
    for consumer in consumers:
        await consumer.initialize(context)
    return [consumer.finalize for consumer in consumers]
@@ -0,0 +1,32 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.backups.const import StorageKeys
22
+ from nucliadb.backups.settings import settings
23
+ from nucliadb_utils.storages.storage import Storage
24
+
25
+
26
async def exists_backup(storage: Storage, backup_id: str) -> bool:
    """Return True if any object is stored under the backup's storage prefix."""
    backup_prefix = StorageKeys.BACKUP_PREFIX.format(backup_id=backup_id)
    async for _object in storage.iterate_objects(
        bucket=settings.backups_bucket,
        prefix=backup_prefix,
    ):
        # A single matching object is enough to know the backup exists.
        return True
    return False
@@ -40,7 +40,7 @@ from nucliadb_protos import resources_pb2, writer_pb2
40
40
  from nucliadb_utils.const import Streams
41
41
  from nucliadb_utils.transaction import MaxTransactionSizeExceededError
42
42
 
43
- BinaryStream = AsyncGenerator[bytes, None]
43
+ BinaryStream = AsyncIterator[bytes]
44
44
  BinaryStreamGenerator = Callable[[int], BinaryStream]
45
45
 
46
46
 
@@ -237,8 +237,11 @@ async def download_binary(
237
237
  context: ApplicationContext, cf: resources_pb2.CloudFile
238
238
  ) -> AsyncGenerator[bytes, None]:
239
239
  bucket_name = context.blob_storage.get_bucket_name_from_cf(cf)
240
+ downloaded_bytes = 0
240
241
  async for data in context.blob_storage.download(bucket_name, cf.uri):
241
242
  yield data
243
+ downloaded_bytes += len(data)
244
+ assert downloaded_bytes == cf.size, "Downloaded bytes do not match the expected size"
242
245
 
243
246
 
244
247
  async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
@@ -416,6 +419,8 @@ class ExportStreamReader:
416
419
  class TaskRetryHandler:
417
420
  """
418
421
  Class that wraps an import/export task and adds retry logic to it.
422
+
423
+ TODO: This should be refactored to use generic task retry logic at tasks/retries.py::TaskRetryHandler
419
424
  """
420
425
 
421
426
  def __init__(
nucliadb/ingest/app.py CHANGED
@@ -22,6 +22,7 @@ import importlib.metadata
22
22
  from typing import Awaitable, Callable
23
23
 
24
24
  from nucliadb import health
25
+ from nucliadb.backups.tasks import initialize_consumers as initialize_backup_consumers
25
26
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
26
27
  from nucliadb.common.context import ApplicationContext
27
28
  from nucliadb.common.nidx import start_nidx_utility
@@ -154,6 +155,7 @@ async def main_subscriber_workers(): # pragma: no cover
154
155
  await exports_consumer.initialize(context)
155
156
  imports_consumer = get_imports_consumer()
156
157
  await imports_consumer.initialize(context)
158
+ backup_consumers_finalizers = await initialize_backup_consumers(context)
157
159
 
158
160
  await run_until_exit(
159
161
  [
@@ -165,7 +167,10 @@ async def main_subscriber_workers(): # pragma: no cover
165
167
  metrics_server.shutdown,
166
168
  grpc_health_finalizer,
167
169
  context.finalize,
170
+ exports_consumer.finalize,
171
+ imports_consumer.finalize,
168
172
  ]
173
+ + backup_consumers_finalizers
169
174
  + finalizers
170
175
  )
171
176
 
@@ -216,6 +221,7 @@ def run_subscriber_workers() -> None: # pragma: no cover
216
221
  - audit fields subscriber
217
222
  - export/import subscriber
218
223
  - materializer subscriber
224
+ - backups subscribers
219
225
  """
220
226
  setup_configuration()
221
227
  asyncio.run(main_subscriber_workers())
@@ -20,6 +20,8 @@
20
20
  import uuid
21
21
  from typing import AsyncIterator
22
22
 
23
+ from nucliadb.backups import tasks as backup_tasks
24
+ from nucliadb.backups import utils as backup_utils
23
25
  from nucliadb.common import datamanagers
24
26
  from nucliadb.common.cluster.exceptions import AlreadyExists, EntitiesGroupNotFound
25
27
  from nucliadb.common.cluster.manager import get_nidx_fake_node
@@ -471,14 +473,37 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
471
473
  async def CreateBackup(
472
474
  self, request: backups_pb2.CreateBackupRequest, context=None
473
475
  ) -> backups_pb2.CreateBackupResponse:
474
- return backups_pb2.CreateBackupResponse()
476
+ if not await exists_kb(request.kbid):
477
+ return backups_pb2.CreateBackupResponse(
478
+ status=backups_pb2.CreateBackupResponse.Status.KB_NOT_FOUND
479
+ )
480
+ await backup_tasks.create(request.kbid, request.backup_id)
481
+ return backups_pb2.CreateBackupResponse(status=backups_pb2.CreateBackupResponse.Status.OK)
475
482
 
476
483
  async def DeleteBackup(
477
484
  self, request: backups_pb2.DeleteBackupRequest, context=None
478
485
  ) -> backups_pb2.DeleteBackupResponse:
479
- return backups_pb2.DeleteBackupResponse()
486
+ if not await backup_utils.exists_backup(self.storage, request.backup_id):
487
+ return backups_pb2.DeleteBackupResponse(
488
+ status=backups_pb2.DeleteBackupResponse.Status.OK,
489
+ )
490
+ await backup_tasks.delete(request.backup_id)
491
+ return backups_pb2.DeleteBackupResponse(status=backups_pb2.DeleteBackupResponse.Status.OK)
480
492
 
481
493
  async def RestoreBackup(
482
494
  self, request: backups_pb2.RestoreBackupRequest, context=None
483
495
  ) -> backups_pb2.RestoreBackupResponse:
484
- return backups_pb2.RestoreBackupResponse()
496
+ if not await exists_kb(request.kbid):
497
+ return backups_pb2.RestoreBackupResponse(
498
+ status=backups_pb2.RestoreBackupResponse.Status.NOT_FOUND
499
+ )
500
+ if not await backup_utils.exists_backup(self.storage, request.backup_id):
501
+ return backups_pb2.RestoreBackupResponse(
502
+ status=backups_pb2.RestoreBackupResponse.Status.NOT_FOUND
503
+ )
504
+ await backup_tasks.restore(request.kbid, request.backup_id)
505
+ return backups_pb2.RestoreBackupResponse(status=backups_pb2.RestoreBackupResponse.Status.OK)
506
+
507
+
508
async def exists_kb(kbid: str) -> bool:
    """Check whether a knowledge box with the given id exists."""
    found = await datamanagers.atomic.kb.exists_kb(kbid=kbid)
    return found
@@ -17,5 +17,5 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from .consumer import NatsTaskConsumer, create_consumer # noqa
21
- from .producer import NatsTaskProducer, create_producer # noqa
20
+ from .consumer import NatsTaskConsumer, create_consumer # noqa: F401
21
+ from .producer import NatsTaskProducer, create_producer # noqa: F401
@@ -0,0 +1,148 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import functools
21
+ import logging
22
+ from enum import Enum
23
+ from typing import Optional
24
+
25
+ from pydantic import BaseModel
26
+
27
+ from nucliadb.common.context import ApplicationContext
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ TASK_METADATA_MAINDB = "/kbs/{kbid}/tasks/{task_type}/{task_id}"
32
+
33
+
34
class TaskMetadata(BaseModel):
    """Persisted state of a retryable task, stored in the KV store by TaskRetryHandler."""

    class Status(Enum):
        RUNNING = "running"
        FAILED = "failed"
        COMPLETED = "completed"

    task_id: str
    status: Status
    # Number of failed attempts recorded so far.
    retries: int = 0
    # Error messages collected from failed attempts (pydantic copies this
    # mutable default per instance, so it is not shared).
    error_messages: list[str] = []
44
+
45
+
46
class TaskRetryHandler:
    """
    Class that wraps a task consumer function and handles retries by storing metadata in the KV store.

    Example:

        retry_handler = TaskRetryHandler(
            kbid="kbid",
            task_type="work",
            task_id="task_id",
            context=context,
        )

        @retry_handler.wrap
        async def my_task_consumer_func(kbid: str, task_id: str):
            pass

    """

    def __init__(
        self,
        kbid: str,
        task_type: str,
        task_id: str,
        context: ApplicationContext,
        max_retries: int = 5,
    ):
        self.kbid = kbid
        self.task_type = task_type
        self.task_id = task_id
        self.max_retries = max_retries
        self.context = context

    @property
    def metadata_key(self) -> str:
        # KV key under which this task's metadata is persisted.
        return TASK_METADATA_MAINDB.format(
            kbid=self.kbid, task_type=self.task_type, task_id=self.task_id
        )

    async def get_metadata(self) -> Optional[TaskMetadata]:
        """Fetch this task's metadata from the KV store, or None if never scheduled."""
        async with self.context.kv_driver.transaction(read_only=True) as txn:
            metadata = await txn.get(self.metadata_key)
            if metadata is None:
                return None
            return TaskMetadata.model_validate_json(metadata)

    async def set_metadata(self, metadata: TaskMetadata) -> None:
        """Persist this task's metadata to the KV store."""
        async with self.context.kv_driver.transaction() as txn:
            await txn.set(self.metadata_key, metadata.model_dump_json().encode())
            await txn.commit()

    def wrap(self, func):
        """Decorate an async task function with retry bookkeeping.

        Skips execution when the task is already COMPLETED or FAILED, marks it
        FAILED once max_retries is reached, records the error and re-raises on
        failure (so the message is redelivered), and marks COMPLETED on success.
        """

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            func_result = None
            metadata = await self.get_metadata()
            if metadata is None:
                # Task is not scheduled yet
                metadata = TaskMetadata(
                    task_id=self.task_id,
                    status=TaskMetadata.Status.RUNNING,
                    retries=0,
                )
                await self.set_metadata(metadata)

            if metadata.status in (TaskMetadata.Status.COMPLETED, TaskMetadata.Status.FAILED):
                # Bug fix: original read `self.type`, which does not exist and
                # raised AttributeError whenever a finished task was re-delivered.
                logger.info(
                    "%s task is %s. Skipping",
                    self.task_type,
                    metadata.status.value,
                    extra={"kbid": self.kbid, "task_type": self.task_type, "task_id": self.task_id},
                )
                return

            if metadata.retries >= self.max_retries:
                metadata.status = TaskMetadata.Status.FAILED
                metadata.error_messages.append("Max retries reached")
                logger.info(
                    "Task reached max retries. Setting to FAILED state",
                    extra={"kbid": self.kbid, "task_type": self.task_type, "task_id": self.task_id},
                )
                await self.set_metadata(metadata)
                return
            try:
                metadata.status = TaskMetadata.Status.RUNNING
                func_result = await func(*args, **kwargs)
            except Exception as ex:
                metadata.retries += 1
                metadata.error_messages.append(str(ex))
                logger.info(
                    "Task failed. Will be retried",
                    extra={"kbid": self.kbid, "task_type": self.task_type, "task_id": self.task_id},
                )
                raise
            else:
                logger.info(
                    "Task finished successfully",
                    extra={"kbid": self.kbid, "task_type": self.task_type, "task_id": self.task_id},
                )
                metadata.status = TaskMetadata.Status.COMPLETED
                return func_result
            finally:
                # Always persist the latest state (retries/error messages/status).
                await self.set_metadata(metadata)

        return wrapper
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: nucliadb
3
- Version: 6.3.1.post3526
3
+ Version: 6.3.1.post3544
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.3.1.post3526
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.1.post3526
25
- Requires-Dist: nucliadb-protos>=6.3.1.post3526
26
- Requires-Dist: nucliadb-models>=6.3.1.post3526
27
- Requires-Dist: nidx-protos>=6.3.1.post3526
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.3.1.post3544
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.1.post3544
25
+ Requires-Dist: nucliadb-protos>=6.3.1.post3544
26
+ Requires-Dist: nucliadb-models>=6.3.1.post3544
27
+ Requires-Dist: nidx-protos>=6.3.1.post3544
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn
@@ -39,6 +39,15 @@ nucliadb/learning_proxy.py,sha256=Gf76qXxjl1lrHEFaCpOUfjjf0ab6eGLNxLMJz3-M_mo,19
39
39
  nucliadb/metrics_exporter.py,sha256=6u0geEYFxgE5I2Fhl_sxsvGN-ZkaFZNGutSXwrzrsVs,5624
40
40
  nucliadb/openapi.py,sha256=wDiw0dVEvTpJvbatkJ0JZLkKm9RItZT5PWRHjqRfqTA,2272
41
41
  nucliadb/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
+ nucliadb/backups/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
43
+ nucliadb/backups/const.py,sha256=llb5TaC53Ce6BMmlPKPUrVhVGl7uQrqv_Vle-P4GET4,1673
44
+ nucliadb/backups/create.py,sha256=4RsvwY808X22kr06_uiASz6hEqmMirWQZgA_RoTbkqw,10440
45
+ nucliadb/backups/delete.py,sha256=1rnBhVUGYYZJXSZUrrgYMDZ5NyswEWkIA-G-crRCyHk,2404
46
+ nucliadb/backups/models.py,sha256=13-Z4p-Ypjdtg5NuDE2m-09CTdFYHh-W6U9FyWSEhPA,1270
47
+ nucliadb/backups/restore.py,sha256=X-Ai5HjujNWIjqxegDaJp33dSUIDaTzJ3K8n_heeDeo,9702
48
+ nucliadb/backups/settings.py,sha256=SyzsInj1BRbBI0atg5IXWbMbOZ_eVg4eSQ3IcnUhCxQ,1357
49
+ nucliadb/backups/tasks.py,sha256=e0J85c7RjqYO92hcG9GT_g_LK-enyisWuSWxAUl5IZE,4528
50
+ nucliadb/backups/utils.py,sha256=ayDaxfWP5cPnAkQH-tF4M6cnowsPQgU2ljYz_iL1CbE,1249
42
51
  nucliadb/common/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
43
52
  nucliadb/common/constants.py,sha256=QpigxJh_CtD85Evy0PtV5cVq6x0U_f9xfIcXz1ymkUg,869
44
53
  nucliadb/common/counters.py,sha256=8lOi3A2HeLDDlcNaS2QT1SfD3350VPBjiY3FkmHH1V8,977
@@ -101,9 +110,9 @@ nucliadb/export_import/exporter.py,sha256=k2QVx1EjqFlDYiggriWiEJzwtMXzHbldsqWdpG
101
110
  nucliadb/export_import/importer.py,sha256=v5cq9Nn8c2zrY_K_00mydR52f8mdFxR7tLdtNLQ0qvk,4229
102
111
  nucliadb/export_import/models.py,sha256=dbjScNkiMRv4X3Ktudy1JRliD25bfoDTy3JmEZgQSCc,2121
103
112
  nucliadb/export_import/tasks.py,sha256=yPNdBdvTD7eGc7zvV9Rp7UZ0-mDhA34OOsLqHvns_v0,2975
104
- nucliadb/export_import/utils.py,sha256=t7xLA3f5W3zGq3HNXe3noOQnY7gRO8TAoe8S8BG3_so,19733
113
+ nucliadb/export_import/utils.py,sha256=iAQAjYuNx0dhM2b5-1A0NEs8tSRsznuT-izysUrTwS0,19986
105
114
  nucliadb/ingest/__init__.py,sha256=fsw3C38VP50km3R-nHL775LNGPpJ4JxqXJ2Ib1f5SqE,1011
106
- nucliadb/ingest/app.py,sha256=ErNd3q44xbBcUOl-Ae_lvcKPAsfFMSzb8zqWAjekrM4,7097
115
+ nucliadb/ingest/app.py,sha256=rX1KE5vsAzG9hlArBk8WE2SOlvdYylcb-jNkMQNPJdQ,7407
107
116
  nucliadb/ingest/cache.py,sha256=w7jMMzamOmQ7gwXna6Dqm6isRNBVv6l5BTBlTxaYWjE,1005
108
117
  nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE,2410
109
118
  nucliadb/ingest/processing.py,sha256=8OggvuxNzktTTKDTUwsIuazhDParEWhn46CBZaMYAy8,20659
@@ -145,7 +154,7 @@ nucliadb/ingest/orm/processor/processor.py,sha256=oaiZ9HUszhUbvNMCmDq5Xj_jtXiCDj
145
154
  nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
146
155
  nucliadb/ingest/service/__init__.py,sha256=MME_G_ERxzJR6JW_hfE2qcfXpmpH1kdG-S0a-M0qRm8,2043
147
156
  nucliadb/ingest/service/exceptions.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
148
- nucliadb/ingest/service/writer.py,sha256=69xsQgoWngfZfWFTKPbePi-ya4fP5T7dTiomLr1gSYw,20953
157
+ nucliadb/ingest/service/writer.py,sha256=XISw3m5joj30gKe9CfpyybXMwgWOueAcgfiHRnX4Cqc,22311
149
158
  nucliadb/middleware/__init__.py,sha256=A8NBlBuEkunCFMKpR9gnfNELsVn0Plc55BIQMbWDM8Q,2202
150
159
  nucliadb/migrator/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
151
160
  nucliadb/migrator/command.py,sha256=dKbJ1tAmP6X4lMVRSSlz351euaqs2wBPpOczLjATUes,2089
@@ -260,11 +269,12 @@ nucliadb/standalone/versions.py,sha256=8CxNMNt2NgWM8ct50UsR4d44-ae7wtQI-sV-yGiFq
260
269
  nucliadb/standalone/static/favicon.ico,sha256=96pKGp6Sx457JkTfjy1dtApMhkitixfU6invCUGAYOU,2285
261
270
  nucliadb/standalone/static/index.html,sha256=PEZfuEQFYnYACAL1ceN8xC0im8lBrUx838RkE8tbvgA,3833
262
271
  nucliadb/standalone/static/logo.svg,sha256=-wQqSvPGTdlKjUP6pHE6kiq005pgYjDzp9nPl0X71Mk,2639
263
- nucliadb/tasks/__init__.py,sha256=vruCOMmCu0BcAZQrEinlgiQtiR1WYxSxvI5UsydAopc,963
272
+ nucliadb/tasks/__init__.py,sha256=oFJ3A8HD7w11mBu-IixYE_KxA7juMGlYQb7YD_y6WPM,975
264
273
  nucliadb/tasks/consumer.py,sha256=x-999Nsw6lBcKvyGyCGPiGP_naANVYMfl9M-u0U3mhY,7052
265
274
  nucliadb/tasks/logger.py,sha256=C7keOEO_mjLVp5VbqAZ2QXfqVB2Hot7NgBlUP_SDSMw,924
266
275
  nucliadb/tasks/models.py,sha256=qrZKi5DNDQ07waMsp5L4_Fi7WRs57YiO-kmXlrBzEAA,1168
267
276
  nucliadb/tasks/producer.py,sha256=w4R1YXgXtmCPGcoNNOr3qkqJYcHJtSmix-xjt7vsPqk,3261
277
+ nucliadb/tasks/retries.py,sha256=Zv-3Hys-SKayG9VQ7_7EIflkegE5j-xPGrf-nwaxsfY,5075
268
278
  nucliadb/tasks/utils.py,sha256=6tQVckqyzxv8PhVAd3ZqcMYpGcn73ZY6p1cpm1FxagA,1214
269
279
  nucliadb/tests/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
270
280
  nucliadb/tests/config.py,sha256=JN_Jhgj-fwM9_8IeO9pwxr6C1PiwRDrXxm67Y38rU30,2080
@@ -339,8 +349,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
339
349
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
340
350
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
341
351
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
342
- nucliadb-6.3.1.post3526.dist-info/METADATA,sha256=eyuN6ZoyLypgZCawSpKtbge-d3fzg3KatkY51mDYxAc,4291
343
- nucliadb-6.3.1.post3526.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
344
- nucliadb-6.3.1.post3526.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
345
- nucliadb-6.3.1.post3526.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
346
- nucliadb-6.3.1.post3526.dist-info/RECORD,,
352
+ nucliadb-6.3.1.post3544.dist-info/METADATA,sha256=vKxvrIe5oh3QKwI-JxIAA8NJ2V0bwXCdg6K_O4wOCFg,4291
353
+ nucliadb-6.3.1.post3544.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
354
+ nucliadb-6.3.1.post3544.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
355
+ nucliadb-6.3.1.post3544.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
356
+ nucliadb-6.3.1.post3544.dist-info/RECORD,,