nucliadb 6.3.1.post3524__py3-none-any.whl → 6.3.1.post3531__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
@@ -0,0 +1,41 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
class MaindbKeys:
    """
    Key templates for backup-related entries in the main key-value store.
    """

    # Serialized BackupMetadata for a backup in progress.
    METADATA = "kbs/{kbid}/backups/{backup_id}"
    # NOTE(review): this template uses "backup" (singular) while METADATA uses
    # "backups" — presumably intentional separate namespaces, but worth
    # confirming; changing either would orphan existing stored keys.
    LAST_RESTORED = "kbs/{kbid}/backup/{backup_id}/last_restored"
25
+
26
+
27
class StorageKeys:
    """
    Defines the key templates used to store backup files in the backups bucket of the storage.
    """

    # Prefix under which every object of a backup lives (used for bulk deletes).
    BACKUP_PREFIX = "backups/{backup_id}/"
    # Prefix under which all resource tar files of a backup live.
    RESOURCES_PREFIX = "backups/{backup_id}/resources/"
    # Tar file bundling one resource's broker message, cloud files and binaries.
    RESOURCE = "backups/{backup_id}/resources/{resource_id}.tar"
    # Serialized entities protobuf for the KB.
    ENTITIES = "backups/{backup_id}/entities.pb"
    # Serialized labels protobuf for the KB.
    LABELS = "backups/{backup_id}/labels.pb"
37
+
38
+
39
class BackupFinishedStream:
    """
    Stream and subject on which backup-creation-finished notifications are
    published (via the nats/jetstream connection held by the audit utility).
    """

    name = "backups"
    subject = "backups.creation_finished"
@@ -0,0 +1,277 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ import tarfile
22
+ from datetime import datetime, timezone
23
+ from typing import AsyncIterator, Optional
24
+
25
+ from nucliadb.backups.const import (
26
+ BackupFinishedStream,
27
+ MaindbKeys,
28
+ StorageKeys,
29
+ )
30
+ from nucliadb.backups.models import BackupMetadata, CreateBackupRequest
31
+ from nucliadb.backups.settings import settings
32
+ from nucliadb.common import datamanagers
33
+ from nucliadb.common.context import ApplicationContext
34
+ from nucliadb.export_import.utils import (
35
+ download_binary,
36
+ get_broker_message,
37
+ get_cloud_files,
38
+ get_entities,
39
+ get_labels,
40
+ )
41
+ from nucliadb.tasks.retries import TaskRetryHandler
42
+ from nucliadb_protos import backups_pb2, resources_pb2, writer_pb2
43
+ from nucliadb_utils.audit.stream import StreamAuditStorage
44
+ from nucliadb_utils.storages.storage import StorageField
45
+ from nucliadb_utils.utilities import get_audit
46
+
47
+
48
async def backup_kb_retried(context: ApplicationContext, msg: CreateBackupRequest):
    """
    Run the backup task for a KB, retrying on failure up to 5 times.
    """
    retries = TaskRetryHandler(
        kbid=msg.kbid,
        task_type="backup",
        task_id=msg.backup_id,
        context=context,
        max_retries=5,
    )

    @retries.wrap
    async def _wrapped(context: ApplicationContext, kbid: str, backup_id: str):
        await backup_kb(context, kbid, backup_id)

    await _wrapped(context, msg.kbid, msg.backup_id)
65
+
66
+
67
async def backup_kb(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Backs up a KB to the cloud storage.
    """
    # Order matters: the completion notification reads the metadata that the
    # final step deletes.
    steps = (
        backup_resources,
        backup_labels,
        backup_entities,
        notify_backup_completed,
        delete_metadata,
    )
    for step in steps:
        await step(context, kbid, backup_id)
76
+
77
+
78
async def backup_resources(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Back up all resources of a KB, resuming from previously stored metadata.

    On first run, the metadata is initialized with the full list of resource
    ids; on retries, only the resources still listed as missing are backed up.
    Resources are processed concurrently in batches of
    `settings.backup_resources_concurrency`, persisting progress after each
    batch so a crashed task can resume without redoing completed work.
    """
    metadata = await get_metadata(context, kbid, backup_id)
    if metadata is None:
        metadata = BackupMetadata(
            kbid=kbid,
            requested_at=datetime.now(tz=timezone.utc),
        )
        async for rid in datamanagers.resources.iterate_resource_ids(kbid=kbid):
            metadata.total_resources += 1
            metadata.missing_resources.append(rid)
        metadata.missing_resources.sort()
        await set_metadata(context, kbid, backup_id, metadata)

    tasks: list[asyncio.Task] = []
    backing_up: list[str] = []

    async def _flush_batch():
        # Await the in-flight batch, account its sizes, and persist progress.
        nonlocal tasks, backing_up
        resources_bytes = await asyncio.gather(*tasks)
        metadata.total_size += sum(resources_bytes)
        # Set membership avoids the O(len(missing) * len(batch)) list scan.
        done = set(backing_up)
        metadata.missing_resources = [rid for rid in metadata.missing_resources if rid not in done]
        await set_metadata(context, kbid, backup_id, metadata)
        tasks = []
        backing_up = []

    # Note: _flush_batch rebinds metadata.missing_resources to a new list;
    # this loop keeps iterating the original list object, so every resource
    # is still visited exactly once (same as the original implementation).
    for rid in metadata.missing_resources:
        tasks.append(asyncio.create_task(backup_resource(context, backup_id, kbid, rid)))
        backing_up.append(rid)
        if len(tasks) >= settings.backup_resources_concurrency:
            await _flush_batch()
    if tasks:
        # Flush the final, partially-filled batch.
        await _flush_batch()
111
+
112
+
113
async def backup_resource(context: ApplicationContext, backup_id: str, kbid: str, rid: str) -> int:
    """
    Backs up a resource to the blob storage service.
    Returns the size of the resource in bytes.
    """
    message = await get_broker_message(context, kbid, rid)
    if message is None:
        # Resource not found. May have been deleted while the backup was running.
        return 0
    return await backup_resource_with_binaries(context, backup_id, kbid, rid, message)
123
+
124
+
125
async def to_tar(name: str, size: int, chunks: AsyncIterator[bytes]) -> AsyncIterator[bytes]:
    """
    Wrap a chunk stream as a single tar archive member: yield the tar header,
    then the payload chunks unchanged, then zero padding up to the next
    512-byte block boundary so the output is valid tar content.
    """
    member = tarfile.TarInfo(name)
    member.size = size
    member.mtime = int(datetime.now().timestamp())
    member.mode = 0o644
    member.type = tarfile.REGTYPE
    yield member.tobuf(format=tarfile.GNU_FORMAT)
    async for piece in chunks:
        yield piece
    # Tar payloads are padded with NULs to a multiple of 512 bytes.
    padding = -size % 512
    if padding:
        yield b"\x00" * padding
141
+
142
+
143
async def backup_resource_with_binaries(
    context: ApplicationContext, backup_id: str, kbid: str, rid: str, bm: writer_pb2.BrokerMessage
) -> int:
    """
    Generate a tar file dynamically with the resource broker message and all its binary files,
    and stream it to the blob storage service. Returns the total size of the tar file in bytes.
    """
    # Accumulated by the generator below as chunks are yielded; only final
    # once upload_to_bucket has consumed the whole iterator.
    total_size = 0

    async def resource_data_iterator():
        """
        Each tar file will have the following structure:

        - cloud-files/{cloud_file.uri} (serialized resources_pb2.CloudFile)
        - binaries/{cloud_file.uri} (the actual binary content of the cloud file)
        - broker-message.pb

        The order is important because the restore process depends on it (needs to import
        the cloud files and its binaries first before the broker message).
        """
        nonlocal total_size

        for cloud_file in get_cloud_files(bm):
            serialized_cf = cloud_file.SerializeToString()

            # Closure over serialized_cf is safe despite the loop: each
            # cf_iterator is fully consumed by to_tar before the next
            # iteration rebinds serialized_cf.
            async def cf_iterator():
                yield serialized_cf

            # Member 1: the serialized CloudFile descriptor.
            async for chunk in to_tar(
                name=f"cloud-files/{cloud_file.uri}", size=len(serialized_cf), chunks=cf_iterator()
            ):
                yield chunk
                total_size += len(chunk)

            # Member 2: the binary content, streamed straight from storage.
            # NOTE(review): assumes cloud_file.size matches the actual binary
            # length — to_tar pads based on it, so a mismatch would corrupt
            # the tar stream.
            async for chunk in to_tar(
                name=f"binaries/{cloud_file.uri}",
                size=cloud_file.size,
                chunks=download_binary(context, cloud_file),
            ):
                yield chunk
                total_size += len(chunk)

        bm_serialized = bm.SerializeToString()

        async def bm_iterator():
            yield bm_serialized

        # Final member: the broker message itself (restore imports it last).
        async for chunk in to_tar(
            name="broker-message.pb", size=len(bm_serialized), chunks=bm_iterator()
        ):
            yield chunk
            total_size += len(chunk)

    await upload_to_bucket(
        context,
        resource_data_iterator(),
        key=StorageKeys.RESOURCE.format(kbid=kbid, backup_id=backup_id, resource_id=rid),
    )
    return total_size
202
+
203
+
204
async def backup_labels(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Serialize the KB's labels and store them in the backups bucket.
    """
    labels = await get_labels(context, kbid)
    payload = labels.SerializeToString()
    storage_key = StorageKeys.LABELS.format(kbid=kbid, backup_id=backup_id)
    await context.blob_storage.upload_object(
        bucket=settings.backups_bucket,
        key=storage_key,
        data=payload,
    )
211
+
212
+
213
async def backup_entities(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Serialize the KB's entities and store them in the backups bucket.
    """
    entities = await get_entities(context, kbid)
    payload = entities.SerializeToString()
    storage_key = StorageKeys.ENTITIES.format(kbid=kbid, backup_id=backup_id)
    await context.blob_storage.upload_object(
        bucket=settings.backups_bucket,
        key=storage_key,
        data=payload,
    )
220
+
221
+
222
async def get_metadata(
    context: ApplicationContext, kbid: str, backup_id: str
) -> Optional[BackupMetadata]:
    """
    Fetch the backup's metadata from maindb. Returns None if it doesn't exist.
    """
    key = MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id)
    async with context.kv_driver.transaction(read_only=True) as txn:
        raw = await txn.get(key)
        if raw is None:
            return None
        return BackupMetadata.model_validate_json(raw)
230
+
231
+
232
async def set_metadata(context: ApplicationContext, kbid: str, backup_id: str, metadata: BackupMetadata):
    """
    Persist the backup's metadata to maindb.
    """
    key = MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id)
    payload = metadata.model_dump_json().encode()
    async with context.kv_driver.transaction() as txn:
        await txn.set(key, payload)
        await txn.commit()
239
+
240
+
241
async def delete_metadata(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Remove the backup's metadata entry from maindb.
    """
    key = MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id)
    async with context.kv_driver.transaction() as txn:
        await txn.delete(key)
        await txn.commit()
245
+
246
+
247
async def upload_to_bucket(context: ApplicationContext, bytes_iterator: AsyncIterator[bytes], key: str):
    """
    Stream an iterator of byte chunks into the backups bucket under `key`.
    """
    storage = context.blob_storage
    bucket = settings.backups_bucket
    cf = resources_pb2.CloudFile(
        bucket_name=bucket,
        content_type="binary/octet-stream",
        source=resources_pb2.CloudFile.Source.EXPORT,
    )
    destination: StorageField = storage.field_klass(
        storage=storage, bucket=bucket, fullkey=key, field=cf
    )
    await storage.uploaditerator(bytes_iterator, destination, cf)
256
+
257
+
258
async def notify_backup_completed(context: ApplicationContext, kbid: str, backup_id: str):
    """
    Publish a BackupCreatedNotification on the backups stream.
    """
    audit = get_audit()
    if not isinstance(audit, StreamAuditStorage):
        # Also covers audit being None. We rely on the stream audit utility
        # as it already holds a connection to the idp nats server. If it's
        # not available, we can't send the notification.
        return
    metadata = await get_metadata(context, kbid, backup_id)
    if metadata is None:  # pragma: no cover
        raise ValueError("Backup metadata not found")
    event = backups_pb2.BackupCreatedNotification()
    event.finished_at.FromDatetime(datetime.now(tz=timezone.utc))
    event.kb_id = kbid
    event.backup_id = backup_id
    event.size = metadata.total_size
    event.resources = metadata.total_resources
    await audit.js.publish(
        BackupFinishedStream.subject,
        event.SerializeToString(),
        stream=BackupFinishedStream.name,
    )
@@ -0,0 +1,69 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
+ import asyncio
23
+
24
+ from nucliadb.backups.const import StorageKeys
25
+ from nucliadb.backups.models import DeleteBackupRequest
26
+ from nucliadb.backups.settings import settings
27
+ from nucliadb.common.context import ApplicationContext
28
+
29
+
30
async def delete_backup(context: ApplicationContext, msg: DeleteBackupRequest):
    """
    Deletes the backup files from the cloud storage.

    Objects are removed in batches of 1000, sleeping briefly between batches,
    until the backup's prefix is empty.
    """
    backup_id = msg.backup_id
    while await delete_n(context, backup_id, n=1000) > 0:
        await asyncio.sleep(1)
41
+
42
+
43
async def delete_n(context: ApplicationContext, backup_id: str, n: int) -> int:
    """
    Delete up to `n` objects under the backup's storage prefix.

    Deletions run concurrently in batches of at most `concurrent_batch_size`.
    Returns the number of objects scheduled for deletion (0 means there is
    nothing left under the prefix).
    """
    concurrent_batch_size = 50
    deleted = 0
    tasks: list[asyncio.Task] = []
    async for object_info in context.blob_storage.iterate_objects(
        bucket=settings.backups_bucket,
        prefix=StorageKeys.BACKUP_PREFIX.format(backup_id=backup_id),
    ):
        if deleted >= n:
            # Deleted enough objects
            break
        tasks.append(
            asyncio.create_task(
                context.blob_storage.delete_upload(
                    uri=object_info.name,
                    bucket_name=settings.backups_bucket,
                )
            )
        )
        deleted += 1
        # Fix: `>=` so a batch never exceeds concurrent_batch_size tasks
        # (the original `>` let batches grow to concurrent_batch_size + 1).
        if len(tasks) >= concurrent_batch_size:
            await asyncio.gather(*tasks)
            tasks = []
    if tasks:
        # Await the final, partially-filled batch.
        await asyncio.gather(*tasks)
    return deleted
@@ -0,0 +1,44 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from datetime import datetime
21
+
22
+ from pydantic import BaseModel
23
+
24
+
25
class CreateBackupRequest(BaseModel):
    """Task message requesting the creation of a backup of a KB."""

    kbid: str
    backup_id: str
28
+
29
+
30
class RestoreBackupRequest(BaseModel):
    """Task message requesting that a backup be restored onto a KB."""

    kbid: str
    backup_id: str
33
+
34
+
35
class DeleteBackupRequest(BaseModel):
    """Task message requesting the deletion of a backup's stored files."""

    backup_id: str
37
+
38
+
39
class BackupMetadata(BaseModel):
    """
    Progress and accounting state for a backup, stored serialized in maindb.
    """

    kbid: str
    # When the backup was requested (timezone-aware).
    requested_at: datetime
    # Number of resources in the KB when the backup started.
    total_resources: int = 0
    # Resource ids not yet backed up; drained as batches complete.
    # (Mutable default is safe here: pydantic copies defaults per instance.)
    missing_resources: list[str] = []
    # Accumulated size in bytes of the uploaded backup files.
    total_size: int = 0