pyresumable 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2025, UiO - University of Oslo
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,23 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyresumable
3
+ Version: 0.0.2
4
+ Summary: A library for creating files chunk-by-chunk
5
+ License: BSD-3-Clause
6
+ License-File: LICENSE
7
+ Author: Leon du Toit
8
+ Author-email: l.c.d.toit@usit.uio.no
9
+ Requires-Python: >=3.10,<4.0
10
+ Classifier: License :: OSI Approved :: BSD License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: Python :: 3.14
17
+ Requires-Dist: sqlalchemy (>=1.4.22,<2)
18
+ Project-URL: Repository, https://github.com/unioslo/pyresumable
19
+ Description-Content-Type: text/markdown
20
+
21
+ # pyresumable
22
+ A library for creating files chunk-by-chunk
23
+
@@ -0,0 +1,2 @@
1
+ # pyresumable
2
+ A library for creating files chunk-by-chunk
@@ -0,0 +1,24 @@
1
+ [tool.poetry]
2
+ name = "pyresumable"
3
+ version = "0.0.2"
4
+ description = "A library for creating files chunk-by-chunk"
5
+ authors = [
6
+ "Leon du Toit <l.c.d.toit@usit.uio.no>",
7
+ ]
8
+ license = "BSD-3-Clause"
9
+ readme = "README.md"
10
+ repository = "https://github.com/unioslo/pyresumable"
11
+ packages = [{include = "pyresumable"}]
12
+
13
+ [tool.poetry.dependencies]
14
+ python = "^3.10"
15
+ sqlalchemy = ">=1.4.22,<2"
16
+
17
+ [tool.poetry.group.dev.dependencies]
18
+ pytest = "^7.1.3"
19
+ debugpy = "^1.6.3"
20
+ debugpy-run = "^1.4"
21
+
22
+ [build-system]
23
+ requires = ["poetry-core>=1.0.0"]
24
+ build-backend = "poetry.core.masonry.api"
File without changes
@@ -0,0 +1,897 @@
1
+ import functools
2
+ import hashlib
3
+ import io
4
+ import logging
5
+ import os
6
+ import re
7
+ import shutil
8
+ import sqlite3
9
+ import stat
10
+ import uuid
11
+ from abc import ABC
12
+ from abc import abstractmethod
13
+ from contextlib import contextmanager
14
+ from typing import ContextManager
15
+ from typing import Union
16
+
17
+ import sqlalchemy
18
+ from sqlalchemy import create_engine
19
+ from sqlalchemy.exc import IntegrityError
20
+ from sqlalchemy.exc import OperationalError
21
+ from sqlalchemy.exc import StatementError
22
+ from sqlalchemy.orm import sessionmaker
23
+ from sqlalchemy.pool import QueuePool
24
+
25
# Loose UUID check: 32-36 chars drawn from hex digits and dashes.
# NOTE(review): the class is redundant (\d and 0-9 overlap) and the pattern
# is unanchored — `match` only requires a matching prefix; confirm intended.
_IS_VALID_UUID = re.compile(r"([a-f\d0-9-]{32,36})")
# Owner read/write only (0o600) — applied to chunk files and the per-owner db.
_RW______ = stat.S_IREAD | stat.S_IWRITE

logger = logging.getLogger(__name__)
29
+
30
+
31
class ResumableNotFoundError(Exception):
    """Raised when no resumable upload matches the given filename/upload id."""

    pass
33
+
34
+
35
class ResumableIncorrectChunkOrderError(Exception):
    """Signals that chunks were not sent in sequential order."""

    pass
37
+
38
+
39
+ def _atoi(text: str) -> Union[int, str]:
40
+ return int(text) if text.isdigit() else text
41
+
42
+
43
+ def _natural_keys(text: str) -> list:
44
+ """
45
+ alist.sort(key=_natural_keys) sorts in human order
46
+ http://nedbatchelder.com/blog/200712/human_sorting.html
47
+ """
48
+ return [_atoi(c) for c in re.split(r"(\d+)", text)]
49
+
50
+
51
+ def _resumables_cmp(a: tuple, b: tuple) -> int:
52
+ a_time = a[0]
53
+ b_time = b[0]
54
+ if a_time > b_time:
55
+ return -1
56
+ elif a_time < b_time:
57
+ return 1
58
+ else:
59
+ return 1
60
+
61
+
62
def db_init(
    path: str,
    name: str = "api-data.db",
    builtin: bool = False,
) -> Union[sqlalchemy.engine.Engine, sqlite3.Connection]:
    """
    Open (or create) the sqlite database used for upload accounting.

    Parameters
    ----------
    path: directory containing the database file
    name: database file name
    builtin: when True, use the stdlib sqlite3 module instead of sqlalchemy

    Returns
    -------
    a pooled sqlalchemy Engine, or a sqlite3.Connection when builtin=True
    """
    # os.path.join instead of manual "/" concatenation avoids doubled
    # separators when *path* already ends with one
    db_file = os.path.join(path, name)
    if not builtin:
        engine = create_engine("sqlite:///" + db_file, poolclass=QueuePool)
    else:
        engine = sqlite3.connect(db_file)
    return engine
74
+
75
+
76
@contextmanager
def session_scope(
    engine: sqlalchemy.engine.Engine,
) -> ContextManager[sqlalchemy.orm.session.Session]:
    """
    Provide a transactional scope around a series of db operations:
    commit on success, roll back and re-raise on known sqlalchemy
    errors, and always close the session.
    """
    session = sessionmaker(bind=engine)()
    try:
        yield session
        session.commit()
    except (OperationalError, IntegrityError, StatementError) as e:
        session.rollback()
        raise e
    finally:
        session.close()
90
+
91
+
92
def md5sum(filename: str, blocksize: int = 65536) -> str:
    """Return the hex md5 digest of *filename*, read in *blocksize* chunks."""
    digest = hashlib.md5()
    with open(filename, "rb") as f:
        while True:
            block = f.read(blocksize)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
98
+
99
+
100
class AbstractResumable(ABC):
    """
    Interface for creating files chunk-by-chunk (resumable uploads).

    Implementations provide the chunk lifecycle (prepare, open_file,
    add_chunk, close_file, merge_chunk, finalise) and management of
    in-progress uploads (list_all, info, delete).
    """

    def __init__(self, work_dir: str = None, owner: str = None) -> None:
        super().__init__()
        self.work_dir = work_dir  # directory where uploads are assembled
        self.owner = owner  # identity that owns the uploads

    @abstractmethod
    def prepare(
        self,
        work_dir: str,
        in_filename: str,
        url_chunk_num: str,
        url_upload_id: str,
        url_group: str,
        owner: str,
        key: str = None,
    ) -> tuple:
        """Validate/register an incoming chunk; return per-chunk bookkeeping."""
        raise NotImplementedError

    @abstractmethod
    def open_file(self, filename: str, mode: str) -> io.BufferedRandom:
        """Open the chunk file to which incoming data will be written."""
        raise NotImplementedError

    @abstractmethod
    def add_chunk(self, fd: io.BufferedRandom, chunk: bytes) -> None:
        """Write *chunk* to the open file object *fd*."""
        raise NotImplementedError

    @abstractmethod
    def close_file(self, filename: str) -> None:
        """Close the chunk file."""
        raise NotImplementedError

    @abstractmethod
    def merge_chunk(
        self,
        work_dir: str,
        last_chunk_filename: str,
        upload_id: str,
        owner: str,
    ) -> str:
        """Append the newest chunk to the merged file; return its path."""
        raise NotImplementedError

    @abstractmethod
    def finalise(
        self,
        work_dir: str,
        last_chunk_filename: str,
        upload_id: str,
        owner: str,
    ) -> str:
        """Complete the upload: move the merged file to its final path."""
        raise NotImplementedError

    @abstractmethod
    def list_all(
        self,
        work_dir: str,
        owner: str,
        key: str = None,
    ) -> dict:
        """Describe all in-progress resumables for the owner."""
        raise NotImplementedError

    @abstractmethod
    def info(
        self,
        work_dir: str,
        filename: str,
        upload_id: str,
        owner: str,
        key: str = None,
    ) -> dict:
        """Describe one in-progress resumable."""
        raise NotImplementedError

    @abstractmethod
    def delete(
        self,
        work_dir: str,
        filename: str,
        upload_id: str,
        owner: str,
    ) -> bool:
        """Remove an in-progress resumable and its accounting data."""
        raise NotImplementedError
180
+
181
+
182
+ class SerialResumable(AbstractResumable):
183
+ """
184
+ Class for creating files in a piecemeal fashion,
185
+ useful for resumable uploads, for example.
186
+
187
+ The following public methods are exposed:
188
+
189
+ a) for creating files incrementally:
190
+
191
+ prepare
192
+ open_file
193
+ add_chunk
194
+ close_file
195
+ merge_chunk
196
+ finalise
197
+
198
+ b) for managing files which are still being finalised:
199
+
200
+ list_all
201
+ info
202
+ delete
203
+
204
+ """
205
+
206
+ def __init__(self, work_dir: str = None, owner: str = None) -> None:
207
+ super().__init__(work_dir, owner)
208
+ self.work_dir = work_dir
209
+ self.owner = owner
210
+ self.engine = self._init_db(owner, work_dir)
211
+
212
+ def _init_db(
213
+ self,
214
+ owner: str,
215
+ work_dir: str,
216
+ ) -> Union[sqlalchemy.engine.Engine, sqlite3.Connection]:
217
+ dbname = "{}{}{}".format(".resumables-", owner, ".db")
218
+ rdb = db_init(work_dir, name=dbname)
219
+ db_path = f"{work_dir}/{dbname}"
220
+ if os.path.lexists(db_path):
221
+ os.chmod(db_path, _RW______)
222
+ return rdb
223
+
224
+ def prepare(
225
+ self,
226
+ work_dir: str,
227
+ in_filename: str,
228
+ url_chunk_num: str,
229
+ url_upload_id: str,
230
+ url_group: str,
231
+ owner: str,
232
+ key: str = None,
233
+ ) -> tuple:
234
+ """
235
+ The following cases are handled:
236
+
237
+ 1. First chunk
238
+ - check that the chunk has not already been uploaded
239
+ - a new upload id is generated
240
+ - the upload id is recorded as beloning to the authenticated owner
241
+ - a new working directory is created
242
+ - set completed_resumable_file to None
243
+
244
+ 2. Rest of the chunks
245
+ - ensure monotonically increasing chunk order
246
+ - set completed_resumable_file to None
247
+
248
+ 3. End request
249
+ - set completed_resumable_file to True
250
+
251
+ In all cases the function returns:
252
+ upload_id/filename.extention.chunk.num
253
+
254
+ """
255
+ chunk_num = int(url_chunk_num) if url_chunk_num != "end" else url_chunk_num
256
+ upload_id = str(uuid.uuid4()) if not url_upload_id else url_upload_id
257
+ chunk_filename = in_filename + ".chunk." + url_chunk_num
258
+ filename = upload_id + "/" + chunk_filename
259
+ if chunk_num == "end":
260
+ completed_resumable_file = True
261
+ chunk_order_correct = True
262
+ elif chunk_num == 1:
263
+ os.makedirs(work_dir + "/" + upload_id)
264
+ assert self._db_insert_new_for_owner(upload_id, url_group, key=key)
265
+ chunk_order_correct = True
266
+ completed_resumable_file = None
267
+ elif chunk_num > 1:
268
+ chunk_order_correct = self._refuse_upload_if_not_in_sequential_order(
269
+ work_dir, upload_id, chunk_num
270
+ )
271
+ completed_resumable_file = None
272
+ return (
273
+ chunk_num,
274
+ upload_id,
275
+ completed_resumable_file,
276
+ chunk_order_correct,
277
+ filename,
278
+ )
279
+
280
    def open_file(self, filename: str, mode: str) -> io.BufferedRandom:
        """Open *filename* in *mode* and restrict it to owner read/write."""
        fd = open(filename, mode)
        os.chmod(filename, _RW______)
        return fd
284
+
285
+ def add_chunk(self, fd: io.BufferedRandom, chunk: bytes) -> None:
286
+ if not fd:
287
+ return
288
+ else:
289
+ fd.write(chunk)
290
+
291
    def close_file(self, fd: io.BufferedRandom) -> None:
        """
        Close the open file object *fd*, ignoring falsy values.

        NOTE(review): the abstract signature declares close_file(filename: str),
        but this implementation receives the file object returned by open_file;
        the (wrong) `fd: str` annotation is corrected here to match actual use.
        """
        if fd:
            fd.close()
294
+
295
+ def _refuse_upload_if_not_in_sequential_order(
296
+ self,
297
+ work_dir: str,
298
+ upload_id: str,
299
+ chunk_num: int,
300
+ ) -> bool:
301
+ chunk_order_correct = True
302
+ full_chunks_on_disk = self._get_full_chunks_on_disk(work_dir, upload_id)
303
+ previous_chunk_num = int(full_chunks_on_disk[-1].split(".chunk.")[-1])
304
+ if chunk_num <= previous_chunk_num or (chunk_num - previous_chunk_num) >= 2:
305
+ chunk_order_correct = False
306
+ logger.error("chunks must be uploaded in sequential order")
307
+ return chunk_order_correct
308
+
309
+ def _find_nth_chunk(
310
+ self,
311
+ work_dir: str,
312
+ upload_id: str,
313
+ filename: str,
314
+ n: int,
315
+ ) -> str:
316
+ n = n - 1 # chunk numbers start at 1, but keep 0-based for the signaure
317
+ current_resumable = f"{work_dir}/{upload_id}"
318
+ files = os.listdir(current_resumable)
319
+ files.sort(key=_natural_keys)
320
+ completed_chunks = [f for f in files if ".part" not in f]
321
+ out = completed_chunks[n] if completed_chunks else ""
322
+ return out
323
+
324
    def _find_relevant_resumable_dir(
        self,
        work_dir: str,
        filename: str,
        upload_id: str,
        key: str = None,
    ) -> str:
        """
        If the client provides an upload_id, then the exact folder is returned.
        If no upload_id is provided, e.g. when the upload_id is lost, then
        the server will try to find a match, based on the filename, returning
        the most recent entry.

        Returns
        -------
        str, upload_id (name of the directory), or None when nothing matches

        """
        relevant = None
        potential_resumables = self._db_get_all_resumable_ids_for_owner(key=key)
        if not upload_id:
            logger.info("Trying to find a matching resumable for %s", filename)
            candidates = []
            for item in potential_resumables:
                pr = item[0]  # rows look like [(id,), ...]
                current_pr = f"{work_dir}/{pr}"
                if _IS_VALID_UUID.match(pr) and os.path.lexists(current_pr):
                    candidates.append((os.stat(current_pr).st_mtime, pr))
            # newest first (see _resumables_cmp)
            candidates = sorted(candidates, key=functools.cmp_to_key(_resumables_cmp))
            for cand in candidates:
                upload_id = cand[1]
                # match on the first completed chunk's name
                first_chunk = self._find_nth_chunk(work_dir, upload_id, filename, 1)
                if filename in first_chunk:
                    relevant = cand[1]
                    break
        else:
            for item in potential_resumables:
                pr = item[0]
                current_pr = f"{work_dir}/{pr}"  # NOTE(review): unused in this branch
                if _IS_VALID_UUID.match(pr) and str(upload_id) == str(pr):
                    relevant = pr
        return relevant
366
+
367
    def list_all(
        self,
        work_dir: str,
        owner: str,
        key: str = None,
    ) -> dict:
        """
        Describe all in-progress resumable uploads registered in the
        per-owner db, optionally filtered by *key*.

        Returns
        -------
        dict, {"resumables": [{chunk_size, max_chunk, md5sum, previous_offset,
               next_offset, id, filename, group, key}, ...]}
        """
        potential_resumables = self._db_get_all_resumable_ids_for_owner(key=key)
        info = []
        for item in potential_resumables:
            chunk_size = None
            pr = item[0]  # rows look like [(id,), ...]
            current_pr = f"{work_dir}/{pr}"
            if _IS_VALID_UUID.match(pr):
                try:
                    (
                        chunk_size,
                        max_chunk,
                        md5sum,  # NOTE(review): shadows the module-level md5sum()
                        previous_offset,
                        next_offset,
                        warning,
                        recommendation,
                        filename,
                    ) = self._get_resumable_chunk_info(current_pr, work_dir)
                    if recommendation == "end":
                        next_offset = "end"
                # NOTE(review): (OSError, Exception) is effectively just
                # Exception — best-effort skip of broken entries
                except (OSError, Exception):
                    pass
                if chunk_size:
                    try:
                        group = self._db_get_group(pr)
                    except Exception:
                        group = None
                    # NOTE(review): re-binding `key` clobbers the parameter;
                    # only safe because it is not read again after the
                    # initial query above
                    key = None
                    try:
                        key = self._db_get_key(pr)
                    except Exception:  # for transition
                        pass
                    info.append(
                        {
                            "chunk_size": chunk_size,
                            "max_chunk": max_chunk,
                            "md5sum": md5sum,
                            "previous_offset": previous_offset,
                            "next_offset": next_offset,
                            "id": pr,
                            "filename": filename,
                            "group": group,
                            "key": key,
                        }
                    )
        return {"resumables": info}
419
+
420
    def _repair_inconsistent_resumable(
        self,
        merged_file: str,
        chunks: list,
        merged_file_size: int,
        sum_chunks_size: int,
    ) -> tuple:
        """
        If the server process crashed after a chunk was uploaded,
        but while a merge was taking place, it is likely that
        the merged file will be smaller than the sum of the chunks.

        In that case, we try to re-merge the last chunk into the file
        and return the resumable info after that. If the merged file
        is _larger_ than the sum of the chunks, then a merge has taken
        place more than once, and it is best for the client to either
        end or delete the upload. If nothing can be done then the client
        is encouraged to end the upload.

        NOTE(review): the return shape is inconsistent — False when there
        are no chunks, the bare chunks list when sizes already agree, a
        (chunks, warning, recommendation) 3-tuple otherwise, and an
        implicit None when merged_file_size > sum_chunks_size (the repair
        condition is false and no exception fires). The caller copes by
        catching the resulting unpacking errors — confirm before changing.
        """
        logger.info(
            "current merged file size: %d, current sum of chunks in db %d",
            merged_file_size,
            sum_chunks_size,
        )
        if len(chunks) == 0:
            return False
        else:
            last_chunk = chunks[-1]
            last_chunk_size = os.stat(last_chunk).st_size
            if merged_file_size == sum_chunks_size:
                logger.info("server-side data consistent")
                return chunks
            try:
                warning = None
                recommendation = None
                diff = sum_chunks_size - merged_file_size
                if (merged_file_size < sum_chunks_size) and (diff <= last_chunk_size):
                    # drop the (possibly partial) last merge, then re-append
                    # the last chunk in full
                    target_size = sum_chunks_size - last_chunk_size
                    with open(merged_file, "ab") as f:
                        f.truncate(target_size)
                    with open(merged_file, "ab") as fout:
                        with open(last_chunk, "rb") as fin:
                            shutil.copyfileobj(fin, fout)
                    new_merged_size = os.stat(merged_file).st_size
                    logger.info(
                        "merged file after repair: %d sum of chunks: %d",
                        new_merged_size,
                        sum_chunks_size,
                    )
                    if new_merged_size == sum_chunks_size:
                        return chunks, warning, recommendation
                    else:
                        raise Exception("could not repair data")
            except (Exception, OSError) as e:
                logger.error(e)
                # give up: advise the client to end the upload
                return chunks, "not sure what to do", "end"
477
+
478
+ def _get_resumable_chunk_info(self, resumable_dir: str, work_dir: str) -> tuple:
479
+ """
480
+ Get information needed to resume an upload.
481
+ If the server-side data is inconsistent, then
482
+ we try to fix it by successively dropping the last
483
+ chunk and truncating the merged file.
484
+
485
+ Returns
486
+ -------
487
+ tuple, (size, chunknum, md5sum, previous_offset, next_offset, key)
488
+
489
+ """
490
+
491
+ def info(
492
+ chunks: list, recommendation: str = None, warning: str = None
493
+ ) -> tuple:
494
+ num = int(chunks[-1].split(".")[-1])
495
+ latest_size = _bytes(chunks[-1])
496
+ upload_id = os.path.basename(resumable_dir)
497
+ next_offset = self._db_get_total_size(upload_id)
498
+ previous_offset = next_offset - latest_size
499
+ filename = os.path.basename(chunks[-1].split(".chunk")[0])
500
+ merged_file = os.path.normpath(work_dir + "/" + filename + "." + upload_id)
501
+ try:
502
+ # check that the size of the merge file
503
+ # matches what we calculate from the
504
+ # chunks recorded in the resumable db
505
+ assert _bytes(merged_file) == next_offset
506
+ except AssertionError:
507
+ try:
508
+ logger.info("trying to repair inconsistent data")
509
+ (
510
+ chunks,
511
+ warning,
512
+ recommendation,
513
+ ) = self._repair_inconsistent_resumable(
514
+ merged_file,
515
+ chunks,
516
+ _bytes(merged_file),
517
+ next_offset,
518
+ )
519
+ return info(chunks)
520
+ except Exception as e:
521
+ logger.error(e)
522
+ return None, None, None, None, None, None, None, None
523
+ return (
524
+ latest_size,
525
+ num,
526
+ md5sum(chunks[-1]),
527
+ previous_offset,
528
+ next_offset,
529
+ recommendation,
530
+ warning,
531
+ filename,
532
+ )
533
+
534
+ def _bytes(chunk: str) -> int:
535
+ size = os.stat(chunk).st_size
536
+ return size
537
+
538
+ # may contain partial files, due to failed requests
539
+ all_chunks = [f"{resumable_dir}/{i}" for i in os.listdir(resumable_dir)]
540
+ all_chunks.sort(key=_natural_keys)
541
+ chunks = [c for c in all_chunks if ".part" not in c]
542
+ return info(chunks)
543
+
544
    def info(
        self,
        work_dir: str,
        filename: str,
        upload_id: str,
        owner: str,
        key: str = None,
    ) -> dict:
        """
        Describe the state of one in-progress resumable upload so a
        client can resume from the correct offset.

        Raises
        ------
        ResumableNotFoundError, when no matching upload directory exists
        """
        relevant_dir = self._find_relevant_resumable_dir(
            work_dir, filename, upload_id, key=key
        )
        if not relevant_dir:
            logger.error("No resumable found for: %s", filename)
            raise ResumableNotFoundError
        resumable_dir = f"{work_dir}/{relevant_dir}"
        (
            chunk_size,
            max_chunk,
            md5sum,  # NOTE(review): shadows the module-level md5sum()
            previous_offset,
            next_offset,
            warning,
            recommendation,
            filename,
        ) = self._get_resumable_chunk_info(resumable_dir, work_dir)
        identifier = upload_id if upload_id else relevant_dir
        try:
            group = self._db_get_group(identifier)
        except Exception:
            group = None
        try:
            key = self._db_get_key(identifier)
        except Exception:  # for transition
            key = None
        if recommendation == "end":
            # advise the client to finish the upload rather than resume
            next_offset = "end"
        info = {
            "filename": filename,
            "id": relevant_dir,
            "chunk_size": chunk_size,
            "max_chunk": max_chunk,
            "md5sum": md5sum,
            "previous_offset": previous_offset,
            "next_offset": next_offset,
            "warning": warning,
            "group": group,
            "key": key,
        }
        return info
593
+
594
+ def _get_full_chunks_on_disk(self, work_dir: str, upload_id: str) -> list:
595
+ chunks_on_disk = os.listdir(work_dir + "/" + upload_id)
596
+ chunks_on_disk.sort(key=_natural_keys)
597
+ full_chunks_on_disk = [
598
+ c for c in chunks_on_disk if (".part" not in c and ".chunk" in c)
599
+ ]
600
+ return full_chunks_on_disk
601
+
602
+ def delete(
603
+ self,
604
+ work_dir: str,
605
+ filename: str,
606
+ upload_id: str,
607
+ owner: str,
608
+ ) -> bool:
609
+ try:
610
+ assert self._db_upload_belongs_to_owner(upload_id), (
611
+ "upload does not belong to user"
612
+ )
613
+ relevant_dir = work_dir + "/" + upload_id
614
+ relevant_merged_file = work_dir + "/" + filename + "." + upload_id
615
+ shutil.rmtree(relevant_dir)
616
+ os.remove(relevant_merged_file)
617
+ assert self._db_remove_completed_for_owner(upload_id), (
618
+ "could not remove data from resumables db"
619
+ )
620
+ return True
621
+ except (Exception, AssertionError) as e:
622
+ logger.error(e)
623
+ logger.error("could not complete resumable deletion")
624
+ return False
625
+
626
    def finalise(
        self,
        work_dir: str,
        last_chunk_filename: str,
        upload_id: str,
        owner: str,
    ) -> str:
        """
        Complete an upload on the ".chunk.end" request: move the merged
        file to its final path (without the upload_id suffix), remove the
        chunk directory, and drop the upload's db accounting data.

        Returns the final path; called on a non-end chunk it only logs an
        error and returns the would-be final path unchanged.

        Raises
        ------
        ResumableNotFoundError, when neither the merged file nor the final
        file exists
        """
        assert ".part" not in last_chunk_filename
        filename = os.path.basename(last_chunk_filename.split(".chunk")[0])
        out = os.path.normpath(work_dir + "/" + filename + "." + upload_id)
        final = out.replace("." + upload_id, "")
        chunks_dir = work_dir + "/" + upload_id
        if ".chunk.end" in last_chunk_filename:
            logger.info("deleting: %s", chunks_dir)
            try:
                os.rename(out, final)
            except FileNotFoundError as e:
                logger.error(e)
                # a retried end request: the move already happened
                if not os.path.exists(out) and os.path.exists(final):
                    logger.warning(
                        f"resumable upload '{upload_id}' has already been moved to its final path '{final}'"
                    )
                else:
                    raise ResumableNotFoundError
            try:
                shutil.rmtree(
                    chunks_dir
                )  # do not need to fail upload if this does not work
            except OSError as e:
                logger.error(e)
            assert self._db_remove_completed_for_owner(upload_id)
        else:
            logger.error("finalise called on non-end chunk")
        return final
660
+
661
    def merge_chunk(
        self,
        work_dir: str,
        last_chunk_filename: str,
        upload_id: str,
        owner: str,
    ) -> str:
        """
        Merge chunks into one file, _in order_.

        Sequence
        --------
        1. Check that the chunk is not partial
        2. If last request
            - remove any remaining chunks, and the working directory
            - continue to the chowner: move file, set permissions
        3. If new chunk
            - if chunk_num > 1, create a lockfile - link to a unique file (NFS-safe method)
            - append it to the merge file
            - remove chunks older than 5 requests back in the sequence
              to avoid using lots of disk space for very large files
            - update the resumable's info table
        4. If a merge fails
            - remove the chunk
            - reset the file to its prior size
            - end the request
        5. Finally
            - unlink any existing lock

        Note
        ----
        This will produce bizarre files if clients send chunks out of order,
        which rules out multi-threaded senders. That can be supported by delaying
        the merge until the final request. Until a feature request arrives,
        it remains unimplemented.

        Note
        ----
        If removing a chunk with the `os.remove(chunk)` call fails, we've got
        ourselves a case of an invalid/stale chunk file. We can't remove the file,
        obviously -- `os.remove` did fail, after all. We have now a mandatory
        choice to make, between returning success and returning an error to the
        client. Returning success would be dangerous because the error we're dealing
        with that prompted us to try remove the chunk implies the chunk may be
        invalid due to not having been written entirely or something else that went
        wrong that caused the particular except block to apply. So we assume an
        error must be returned to the client. We shouldn't return a _client_ error
        code, because the client isn't at fault here -- something went wrong on our
        side of things. Assuming thus that a _server_ error is returned, since a
        client may rightfully assume some _intermittent_ server error condition,
        they may be inclined to retry the request. The environment certainly helps
        make this an attractive error handling strategy for us -- what with sporadic
        NFS I/O issues and other things that may crop up from one request to the
        next. However, since the removal of the [stale] chunk file would have failed
        during the first request, the retried request will fail the second time, but
        now with the API deeming it a _client_ error -- because the API assumes that
        if a chunk file is present then the chunk already is stored and repeating
        with the same chunk number is a clear case of chunk number order violation,
        something the API would rightfully attribute as cause to the client.

        """
        assert ".part" not in last_chunk_filename
        filename = os.path.basename(last_chunk_filename.split(".chunk")[0])
        out = os.path.normpath(work_dir + "/" + filename + "." + upload_id)
        out_lock = out + ".lock"  # hard link to the merge file, used as a lock
        final = out.replace("." + upload_id, "")
        chunks_dir = work_dir + "/" + upload_id
        chunk_num = int(last_chunk_filename.split(".chunk.")[-1])
        chunk = chunks_dir + "/" + last_chunk_filename
        try:
            if chunk_num > 1:
                # os.link fails if the lock already exists (NFS-safe locking)
                os.link(out, out_lock)
            with open(out, "ab") as fout:
                with open(chunk, "rb") as fin:
                    size_before_merge = os.stat(out).st_size
                    shutil.copyfileobj(fin, fout)
            chunk_size = os.stat(chunk).st_size
            assert self._db_update_with_chunk_info(upload_id, chunk_num, chunk_size)
        except Exception as e:
            logger.error(e)
            os.remove(chunk)
            # NOTE(review): size_before_merge is unbound here if open(out)
            # itself failed before the assignment — confirm that path
            with open(out, "ab") as fout:
                fout.truncate(size_before_merge)
            raise e
        finally:
            if chunk_num > 1:
                try:
                    os.unlink(out_lock)
                except Exception as e:
                    logging.exception(e)
            if chunk_num >= 5:
                # keep only the last few chunks on disk to cap space usage
                target_chunk_num = chunk_num - 4
                old_chunk = chunk.replace(
                    ".chunk." + str(chunk_num), ".chunk." + str(target_chunk_num)
                )
                try:
                    os.remove(old_chunk)
                except Exception as e:
                    logger.error(e)
        return final
761
+
762
    def _db_insert_new_for_owner(
        self,
        resumable_id: str,
        group: str,
        key: str = None,
    ) -> bool:
        """
        Register a new resumable upload: ensure the accounting table
        exists (and has the key column), insert the upload's row, and
        create the upload's own per-chunk table.

        A backwards incompatible change introduced the key column
        to the resumable_uploads table. This is why existing tables'
        columns are altered.
        """
        resumable_accounting_table = "resumable_uploads"
        resumable_table = f"resumable_{resumable_id}"
        with session_scope(self.engine) as session:
            resumables_table_exists = False
            current_tables = session.execute(
                "select name FROM sqlite_master where type = 'table'"
            ).fetchall()
            if len(current_tables) == 0:
                pass
            else:
                for table in current_tables:
                    if resumable_accounting_table in table[0]:
                        resumables_table_exists = True
                        break
            if not resumables_table_exists:
                session.execute(
                    f"""
                    create table if not exists {resumable_accounting_table}(
                        id text,
                        upload_group text,
                        key text
                    )"""
                )
            else:
                # migration: older deployments lack the key column
                try:
                    session.execute(
                        f"""
                        alter table {resumable_accounting_table} add column key text"""
                    )
                except OperationalError:
                    pass  # ^ already altered the table before
            session.execute(
                f"""
                insert into {resumable_accounting_table} (id, upload_group, key)
                values (:resumable_id, :upload_group, :key)""",
                {"resumable_id": resumable_id, "upload_group": group, "key": key},
            )
            # per-upload table recording each merged chunk's number and size
            session.execute(
                f"""
                create table "{resumable_table}"(chunk_num int, chunk_size int)"""
            )
        return True
815
+
816
+ def _db_update_with_chunk_info(
817
+ self,
818
+ resumable_id: str,
819
+ chunk_num: int,
820
+ chunk_size: int,
821
+ ) -> bool:
822
+ resumable_table = f"resumable_{resumable_id}"
823
+ with session_scope(self.engine) as session:
824
+ session.execute(
825
+ f"""
826
+ insert into "{resumable_table}"(chunk_num, chunk_size)
827
+ values (:chunk_num, :chunk_size)""",
828
+ {"chunk_num": chunk_num, "chunk_size": chunk_size},
829
+ )
830
+ return True
831
+
832
+ def _db_pop_chunk(self, resumable_id: str, chunk_num: int) -> bool:
833
+ resumable_table = f"resumable_{resumable_id}"
834
+ with session_scope(self.engine) as session:
835
+ session.execute(
836
+ f"""
837
+ delete from "{resumable_table}"
838
+ where chunk_num = :chunk_num""",
839
+ {"chunk_num": chunk_num},
840
+ )
841
+ return True
842
+
843
+ def _db_get_total_size(self, resumable_id: str) -> int:
844
+ resumable_table = f"resumable_{resumable_id}"
845
+ with session_scope(self.engine) as session:
846
+ res = session.execute(
847
+ f'select sum(chunk_size) from "{resumable_table}"'
848
+ ).fetchone()[0]
849
+ return res
850
+
851
+ def _db_get_group(self, resumable_id: str) -> str:
852
+ with session_scope(self.engine) as session:
853
+ res = session.execute(
854
+ "select upload_group from resumable_uploads where id = :resumable_id",
855
+ {"resumable_id": resumable_id},
856
+ ).fetchone()[0]
857
+ return res
858
+
859
+ def _db_get_key(self, resumable_id: str) -> str:
860
+ with session_scope(self.engine) as session:
861
+ res = session.execute(
862
+ "select key from resumable_uploads where id = :resumable_id",
863
+ {"resumable_id": resumable_id},
864
+ ).fetchone()[0]
865
+ return res
866
+
867
+ def _db_upload_belongs_to_owner(self, resumable_id: str) -> bool:
868
+ with session_scope(self.engine) as session:
869
+ res = session.execute(
870
+ "select count(1) from resumable_uploads where id = :resumable_id",
871
+ {"resumable_id": resumable_id},
872
+ ).fetchone()[0]
873
+ return True if res > 0 else False
874
+
875
    def _db_get_all_resumable_ids_for_owner(self, key: str = None) -> list:
        """
        Return the ids of all registered resumables, optionally filtered
        by *key*; rows look like [(id,), (id,)].

        Best-effort: any db error (e.g. the accounting table does not
        exist yet for a fresh owner) yields an empty list rather than
        propagating.
        """
        try:
            params = {}
            if key:
                query = "select id from resumable_uploads where key = :key"
                params["key"] = key
            else:
                query = "select id from resumable_uploads"
            with session_scope(self.engine) as session:
                res = session.execute(query, params).fetchall()
        except Exception:
            return []
        return res  # [(id,), (id,)]
888
+
889
+ def _db_remove_completed_for_owner(self, resumable_id: str) -> bool:
890
+ resumable_table = f"resumable_{resumable_id}"
891
+ with session_scope(self.engine) as session:
892
+ session.execute(
893
+ "delete from resumable_uploads where id = :resumable_id",
894
+ {"resumable_id": resumable_id},
895
+ )
896
+ session.execute(f'drop table "{resumable_table}"')
897
+ return True