databricks-sdk 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of databricks-sdk might be problematic.

Files changed (63)
  1. databricks/sdk/__init__.py +123 -115
  2. databricks/sdk/_base_client.py +112 -88
  3. databricks/sdk/_property.py +12 -7
  4. databricks/sdk/_widgets/__init__.py +13 -2
  5. databricks/sdk/_widgets/default_widgets_utils.py +21 -15
  6. databricks/sdk/_widgets/ipywidgets_utils.py +47 -24
  7. databricks/sdk/azure.py +8 -6
  8. databricks/sdk/casing.py +5 -5
  9. databricks/sdk/config.py +152 -99
  10. databricks/sdk/core.py +57 -47
  11. databricks/sdk/credentials_provider.py +360 -210
  12. databricks/sdk/data_plane.py +86 -3
  13. databricks/sdk/dbutils.py +123 -87
  14. databricks/sdk/environments.py +52 -35
  15. databricks/sdk/errors/base.py +61 -35
  16. databricks/sdk/errors/customizer.py +3 -3
  17. databricks/sdk/errors/deserializer.py +38 -25
  18. databricks/sdk/errors/details.py +417 -0
  19. databricks/sdk/errors/mapper.py +1 -1
  20. databricks/sdk/errors/overrides.py +27 -24
  21. databricks/sdk/errors/parser.py +26 -14
  22. databricks/sdk/errors/platform.py +10 -10
  23. databricks/sdk/errors/private_link.py +24 -24
  24. databricks/sdk/logger/round_trip_logger.py +28 -20
  25. databricks/sdk/mixins/compute.py +90 -60
  26. databricks/sdk/mixins/files.py +815 -145
  27. databricks/sdk/mixins/jobs.py +201 -20
  28. databricks/sdk/mixins/open_ai_client.py +26 -20
  29. databricks/sdk/mixins/workspace.py +45 -34
  30. databricks/sdk/oauth.py +372 -196
  31. databricks/sdk/retries.py +14 -12
  32. databricks/sdk/runtime/__init__.py +34 -17
  33. databricks/sdk/runtime/dbutils_stub.py +52 -39
  34. databricks/sdk/service/_internal.py +12 -7
  35. databricks/sdk/service/apps.py +618 -418
  36. databricks/sdk/service/billing.py +827 -604
  37. databricks/sdk/service/catalog.py +6552 -4474
  38. databricks/sdk/service/cleanrooms.py +550 -388
  39. databricks/sdk/service/compute.py +5241 -3531
  40. databricks/sdk/service/dashboards.py +1313 -923
  41. databricks/sdk/service/files.py +442 -309
  42. databricks/sdk/service/iam.py +2115 -1483
  43. databricks/sdk/service/jobs.py +4151 -2588
  44. databricks/sdk/service/marketplace.py +2210 -1517
  45. databricks/sdk/service/ml.py +3364 -2255
  46. databricks/sdk/service/oauth2.py +922 -584
  47. databricks/sdk/service/pipelines.py +1865 -1203
  48. databricks/sdk/service/provisioning.py +1435 -1029
  49. databricks/sdk/service/serving.py +2040 -1278
  50. databricks/sdk/service/settings.py +2846 -1929
  51. databricks/sdk/service/sharing.py +2201 -877
  52. databricks/sdk/service/sql.py +4650 -3103
  53. databricks/sdk/service/vectorsearch.py +816 -550
  54. databricks/sdk/service/workspace.py +1330 -906
  55. databricks/sdk/useragent.py +36 -22
  56. databricks/sdk/version.py +1 -1
  57. {databricks_sdk-0.44.0.dist-info → databricks_sdk-0.45.0.dist-info}/METADATA +31 -31
  58. databricks_sdk-0.45.0.dist-info/RECORD +70 -0
  59. {databricks_sdk-0.44.0.dist-info → databricks_sdk-0.45.0.dist-info}/WHEEL +1 -1
  60. databricks_sdk-0.44.0.dist-info/RECORD +0 -69
  61. {databricks_sdk-0.44.0.dist-info → databricks_sdk-0.45.0.dist-info}/LICENSE +0 -0
  62. {databricks_sdk-0.44.0.dist-info → databricks_sdk-0.45.0.dist-info}/NOTICE +0 -0
  63. {databricks_sdk-0.44.0.dist-info → databricks_sdk-0.45.0.dist-info}/top_level.txt +0 -0
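The largest change in this release is databricks/sdk/mixins/files.py (diff below), which adds chunked multipart uploads via presigned URLs (AWS/Azure) and resumable uploads (GCP) to FilesExt.upload, and renames the internal download helper to _open_download_stream. A minimal usage sketch of the public surface, assuming a workspace client configured through the usual environment variables; the local file and Volume paths are placeholders, not values from this diff:

    from databricks.sdk import WorkspaceClient

    w = WorkspaceClient()  # resolves host and credentials from the environment

    # Streams below the configured multipart threshold still use the one-shot upload;
    # larger streams are uploaded chunk by chunk and aborted on failure.
    with open("/tmp/archive.bin", "rb") as f:  # placeholder local path
        w.files.upload("/Volumes/main/default/vol/archive.bin", f, overwrite=True)

    # Downloads return a streaming response that can recover from dropped
    # connections by reopening the stream at the current offset.
    resp = w.files.download("/Volumes/main/default/vol/archive.bin")
    data = resp.contents.read()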
@@ -1,26 +1,36 @@
  from __future__ import annotations

  import base64
+ import datetime
  import logging
  import os
  import pathlib
  import platform
+ import re
  import shutil
  import sys
+ import xml.etree.ElementTree as ET
  from abc import ABC, abstractmethod
  from collections import deque
  from collections.abc import Iterator
+ from datetime import timedelta
  from io import BytesIO
  from types import TracebackType
- from typing import (TYPE_CHECKING, AnyStr, BinaryIO, Generator, Iterable,
-                     Optional, Type, Union)
+ from typing import (TYPE_CHECKING, AnyStr, BinaryIO, Callable, Generator,
+                     Iterable, Optional, Type, Union)
  from urllib import parse

+ import requests
+ import requests.adapters
  from requests import RequestException

- from .._base_client import _RawResponse, _StreamingResponse
+ from .._base_client import _BaseClient, _RawResponse, _StreamingResponse
  from .._property import _cached_property
- from ..errors import NotFound
+ from ..config import Config
+ from ..errors import AlreadyExists, NotFound
+ from ..errors.customizer import _RetryAfterCustomizer
+ from ..errors.mapper import _error_mapper
+ from ..retries import retried
  from ..service import files
  from ..service._internal import _escape_multi_segment_path_parameter
  from ..service.files import DownloadResponse
@@ -39,19 +49,25 @@ class _DbfsIO(BinaryIO):
      _offset = 0
      _closed = False

-     def __init__(self,
-                  api: files.DbfsAPI,
-                  path: str,
-                  *,
-                  read: bool = False,
-                  write: bool = False,
-                  overwrite: bool = False):
+     def __init__(
+         self,
+         api: files.DbfsAPI,
+         path: str,
+         *,
+         read: bool = False,
+         write: bool = False,
+         overwrite: bool = False,
+     ):
          self._api = api
          self._path = path
-         if write and read: raise IOError(f'can open either for reading or writing')
-         if read: self._status = api.get_status(path)
-         elif write: self._created = api.create(path, overwrite=overwrite)
-         else: raise IOError(f'need to open either for reading or writing')
+         if write and read:
+             raise IOError(f"can open either for reading or writing")
+         if read:
+             self._status = api.get_status(path)
+         elif write:
+             self._created = api.create(path, overwrite=overwrite)
+         else:
+             raise IOError(f"need to open either for reading or writing")

      def __enter__(self) -> Self:
          return self
@@ -69,54 +85,59 @@ class _DbfsIO(BinaryIO):
          return self._created is not None

      def write(self, buffer: bytes) -> int:
-         """ Write bytes to file.
+         """Write bytes to file.

          :return: Return the number of bytes written.
          """
          if not self.writable():
-             raise IOError('file not open for writing')
+             raise IOError("file not open for writing")
          if type(buffer) is not bytes:
              # Python doesn't strictly enforce types. Even if they're specified.
-             raise TypeError(f'a bytes-like object is required, not {type(buffer)}')
+             raise TypeError(f"a bytes-like object is required, not {type(buffer)}")
          total = 0
          while total < len(buffer):
              chunk = buffer[total:]
              if len(chunk) > self.MAX_CHUNK_SIZE:
-                 chunk = chunk[:self.MAX_CHUNK_SIZE]
+                 chunk = chunk[: self.MAX_CHUNK_SIZE]
              encoded = base64.b64encode(chunk).decode()
              self._api.add_block(self._created.handle, encoded)
              total += len(chunk)
          return total

      def close(self) -> None:
-         """ Disable all I/O operations. """
-         if self.writable(): self._api.close(self._created.handle)
+         """Disable all I/O operations."""
+         if self.writable():
+             self._api.close(self._created.handle)
          self._closed = True

      @property
      def closed(self) -> bool:
          return self._closed

-     def __exit__(self, __t: Type[BaseException] | None, __value: BaseException | None,
-                  __traceback: TracebackType | None):
+     def __exit__(
+         self,
+         __t: Type[BaseException] | None,
+         __value: BaseException | None,
+         __traceback: TracebackType | None,
+     ):
          self.close()

      def readable(self) -> bool:
          return self._status is not None

      def read(self, size: int = ...) -> bytes:
-         """ Read at most size bytes, returned as a bytes object.
+         """Read at most size bytes, returned as a bytes object.

          :param size: If the size argument is negative, read until EOF is reached.
              Return an empty bytes object at EOF.
          :return: bytes
          """
          if not self.readable():
-             raise IOError('file not open for reading')
+             raise IOError("file not open for reading")

          # call __iter__() and read until EOF is reached
          if size is ... or size < 0:
-             buffer = b''
+             buffer = b""
              for chunk in self:
                  buffer += chunk
              return buffer
@@ -128,7 +149,7 @@ class _DbfsIO(BinaryIO):
          if response.bytes_read == 0:
              # as per Python interface convention, return an empty bytes object at EOF,
              # and not the EOFError as in other SDKs
-             return b''
+             return b""

          raw = base64.b64decode(response.data)
          self._offset += response.bytes_read
@@ -178,7 +199,15 @@ class _DbfsIO(BinaryIO):

  class _VolumesIO(BinaryIO):

-     def __init__(self, api: files.FilesAPI, path: str, *, read: bool, write: bool, overwrite: bool):
+     def __init__(
+         self,
+         api: files.FilesAPI,
+         path: str,
+         *,
+         read: bool,
+         write: bool,
+         overwrite: bool,
+     ):
          self._buffer = []
          self._api = api
          self._path = path
@@ -198,8 +227,12 @@ class _VolumesIO(BinaryIO):
          if self._closed:
              return
          if self._write:
-             to_write = b''.join(self._buffer)
-             self._api.upload(self._path, contents=BytesIO(to_write), overwrite=self._overwrite)
+             to_write = b"".join(self._buffer)
+             self._api.upload(
+                 self._path,
+                 contents=BytesIO(to_write),
+                 overwrite=self._overwrite,
+             )
          elif self._read:
              self._read_handle.close()
          self._closed = True
@@ -215,7 +248,7 @@ class _VolumesIO(BinaryIO):

      def __check_closed(self):
          if self._closed:
-             raise ValueError('I/O operation on closed file')
+             raise ValueError("I/O operation on closed file")

      def __open_read(self):
          if self._read_handle is None:
@@ -277,55 +310,45 @@ class _VolumesIO(BinaryIO):
  class _Path(ABC):

      @abstractmethod
-     def __init__(self):
-         ...
+     def __init__(self): ...

      @property
      def is_local(self) -> bool:
          return self._is_local()

      @abstractmethod
-     def _is_local(self) -> bool:
-         ...
+     def _is_local(self) -> bool: ...

      @property
      def is_dbfs(self) -> bool:
          return self._is_dbfs()

      @abstractmethod
-     def _is_dbfs(self) -> bool:
-         ...
+     def _is_dbfs(self) -> bool: ...

      @abstractmethod
-     def child(self, path: str) -> str:
-         ...
+     def child(self, path: str) -> str: ...

      @_cached_property
      def is_dir(self) -> bool:
          return self._is_dir()

      @abstractmethod
-     def _is_dir(self) -> bool:
-         ...
+     def _is_dir(self) -> bool: ...

      @abstractmethod
-     def exists(self) -> bool:
-         ...
+     def exists(self) -> bool: ...

      @abstractmethod
-     def open(self, *, read=False, write=False, overwrite=False):
-         ...
+     def open(self, *, read=False, write=False, overwrite=False): ...

-     def list(self, *, recursive=False) -> Generator[files.FileInfo, None, None]:
-         ...
+     def list(self, *, recursive=False) -> Generator[files.FileInfo, None, None]: ...

      @abstractmethod
-     def mkdir(self):
-         ...
+     def mkdir(self): ...

      @abstractmethod
-     def delete(self, *, recursive=False):
-         ...
+     def delete(self, *, recursive=False): ...

      @property
      def name(self) -> str:
@@ -340,9 +363,9 @@ class _LocalPath(_Path):

      def __init__(self, path: str):
          if platform.system() == "Windows":
-             self._path = pathlib.Path(str(path).replace('file:///', '').replace('file:', ''))
+             self._path = pathlib.Path(str(path).replace("file:///", "").replace("file:", ""))
          else:
-             self._path = pathlib.Path(str(path).replace('file:', ''))
+             self._path = pathlib.Path(str(path).replace("file:", ""))

      def _is_local(self) -> bool:
          return True
@@ -365,16 +388,17 @@ class _LocalPath(_Path):
      def open(self, *, read=False, write=False, overwrite=False):
          # make local fs follow the similar semantics as DBFS
          self._path.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
-         return self._path.open(mode='wb' if overwrite else 'rb' if read else 'xb')
+         return self._path.open(mode="wb" if overwrite else "rb" if read else "xb")

      def list(self, recursive=False) -> Generator[files.FileInfo, None, None]:
          if not self.is_dir:
              st = self._path.stat()
-             yield files.FileInfo(path='file:' + str(self._path.absolute()),
-                                  is_dir=False,
-                                  file_size=st.st_size,
-                                  modification_time=int(st.st_mtime_ns / 1e6),
-                                  )
+             yield files.FileInfo(
+                 path="file:" + str(self._path.absolute()),
+                 is_dir=False,
+                 file_size=st.st_size,
+                 modification_time=int(st.st_mtime_ns / 1e6),
+             )
              return
          queue = deque([self._path])
          while queue:
@@ -385,11 +409,12 @@ class _LocalPath(_Path):
                      queue.append(leaf)
                      continue
                  info = leaf.stat()
-                 yield files.FileInfo(path='file:' + str(leaf.absolute()),
-                                      is_dir=False,
-                                      file_size=info.st_size,
-                                      modification_time=int(info.st_mtime_ns / 1e6),
-                                      )
+                 yield files.FileInfo(
+                     path="file:" + str(leaf.absolute()),
+                     is_dir=False,
+                     file_size=info.st_size,
+                     modification_time=int(info.st_mtime_ns / 1e6),
+                 )

      def delete(self, *, recursive=False):
          if self.is_dir:
@@ -400,17 +425,17 @@ class _LocalPath(_Path):
          else:
              kw = {}
              if sys.version_info[:2] > (3, 7):
-                 kw['missing_ok'] = True
+                 kw["missing_ok"] = True
              self._path.unlink(**kw)

      def __repr__(self) -> str:
-         return f'<_LocalPath {self._path}>'
+         return f"<_LocalPath {self._path}>"


  class _VolumesPath(_Path):

      def __init__(self, api: files.FilesAPI, src: Union[str, pathlib.Path]):
-         self._path = pathlib.PurePosixPath(str(src).replace('dbfs:', '').replace('file:', ''))
+         self._path = pathlib.PurePosixPath(str(src).replace("dbfs:", "").replace("file:", ""))
          self._api = api

      def _is_local(self) -> bool:
@@ -440,16 +465,23 @@ class _VolumesPath(_Path):
              return self.is_dir

      def open(self, *, read=False, write=False, overwrite=False) -> BinaryIO:
-         return _VolumesIO(self._api, self.as_string, read=read, write=write, overwrite=overwrite)
+         return _VolumesIO(
+             self._api,
+             self.as_string,
+             read=read,
+             write=write,
+             overwrite=overwrite,
+         )

      def list(self, *, recursive=False) -> Generator[files.FileInfo, None, None]:
          if not self.is_dir:
              meta = self._api.get_metadata(self.as_string)
-             yield files.FileInfo(path=self.as_string,
-                                  is_dir=False,
-                                  file_size=meta.content_length,
-                                  modification_time=meta.last_modified,
-                                  )
+             yield files.FileInfo(
+                 path=self.as_string,
+                 is_dir=False,
+                 file_size=meta.content_length,
+                 modification_time=meta.last_modified,
+             )
              return
          queue = deque([self])
          while queue:
@@ -458,11 +490,12 @@ class _VolumesPath(_Path):
                  if recursive and file.is_directory:
                      queue.append(self.child(file.name))
                  if not recursive or not file.is_directory:
-                     yield files.FileInfo(path=file.path,
-                                          is_dir=file.is_directory,
-                                          file_size=file.file_size,
-                                          modification_time=file.last_modified,
-                                          )
+                     yield files.FileInfo(
+                         path=file.path,
+                         is_dir=file.is_directory,
+                         file_size=file.file_size,
+                         modification_time=file.last_modified,
+                     )

      def delete(self, *, recursive=False):
          if self.is_dir:
@@ -473,13 +506,13 @@ class _VolumesPath(_Path):
              self._api.delete(self.as_string)

      def __repr__(self) -> str:
-         return f'<_VolumesPath {self._path}>'
+         return f"<_VolumesPath {self._path}>"


  class _DbfsPath(_Path):

      def __init__(self, api: files.DbfsAPI, src: str):
-         self._path = pathlib.PurePosixPath(str(src).replace('dbfs:', '').replace('file:', ''))
+         self._path = pathlib.PurePosixPath(str(src).replace("dbfs:", "").replace("file:", ""))
          self._api = api

      def _is_local(self) -> bool:
@@ -510,16 +543,23 @@ class _DbfsPath(_Path):
              return False

      def open(self, *, read=False, write=False, overwrite=False) -> BinaryIO:
-         return _DbfsIO(self._api, self.as_string, read=read, write=write, overwrite=overwrite)
+         return _DbfsIO(
+             self._api,
+             self.as_string,
+             read=read,
+             write=write,
+             overwrite=overwrite,
+         )

      def list(self, *, recursive=False) -> Generator[files.FileInfo, None, None]:
          if not self.is_dir:
              meta = self._api.get_status(self.as_string)
-             yield files.FileInfo(path=self.as_string,
-                                  is_dir=False,
-                                  file_size=meta.file_size,
-                                  modification_time=meta.modification_time,
-                                  )
+             yield files.FileInfo(
+                 path=self.as_string,
+                 is_dir=False,
+                 file_size=meta.file_size,
+                 modification_time=meta.modification_time,
+             )
              return
          queue = deque([self])
          while queue:
@@ -534,7 +574,7 @@ class _DbfsPath(_Path):
          self._api.delete(self.as_string, recursive=recursive)

      def __repr__(self) -> str:
-         return f'<_DbfsPath {self._path}>'
+         return f"<_DbfsPath {self._path}>"


  class DbfsExt(files.DbfsAPI):
@@ -545,12 +585,14 @@ class DbfsExt(files.DbfsAPI):
          self._files_api = files.FilesAPI(api_client)
          self._dbfs_api = files.DbfsAPI(api_client)

-     def open(self,
-              path: str,
-              *,
-              read: bool = False,
-              write: bool = False,
-              overwrite: bool = False) -> BinaryIO:
+     def open(
+         self,
+         path: str,
+         *,
+         read: bool = False,
+         write: bool = False,
+         overwrite: bool = False,
+     ) -> BinaryIO:
          return self._path(path).open(read=read, write=write, overwrite=overwrite)

      def upload(self, path: str, src: BinaryIO, *, overwrite: bool = False):
@@ -588,17 +630,18 @@ class DbfsExt(files.DbfsAPI):
          p = self._path(path)
          return p.exists()

-     __ALLOWED_SCHEMES = [None, 'file', 'dbfs']
+     __ALLOWED_SCHEMES = [None, "file", "dbfs"]

      def _path(self, src):
          src = parse.urlparse(str(src))
          if src.scheme and src.scheme not in self.__ALLOWED_SCHEMES:
              raise ValueError(
                  f'unsupported scheme "{src.scheme}". DBUtils in the SDK only supports local, root DBFS, and '
-                 'UC Volumes paths, not external locations or DBFS mount points.')
-         if src.scheme == 'file':
+                 "UC Volumes paths, not external locations or DBFS mount points."
+             )
+         if src.scheme == "file":
              return _LocalPath(src.geturl())
-         if src.path.startswith('/Volumes'):
+         if src.path.startswith("/Volumes"):
              return _VolumesPath(self._files_api, src.geturl())
          return _DbfsPath(self._dbfs_api, src.geturl())

@@ -607,7 +650,7 @@ class DbfsExt(files.DbfsAPI):
          src = self._path(src)
          dst = self._path(dst)
          if src.is_local and dst.is_local:
-             raise IOError('both destinations are on local FS')
+             raise IOError("both destinations are on local FS")
          if dst.exists() and dst.is_dir:
              # if target is a folder, make file with the same name there
              dst = dst.child(src.name)
@@ -630,11 +673,11 @@ class DbfsExt(files.DbfsAPI):
              # this operation is recursive by default.
              return self.move(source.as_string, target.as_string)
          if source.is_local and target.is_local:
-             raise IOError('both destinations are on local FS')
+             raise IOError("both destinations are on local FS")
          if source.is_dir and not recursive:
-             src_type = 'local' if source.is_local else 'DBFS' if source.is_dbfs else 'UC Volume'
-             dst_type = 'local' if target.is_local else 'DBFS' if target.is_dbfs else 'UC Volume'
-             raise IOError(f'moving a directory from {src_type} to {dst_type} requires recursive flag')
+             src_type = "local" if source.is_local else "DBFS" if source.is_dbfs else "UC Volume"
+             dst_type = "local" if target.is_local else "DBFS" if target.is_dbfs else "UC Volume"
+             raise IOError(f"moving a directory from {src_type} to {dst_type} requires recursive flag")
          # do cross-fs moving
          self.copy(src, dst, recursive=recursive, overwrite=overwrite)
          self.delete(src, recursive=recursive)
@@ -643,16 +686,20 @@ class DbfsExt(files.DbfsAPI):
          """Delete file or directory on DBFS"""
          p = self._path(path)
          if p.is_dir and not recursive:
-             raise IOError('deleting directories requires recursive flag')
+             raise IOError("deleting directories requires recursive flag")
          p.delete(recursive=recursive)


  class FilesExt(files.FilesAPI):
      __doc__ = files.FilesAPI.__doc__

+     # note that these error codes are retryable only for idempotent operations
+     _RETRYABLE_STATUS_CODES = [408, 429, 500, 502, 503, 504]
+
      def __init__(self, api_client, config: Config):
          super().__init__(api_client)
          self._config = config.copy()
+         self._multipart_upload_read_ahead_bytes = 1

      def download(self, file_path: str) -> DownloadResponse:
          """Download a file.
@@ -670,56 +717,660 @@ class FilesExt(files.FilesAPI):
          :returns: :class:`DownloadResponse`
          """

-         initial_response: DownloadResponse = self._download_raw_stream(file_path=file_path,
-                                                                        start_byte_offset=0,
-                                                                        if_unmodified_since_timestamp=None)
+         initial_response: DownloadResponse = self._open_download_stream(
+             file_path=file_path,
+             start_byte_offset=0,
+             if_unmodified_since_timestamp=None,
+         )

          wrapped_response = self._wrap_stream(file_path, initial_response)
          initial_response.contents._response = wrapped_response
          return initial_response

-     def _download_raw_stream(self,
-                              file_path: str,
-                              start_byte_offset: int,
-                              if_unmodified_since_timestamp: Optional[str] = None) -> DownloadResponse:
-         headers = {'Accept': 'application/octet-stream', }
+     def upload(self, file_path: str, contents: BinaryIO, *, overwrite: Optional[bool] = None):
+         """Upload a file.
+
+         Uploads a file. The file contents should be sent as the request body as raw bytes (an
+         octet stream); do not encode or otherwise modify the bytes before sending. The contents of the
+         resulting file will be exactly the bytes sent in the request body. If the request is successful, there
+         is no response body.
+
+         :param file_path: str
+             The absolute remote path of the target file.
+         :param contents: BinaryIO
+         :param overwrite: bool (optional)
+             If true, an existing file will be overwritten. When not specified, assumed True.
+         """
+
+         # Upload empty and small files with one-shot upload.
+         pre_read_buffer = contents.read(self._config.multipart_upload_min_stream_size)
+         if len(pre_read_buffer) < self._config.multipart_upload_min_stream_size:
+             _LOG.debug(
+                 f"Using one-shot upload for input stream of size {len(pre_read_buffer)} below {self._config.multipart_upload_min_stream_size} bytes"
+             )
+             return super().upload(file_path=file_path, contents=BytesIO(pre_read_buffer), overwrite=overwrite)
+
+         query = {"action": "initiate-upload"}
+         if overwrite is not None:
+             query["overwrite"] = overwrite
+
+         # Method _api.do() takes care of retrying and will raise an exception in case of failure.
+         initiate_upload_response = self._api.do(
+             "POST", f"/api/2.0/fs/files{_escape_multi_segment_path_parameter(file_path)}", query=query
+         )
+
+         if initiate_upload_response.get("multipart_upload"):
+             cloud_provider_session = self._create_cloud_provider_session()
+             session_token = initiate_upload_response["multipart_upload"].get("session_token")
+             if not session_token:
+                 raise ValueError(f"Unexpected server response: {initiate_upload_response}")
+
+             try:
+                 self._perform_multipart_upload(
+                     file_path, contents, session_token, pre_read_buffer, cloud_provider_session
+                 )
+             except Exception as e:
+                 _LOG.info(f"Aborting multipart upload on error: {e}")
+                 try:
+                     self._abort_multipart_upload(file_path, session_token, cloud_provider_session)
+                 except BaseException as ex:
+                     _LOG.warning(f"Failed to abort upload: {ex}")
+                     # ignore, abort is a best-effort
+                 finally:
+                     # rethrow original exception
+                     raise e from None
+
+         elif initiate_upload_response.get("resumable_upload"):
+             cloud_provider_session = self._create_cloud_provider_session()
+             session_token = initiate_upload_response["resumable_upload"]["session_token"]
+             self._perform_resumable_upload(
+                 file_path, contents, session_token, overwrite, pre_read_buffer, cloud_provider_session
+             )
+         else:
+             raise ValueError(f"Unexpected server response: {initiate_upload_response}")
+
+     def _perform_multipart_upload(
+         self,
+         target_path: str,
+         input_stream: BinaryIO,
+         session_token: str,
+         pre_read_buffer: bytes,
+         cloud_provider_session: requests.Session,
+     ):
+         """
+         Performs multipart upload using presigned URLs on AWS and Azure:
+         https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html
+         """
+         current_part_number = 1
+         etags: dict = {}
+
+         # Why are we buffering the current chunk?
+         # AWS and Azure don't support traditional "Transfer-encoding: chunked", so we must
+         # provide each chunk size up front. In case of a non-seekable input stream we need
+         # to buffer a chunk before uploading to know its size. This also allows us to rewind
+         # the stream before retrying on request failure.
+         # AWS signed chunked upload: https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-streaming.html
+         # https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-tune-upload-download-python#buffering-during-uploads
+
+         chunk_offset = 0  # used only for logging
+
+         # This buffer is expected to contain at least multipart_upload_chunk_size bytes.
+         # Note that initially buffer can be bigger (from pre_read_buffer).
+         buffer = pre_read_buffer
+
+         retry_count = 0
+         eof = False
+         while not eof:
+             # If needed, buffer the next chunk.
+             buffer = FilesExt._fill_buffer(buffer, self._config.multipart_upload_chunk_size, input_stream)
+             if len(buffer) == 0:
+                 # End of stream, no need to request the next block of upload URLs.
+                 break
+
+             _LOG.debug(
+                 f"Multipart upload: requesting next {self._config.multipart_upload_batch_url_count} upload URLs starting from part {current_part_number}"
+             )
+
+             body: dict = {
+                 "path": target_path,
+                 "session_token": session_token,
+                 "start_part_number": current_part_number,
+                 "count": self._config.multipart_upload_batch_url_count,
+                 "expire_time": self._get_url_expire_time(),
+             }
+
+             headers = {"Content-Type": "application/json"}
+
+             # Requesting URLs for the same set of parts is an idempotent operation, safe to retry.
+             # Method _api.do() takes care of retrying and will raise an exception in case of failure.
+             upload_part_urls_response = self._api.do(
+                 "POST", "/api/2.0/fs/create-upload-part-urls", headers=headers, body=body
+             )
+
+             upload_part_urls = upload_part_urls_response.get("upload_part_urls", [])
+             if len(upload_part_urls) == 0:
+                 raise ValueError(f"Unexpected server response: {upload_part_urls_response}")
+
+             for upload_part_url in upload_part_urls:
+                 buffer = FilesExt._fill_buffer(buffer, self._config.multipart_upload_chunk_size, input_stream)
+                 actual_buffer_length = len(buffer)
+                 if actual_buffer_length == 0:
+                     eof = True
+                     break
+
+                 url = upload_part_url["url"]
+                 required_headers = upload_part_url.get("headers", [])
+                 assert current_part_number == upload_part_url["part_number"]
+
+                 headers: dict = {"Content-Type": "application/octet-stream"}
+                 for h in required_headers:
+                     headers[h["name"]] = h["value"]
+
+                 actual_chunk_length = min(actual_buffer_length, self._config.multipart_upload_chunk_size)
+                 _LOG.debug(
+                     f"Uploading part {current_part_number}: [{chunk_offset}, {chunk_offset + actual_chunk_length - 1}]"
+                 )
+
+                 chunk = BytesIO(buffer[:actual_chunk_length])
+
+                 def rewind():
+                     chunk.seek(0, os.SEEK_SET)
+
+                 def perform():
+                     return cloud_provider_session.request(
+                         "PUT",
+                         url,
+                         headers=headers,
+                         data=chunk,
+                         timeout=self._config.multipart_upload_single_chunk_upload_timeout_seconds,
+                     )
+
+                 upload_response = self._retry_idempotent_operation(perform, rewind)
+
+                 if upload_response.status_code in (200, 201):
+                     # Chunk upload successful
+
+                     chunk_offset += actual_chunk_length
+
+                     etag = upload_response.headers.get("ETag", "")
+                     etags[current_part_number] = etag
+
+                     # Discard uploaded bytes
+                     buffer = buffer[actual_chunk_length:]
+
+                     # Reset retry count when progressing along the stream
+                     retry_count = 0
+
+                 elif FilesExt._is_url_expired_response(upload_response):
+                     if retry_count < self._config.multipart_upload_max_retries:
+                         retry_count += 1
+                         _LOG.debug("Upload URL expired")
+                         # Preserve the buffer so we'll upload the current part again using next upload URL
+                     else:
+                         # don't confuse user with unrelated "Permission denied" error.
+                         raise ValueError(f"Unsuccessful chunk upload: upload URL expired")
+
+                 else:
+                     message = f"Unsuccessful chunk upload. Response status: {upload_response.status_code}, body: {upload_response.content}"
+                     _LOG.warning(message)
+                     mapped_error = _error_mapper(upload_response, {})
+                     raise mapped_error or ValueError(message)
+
+                 current_part_number += 1
+
+         _LOG.debug(
+             f"Completing multipart upload after uploading {len(etags)} parts of up to {self._config.multipart_upload_chunk_size} bytes"
+         )
+
+         query = {"action": "complete-upload", "upload_type": "multipart", "session_token": session_token}
+         headers = {"Content-Type": "application/json"}
+         body: dict = {}
+
+         parts = []
+         for etag in sorted(etags.items()):
+             part = {"part_number": etag[0], "etag": etag[1]}
+             parts.append(part)
+
+         body["parts"] = parts
+
+         # Completing upload is an idempotent operation, safe to retry.
+         # Method _api.do() takes care of retrying and will raise an exception in case of failure.
+         self._api.do(
+             "POST",
+             f"/api/2.0/fs/files{_escape_multi_segment_path_parameter(target_path)}",
+             query=query,
+             headers=headers,
+             body=body,
+         )
+
+     @staticmethod
+     def _fill_buffer(buffer: bytes, desired_min_size: int, input_stream: BinaryIO):
+         """
+         Tries to fill given buffer to contain at least `desired_min_size` bytes by reading from input stream.
+         """
+         bytes_to_read = max(0, desired_min_size - len(buffer))
+         if bytes_to_read > 0:
+             next_buf = input_stream.read(bytes_to_read)
+             new_buffer = buffer + next_buf
+             return new_buffer
+         else:
+             # we have already buffered enough data
+             return buffer
+
+     @staticmethod
+     def _is_url_expired_response(response: requests.Response):
+         """
+         Checks if response matches one of the known "URL expired" responses from the cloud storage providers.
+         """
+         if response.status_code != 403:
+             return False
+
+         try:
+             xml_root = ET.fromstring(response.content)
+             if xml_root.tag != "Error":
+                 return False
+
+             code = xml_root.find("Code")
+             if code is None:
+                 return False
+
+             if code.text == "AuthenticationFailed":
+                 # Azure
+                 details = xml_root.find("AuthenticationErrorDetail")
+                 if details is not None and "Signature not valid in the specified time frame" in details.text:
+                     return True
+
+             if code.text == "AccessDenied":
+                 # AWS
+                 message = xml_root.find("Message")
+                 if message is not None and message.text == "Request has expired":
+                     return True
+
+         except ET.ParseError:
+             pass
+
+         return False
+
+     def _perform_resumable_upload(
+         self,
+         target_path: str,
+         input_stream: BinaryIO,
+         session_token: str,
+         overwrite: bool,
+         pre_read_buffer: bytes,
+         cloud_provider_session: requests.Session,
+     ):
+         """
+         Performs resumable upload on GCP: https://cloud.google.com/storage/docs/performing-resumable-uploads
+         """
+
+         # Session URI we're using expires after a week
+
+         # Why are we buffering the current chunk?
+         # When using resumable upload API we're uploading data in chunks. During chunk upload
+         # server responds with the "received offset" confirming how much data it stored so far,
+         # so we should continue uploading from that offset. (Note this is not a failure but an
+         # expected behaviour as per the docs.) But, input stream might be consumed beyond that
+         # offset, since server might have read more data than it confirmed received, or some data
+         # might have been pre-cached by e.g. OS or a proxy. So, to continue upload, we must rewind
+         # the input stream back to the byte next to "received offset". This is not possible
+         # for non-seekable input stream, so we must buffer the whole last chunk and seek inside
+         # the buffer. By always uploading from the buffer we fully support non-seekable streams.
+
+         # Why are we doing read-ahead?
+         # It's not possible to upload an empty chunk as "Content-Range" header format does not
+         # support this. So if current chunk happens to finish exactly at the end of the stream,
+         # we need to know that and mark the chunk as last (by passing real file size in the
+         # "Content-Range" header) when uploading it. To detect if we're at the end of the stream
+         # we're reading "ahead" an extra bytes but not uploading them immediately. If
+         # nothing has been read ahead, it means we're at the end of the stream.
+         # On the contrary, in multipart upload we can decide to complete upload *after*
+         # last chunk has been sent.
+
+         body: dict = {"path": target_path, "session_token": session_token}
+
+         headers = {"Content-Type": "application/json"}
+
+         # Method _api.do() takes care of retrying and will raise an exception in case of failure.
+         resumable_upload_url_response = self._api.do(
+             "POST", "/api/2.0/fs/create-resumable-upload-url", headers=headers, body=body
+         )
+
+         resumable_upload_url_node = resumable_upload_url_response.get("resumable_upload_url")
+         if not resumable_upload_url_node:
+             raise ValueError(f"Unexpected server response: {resumable_upload_url_response}")
+
+         resumable_upload_url = resumable_upload_url_node.get("url")
+         if not resumable_upload_url:
+             raise ValueError(f"Unexpected server response: {resumable_upload_url_response}")
+
+         required_headers = resumable_upload_url_node.get("headers", [])
+
+         try:
+             # We will buffer this many bytes: one chunk + read-ahead block.
+             # Note buffer may contain more data initially (from pre_read_buffer).
+             min_buffer_size = self._config.multipart_upload_chunk_size + self._multipart_upload_read_ahead_bytes
+
+             buffer = pre_read_buffer
+
+             # How many bytes in the buffer were confirmed to be received by the server.
+             # All the remaining bytes in the buffer must be uploaded.
+             uploaded_bytes_count = 0
+
+             chunk_offset = 0
+
+             retry_count = 0
+             while True:
+                 # If needed, fill the buffer to contain at least min_buffer_size bytes
+                 # (unless end of stream), discarding already uploaded bytes.
+                 bytes_to_read = max(0, min_buffer_size - (len(buffer) - uploaded_bytes_count))
+                 next_buf = input_stream.read(bytes_to_read)
+                 buffer = buffer[uploaded_bytes_count:] + next_buf
+
+                 if len(next_buf) < bytes_to_read:
+                     # This is the last chunk in the stream.
+                     # Let's upload all the remaining bytes in one go.
+                     actual_chunk_length = len(buffer)
+                     file_size = chunk_offset + actual_chunk_length
+                 else:
+                     # More chunks expected, let's upload current chunk (excluding read-ahead block).
+                     actual_chunk_length = self._config.multipart_upload_chunk_size
+                     file_size = "*"
+
+                 headers: dict = {"Content-Type": "application/octet-stream"}
+                 for h in required_headers:
+                     headers[h["name"]] = h["value"]
+
+                 chunk_last_byte_offset = chunk_offset + actual_chunk_length - 1
+                 content_range_header = f"bytes {chunk_offset}-{chunk_last_byte_offset}/{file_size}"
+                 _LOG.debug(f"Uploading chunk: {content_range_header}")
+                 headers["Content-Range"] = content_range_header
+
+                 def retrieve_upload_status() -> Optional[requests.Response]:
+                     def perform():
+                         return cloud_provider_session.request(
+                             "PUT",
+                             resumable_upload_url,
+                             headers={"Content-Range": "bytes */*"},
+                             data=b"",
+                             timeout=self._config.multipart_upload_single_chunk_upload_timeout_seconds,
+                         )
+
+                     try:
+                         return self._retry_idempotent_operation(perform)
+                     except RequestException:
+                         _LOG.warning("Failed to retrieve upload status")
+                         return None
+
+                 try:
+                     upload_response = cloud_provider_session.request(
+                         "PUT",
+                         resumable_upload_url,
+                         headers=headers,
+                         data=BytesIO(buffer[:actual_chunk_length]),
+                         timeout=self._config.multipart_upload_single_chunk_upload_timeout_seconds,
+                     )
+
+                     # https://cloud.google.com/storage/docs/performing-resumable-uploads#resume-upload
+                     # If an upload request is terminated before receiving a response, or if you receive
+                     # a 503 or 500 response, then you need to resume the interrupted upload from where it left off.
+
+                     # Let's follow that for all potentially retryable status codes.
+                     # Together with the catch block below we replicate the logic in _retry_idempotent_operation().
+                     if upload_response.status_code in self._RETRYABLE_STATUS_CODES:
+                         if retry_count < self._config.multipart_upload_max_retries:
+                             retry_count += 1
+                             # let original upload_response be handled as an error
+                             upload_response = retrieve_upload_status() or upload_response
+                     else:
+                         # we received non-retryable response, reset retry count
+                         retry_count = 0
+
+                 except RequestException as e:
+                     # Let's do the same for retryable network errors.
+                     if _BaseClient._is_retryable(e) and retry_count < self._config.multipart_upload_max_retries:
+                         retry_count += 1
+                         upload_response = retrieve_upload_status()
+                         if not upload_response:
+                             # rethrow original exception
+                             raise e from None
+                     else:
+                         # rethrow original exception
+                         raise e from None
+
+                 if upload_response.status_code in (200, 201):
+                     if file_size == "*":
+                         raise ValueError(
+                             f"Received unexpected status {upload_response.status_code} before reaching end of stream"
+                         )
+
+                     # upload complete
+                     break
+
+                 elif upload_response.status_code == 308:
+                     # chunk accepted (or check-status succeeded), let's determine received offset to resume from there
+                     range_string = upload_response.headers.get("Range")
+                     confirmed_offset = self._extract_range_offset(range_string)
+                     _LOG.debug(f"Received confirmed offset: {confirmed_offset}")
+
+                     if confirmed_offset:
+                         if confirmed_offset < chunk_offset - 1 or confirmed_offset > chunk_last_byte_offset:
+                             raise ValueError(
+                                 f"Unexpected received offset: {confirmed_offset} is outside of expected range, chunk offset: {chunk_offset}, chunk last byte offset: {chunk_last_byte_offset}"
+                             )
+                     else:
+                         if chunk_offset > 0:
+                             raise ValueError(
+                                 f"Unexpected received offset: {confirmed_offset} is outside of expected range, chunk offset: {chunk_offset}, chunk last byte offset: {chunk_last_byte_offset}"
+                             )
+
+                     # We have just uploaded a part of chunk starting from offset "chunk_offset" and ending
+                     # at offset "confirmed_offset" (inclusive), so the next chunk will start at
+                     # offset "confirmed_offset + 1"
+                     if confirmed_offset:
+                         next_chunk_offset = confirmed_offset + 1
+                     else:
+                         next_chunk_offset = chunk_offset
+                     uploaded_bytes_count = next_chunk_offset - chunk_offset
+                     chunk_offset = next_chunk_offset
+
+                 elif upload_response.status_code == 412 and not overwrite:
+                     # Assuming this is only possible reason
+                     # Full message in this case: "At least one of the pre-conditions you specified did not hold."
+                     raise AlreadyExists("The file being created already exists.")
+
+                 else:
+                     message = f"Unsuccessful chunk upload. Response status: {upload_response.status_code}, body: {upload_response.content}"
+                     _LOG.warning(message)
+                     mapped_error = _error_mapper(upload_response, {})
+                     raise mapped_error or ValueError(message)
+
+         except Exception as e:
+             _LOG.info(f"Aborting resumable upload on error: {e}")
+             try:
+                 self._abort_resumable_upload(resumable_upload_url, required_headers, cloud_provider_session)
+             except BaseException as ex:
+                 _LOG.warning(f"Failed to abort upload: {ex}")
+                 # ignore, abort is a best-effort
+             finally:
+                 # rethrow original exception
+                 raise e from None
+
+     @staticmethod
+     def _extract_range_offset(range_string: Optional[str]) -> Optional[int]:
+         """Parses the response range header to extract the last byte."""
+         if not range_string:
+             return None  # server did not yet confirm any bytes
+
+         if match := re.match("bytes=0-(\\d+)", range_string):
+             return int(match.group(1))
+         else:
+             raise ValueError(f"Cannot parse response header: Range: {range_string}")
+
+     def _get_url_expire_time(self):
+         """Generates expiration time and save it in the required format."""
+         current_time = datetime.datetime.now(datetime.timezone.utc)
+         expire_time = current_time + self._config.multipart_upload_url_expiration_duration
+         # From Google Protobuf doc:
+         # In JSON format, the Timestamp type is encoded as a string in the
+         # * [RFC 3339](https://www.ietf.org/rfc/rfc3339.txt) format. That is, the
+         # * format is "{year}-{month}-{day}T{hour}:{min}:{sec}[.{frac_sec}]Z"
+         return expire_time.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+     def _abort_multipart_upload(self, target_path: str, session_token: str, cloud_provider_session: requests.Session):
+         """Aborts ongoing multipart upload session to clean up incomplete file."""
+         body: dict = {"path": target_path, "session_token": session_token, "expire_time": self._get_url_expire_time()}
+
+         headers = {"Content-Type": "application/json"}
+
+         # Method _api.do() takes care of retrying and will raise an exception in case of failure.
+         abort_url_response = self._api.do("POST", "/api/2.0/fs/create-abort-upload-url", headers=headers, body=body)
+
+         abort_upload_url_node = abort_url_response["abort_upload_url"]
+         abort_url = abort_upload_url_node["url"]
+         required_headers = abort_upload_url_node.get("headers", [])
+
+         headers: dict = {"Content-Type": "application/octet-stream"}
+         for h in required_headers:
+             headers[h["name"]] = h["value"]
+
+         def perform():
+             return cloud_provider_session.request(
+                 "DELETE",
+                 abort_url,
+                 headers=headers,
+                 data=b"",
+                 timeout=self._config.multipart_upload_single_chunk_upload_timeout_seconds,
+             )
+
+         abort_response = self._retry_idempotent_operation(perform)
+
+         if abort_response.status_code not in (200, 201):
+             raise ValueError(abort_response)
+
+     def _abort_resumable_upload(
+         self, resumable_upload_url: str, required_headers: list, cloud_provider_session: requests.Session
+     ):
+         """Aborts ongoing resumable upload session to clean up incomplete file."""
+         headers: dict = {}
+         for h in required_headers:
+             headers[h["name"]] = h["value"]
+
+         def perform():
+             return cloud_provider_session.request(
+                 "DELETE",
+                 resumable_upload_url,
+                 headers=headers,
+                 data=b"",
+                 timeout=self._config.multipart_upload_single_chunk_upload_timeout_seconds,
+             )
+
+         abort_response = self._retry_idempotent_operation(perform)
+
+         if abort_response.status_code not in (200, 201):
+             raise ValueError(abort_response)
+
+     def _create_cloud_provider_session(self):
+         """Creates a separate session which does not inherit auth headers from BaseClient session."""
+         session = requests.Session()
+
+         # following session config in _BaseClient
+         http_adapter = requests.adapters.HTTPAdapter(
+             self._config.max_connection_pools or 20, self._config.max_connections_per_pool or 20, pool_block=True
+         )
+         session.mount("https://", http_adapter)
+         # presigned URL for storage proxy can use plain HTTP
+         session.mount("http://", http_adapter)
+         return session
+
+     def _retry_idempotent_operation(
+         self, operation: Callable[[], requests.Response], before_retry: Callable = None
+     ) -> requests.Response:
+         """Perform given idempotent operation with necessary retries. Since operation is idempotent it's
+         safe to retry it for response codes where server state might have changed.
+         """
+
+         def delegate():
+             response = operation()
+             if response.status_code in self._RETRYABLE_STATUS_CODES:
+                 attrs = {}
+                 # this will assign "retry_after_secs" to the attrs, essentially making exception look retryable
+                 _RetryAfterCustomizer().customize_error(response, attrs)
+                 raise _error_mapper(response, attrs)
+             else:
+                 return response
+
+         # following _BaseClient timeout
+         retry_timeout_seconds = self._config.retry_timeout_seconds or 300
+
+         return retried(
+             timeout=timedelta(seconds=retry_timeout_seconds),
+             # also retry on network errors (connection error, connection timeout)
+             # where we believe request didn't reach the server
+             is_retryable=_BaseClient._is_retryable,
+             before_retry=before_retry,
+         )(delegate)()
+
+     def _open_download_stream(
+         self, file_path: str, start_byte_offset: int, if_unmodified_since_timestamp: Optional[str] = None
+     ) -> DownloadResponse:
+         """Opens a download stream from given offset, performing necessary retries."""
+         headers = {
+             "Accept": "application/octet-stream",
+         }

          if start_byte_offset and not if_unmodified_since_timestamp:
              raise Exception("if_unmodified_since_timestamp is required if start_byte_offset is specified")

          if start_byte_offset:
-             headers['Range'] = f'bytes={start_byte_offset}-'
+             headers["Range"] = f"bytes={start_byte_offset}-"

          if if_unmodified_since_timestamp:
-             headers['If-Unmodified-Since'] = if_unmodified_since_timestamp
-
-         response_headers = ['content-length', 'content-type', 'last-modified', ]
-         res = self._api.do('GET',
-                            f'/api/2.0/fs/files{_escape_multi_segment_path_parameter(file_path)}',
-                            headers=headers,
-                            response_headers=response_headers,
-                            raw=True)
+             headers["If-Unmodified-Since"] = if_unmodified_since_timestamp
+
+         response_headers = [
+             "content-length",
+             "content-type",
+             "last-modified",
+         ]
+         # Method _api.do() takes care of retrying and will raise an exception in case of failure.
+         res = self._api.do(
+             "GET",
+             f"/api/2.0/fs/files{_escape_multi_segment_path_parameter(file_path)}",
+             headers=headers,
+             response_headers=response_headers,
+             raw=True,
+         )

          result = DownloadResponse.from_dict(res)
          if not isinstance(result.contents, _StreamingResponse):
-             raise Exception("Internal error: response contents is of unexpected type: " +
-                             type(result.contents).__name__)
+             raise Exception(
+                 "Internal error: response contents is of unexpected type: " + type(result.contents).__name__
+             )

          return result

-     def _wrap_stream(self, file_path: str, downloadResponse: DownloadResponse):
-         underlying_response = _ResilientIterator._extract_raw_response(downloadResponse)
-         return _ResilientResponse(self,
-                                   file_path,
-                                   downloadResponse.last_modified,
-                                   offset=0,
-                                   underlying_response=underlying_response)
+     def _wrap_stream(self, file_path: str, download_response: DownloadResponse):
+         underlying_response = _ResilientIterator._extract_raw_response(download_response)
+         return _ResilientResponse(
+             self,
+             file_path,
+             download_response.last_modified,
+             offset=0,
+             underlying_response=underlying_response,
+         )


  class _ResilientResponse(_RawResponse):

-     def __init__(self, api: FilesExt, file_path: str, file_last_modified: str, offset: int,
-                  underlying_response: _RawResponse):
+     def __init__(
+         self,
+         api: FilesExt,
+         file_path: str,
+         file_last_modified: str,
+         offset: int,
+         underlying_response: _RawResponse,
+     ):
          self.api = api
          self.file_path = file_path
          self.underlying_response = underlying_response
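The hunk above routes every chunk PUT through _retry_idempotent_operation, which wraps the request in the SDK's retried helper and rewinds the buffered chunk via before_retry before each attempt. A standalone sketch of that pattern; the presigned URL is a placeholder and the retry predicate is simplified, not the SDK's own:

    from datetime import timedelta
    from io import BytesIO

    import requests
    from requests import RequestException

    from databricks.sdk.retries import retried

    session = requests.Session()
    chunk = BytesIO(b"example chunk bytes")  # buffered so the same bytes can be re-sent
    presigned_url = "https://storage.example/part-1"  # placeholder presigned URL


    def _rewind():
        # called before every retry so the full chunk is uploaded again
        chunk.seek(0)


    def _put_chunk() -> requests.Response:
        return session.request("PUT", presigned_url, data=chunk, timeout=300)


    def _is_retryable(e: BaseException):
        # return a reason string to retry, or None to give up
        return "network error" if isinstance(e, RequestException) else None


    upload_with_retries = retried(
        timeout=timedelta(minutes=5),
        is_retryable=_is_retryable,
        before_retry=_rewind,
    )(_put_chunk)

Unlike the SDK helper, this sketch only retries raised network exceptions; the diff above additionally maps retryable HTTP status codes to exceptions inside delegate() so they flow through the same retry path.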
@@ -728,11 +1379,17 @@ class _ResilientResponse(_RawResponse):

      def iter_content(self, chunk_size=1, decode_unicode=False):
          if decode_unicode:
-             raise ValueError('Decode unicode is not supported')
+             raise ValueError("Decode unicode is not supported")

          iterator = self.underlying_response.iter_content(chunk_size=chunk_size, decode_unicode=False)
-         self.iterator = _ResilientIterator(iterator, self.file_path, self.file_last_modified, self.offset,
-                                            self.api, chunk_size)
+         self.iterator = _ResilientIterator(
+             iterator,
+             self.file_path,
+             self.file_last_modified,
+             self.offset,
+             self.api,
+             chunk_size,
+         )
          return self.iterator

      def close(self):
@@ -744,12 +1401,21 @@ class _ResilientIterator(Iterator):
      # and recovers from failures by requesting download from the current offset.

      @staticmethod
-     def _extract_raw_response(download_response: DownloadResponse) -> _RawResponse:
-         streaming_response: _StreamingResponse = download_response.contents  # this is an instance of _StreamingResponse
+     def _extract_raw_response(
+         download_response: DownloadResponse,
+     ) -> _RawResponse:
+         streaming_response: _StreamingResponse = download_response.contents  # this is an instance of _StreamingResponse
          return streaming_response._response

-     def __init__(self, underlying_iterator, file_path: str, file_last_modified: str, offset: int,
-                  api: FilesExt, chunk_size: int):
+     def __init__(
+         self,
+         underlying_iterator,
+         file_path: str,
+         file_last_modified: str,
+         offset: int,
+         api: FilesExt,
+         chunk_size: int,
+     ):
          self._underlying_iterator = underlying_iterator
          self._api = api
          self._file_path = file_path
@@ -768,14 +1434,18 @@ class _ResilientIterator(Iterator):
          if self._total_recovers_count == self._api._config.files_api_client_download_max_total_recovers:
              _LOG.debug("Total recovers limit exceeded")
              return False
-         if self._api._config.files_api_client_download_max_total_recovers_without_progressing is not None and self._recovers_without_progressing_count >= self._api._config.files_api_client_download_max_total_recovers_without_progressing:
+         if (
+             self._api._config.files_api_client_download_max_total_recovers_without_progressing is not None
+             and self._recovers_without_progressing_count
+             >= self._api._config.files_api_client_download_max_total_recovers_without_progressing
+         ):
              _LOG.debug("No progression recovers limit exceeded")
              return False
          return True

      def _recover(self) -> bool:
          if not self._should_recover():
-             return False # recover suppressed, rethrow original exception
+             return False  # recover suppressed, rethrow original exception

          self._total_recovers_count += 1
          self._recovers_without_progressing_count += 1
@@ -786,15 +1456,15 @@ class _ResilientIterator(Iterator):
              _LOG.debug("Trying to recover from offset " + str(self._offset))

              # following call includes all the required network retries
-             downloadResponse = self._api._download_raw_stream(self._file_path, self._offset,
-                                                               self._file_last_modified)
+             downloadResponse = self._api._open_download_stream(self._file_path, self._offset, self._file_last_modified)
              underlying_response = _ResilientIterator._extract_raw_response(downloadResponse)
-             self._underlying_iterator = underlying_response.iter_content(chunk_size=self._chunk_size,
-                                                                          decode_unicode=False)
+             self._underlying_iterator = underlying_response.iter_content(
+                 chunk_size=self._chunk_size, decode_unicode=False
+             )
              _LOG.debug("Recover succeeded")
              return True
          except:
-             return False # recover failed, rethrow original exception
+             return False  # recover failed, rethrow original exception

      def __next__(self):
          if self._closed: