azure-storage-blob 12.21.0b1__py3-none-any.whl → 12.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. azure/storage/blob/__init__.py +19 -18
  2. azure/storage/blob/_blob_client.py +470 -1555
  3. azure/storage/blob/_blob_client_helpers.py +1242 -0
  4. azure/storage/blob/_blob_service_client.py +93 -112
  5. azure/storage/blob/_blob_service_client_helpers.py +27 -0
  6. azure/storage/blob/_container_client.py +176 -377
  7. azure/storage/blob/_container_client_helpers.py +266 -0
  8. azure/storage/blob/_deserialize.py +68 -44
  9. azure/storage/blob/_download.py +375 -241
  10. azure/storage/blob/_encryption.py +14 -7
  11. azure/storage/blob/_generated/_azure_blob_storage.py +2 -1
  12. azure/storage/blob/_generated/_serialization.py +2 -0
  13. azure/storage/blob/_generated/aio/_azure_blob_storage.py +2 -1
  14. azure/storage/blob/_generated/aio/operations/_append_blob_operations.py +1 -7
  15. azure/storage/blob/_generated/aio/operations/_blob_operations.py +21 -47
  16. azure/storage/blob/_generated/aio/operations/_block_blob_operations.py +2 -10
  17. azure/storage/blob/_generated/aio/operations/_container_operations.py +13 -26
  18. azure/storage/blob/_generated/aio/operations/_page_blob_operations.py +3 -14
  19. azure/storage/blob/_generated/aio/operations/_service_operations.py +14 -17
  20. azure/storage/blob/_generated/operations/_append_blob_operations.py +1 -7
  21. azure/storage/blob/_generated/operations/_blob_operations.py +21 -47
  22. azure/storage/blob/_generated/operations/_block_blob_operations.py +2 -10
  23. azure/storage/blob/_generated/operations/_container_operations.py +13 -26
  24. azure/storage/blob/_generated/operations/_page_blob_operations.py +3 -14
  25. azure/storage/blob/_generated/operations/_service_operations.py +14 -17
  26. azure/storage/blob/_generated/py.typed +1 -0
  27. azure/storage/blob/_lease.py +52 -63
  28. azure/storage/blob/_list_blobs_helper.py +129 -135
  29. azure/storage/blob/_models.py +480 -277
  30. azure/storage/blob/_quick_query_helper.py +30 -31
  31. azure/storage/blob/_serialize.py +39 -56
  32. azure/storage/blob/_shared/avro/datafile.py +1 -1
  33. azure/storage/blob/_shared/avro/datafile_async.py +1 -1
  34. azure/storage/blob/_shared/base_client.py +3 -1
  35. azure/storage/blob/_shared/base_client_async.py +1 -1
  36. azure/storage/blob/_shared/policies.py +16 -15
  37. azure/storage/blob/_shared/policies_async.py +21 -6
  38. azure/storage/blob/_shared/response_handlers.py +6 -2
  39. azure/storage/blob/_shared/shared_access_signature.py +21 -3
  40. azure/storage/blob/_shared/uploads.py +1 -1
  41. azure/storage/blob/_shared/uploads_async.py +1 -1
  42. azure/storage/blob/_shared_access_signature.py +110 -52
  43. azure/storage/blob/_upload_helpers.py +75 -68
  44. azure/storage/blob/_version.py +1 -1
  45. azure/storage/blob/aio/__init__.py +19 -11
  46. azure/storage/blob/aio/_blob_client_async.py +554 -301
  47. azure/storage/blob/aio/_blob_service_client_async.py +148 -97
  48. azure/storage/blob/aio/_container_client_async.py +289 -140
  49. azure/storage/blob/aio/_download_async.py +485 -337
  50. azure/storage/blob/aio/_lease_async.py +61 -60
  51. azure/storage/blob/aio/_list_blobs_helper.py +94 -96
  52. azure/storage/blob/aio/_models.py +60 -38
  53. azure/storage/blob/aio/_upload_helpers.py +75 -66
  54. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.23.0.dist-info}/METADATA +7 -7
  55. azure_storage_blob-12.23.0.dist-info/RECORD +84 -0
  56. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.23.0.dist-info}/WHEEL +1 -1
  57. azure/storage/blob/_generated/_vendor.py +0 -16
  58. azure_storage_blob-12.21.0b1.dist-info/RECORD +0 -81
  59. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.23.0.dist-info}/LICENSE +0 -0
  60. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.23.0.dist-info}/top_level.txt +0 -0
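
The hunks below are from file 9 in the list above:

--- a/azure/storage/blob/_download.py
+++ b/azure/storage/blob/_download.py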
@@ -3,19 +3,23 @@
 # Licensed under the MIT License. See License.txt in the project root for
 # license information.
 # --------------------------------------------------------------------------
-
+import codecs
 import sys
 import threading
 import time
 import warnings
-from io import BytesIO
-from typing import Generic, IO, Iterator, Optional, TypeVar
+from io import BytesIO, StringIO
+from typing import (
+    Any, Callable, cast, Dict, Generator,
+    Generic, IO, Iterator, List, Optional,
+    overload, Tuple, TypeVar, Union, TYPE_CHECKING
+)
 
 from azure.core.exceptions import DecodeError, HttpResponseError, IncompleteReadError
 from azure.core.tracing.common import with_current_context
 
 from ._shared.request_handlers import validate_and_format_range_headers
-from ._shared.response_handlers import process_storage_error, parse_length_from_content_range
+from ._shared.response_handlers import parse_length_from_content_range, process_storage_error
 from ._deserialize import deserialize_blob_properties, get_page_ranges_result
 from ._encryption import (
     adjust_blob_size_for_encryption,
@@ -25,10 +29,25 @@ from ._encryption import (
     parse_encryption_data
 )
 
+if TYPE_CHECKING:
+    from codecs import IncrementalDecoder
+    from ._encryption import _EncryptionData
+    from ._generated import AzureBlobStorage
+    from ._generated.operations import BlobOperations
+    from ._models import BlobProperties
+    from ._shared.models import StorageConfiguration
+
+
 T = TypeVar('T', bytes, str)
 
 
-def process_range_and_offset(start_range, end_range, length, encryption_options, encryption_data):
+def process_range_and_offset(
+    start_range: int,
+    end_range: int,
+    length: Optional[int],
+    encryption_options: Dict[str, Any],
+    encryption_data: Optional["_EncryptionData"]
+) -> Tuple[Tuple[int, int], Tuple[int, int]]:
     start_offset, end_offset = 0, 0
     if encryption_options.get("key") is not None or encryption_options.get("resolver") is not None:
         return get_adjusted_download_range_and_offset(
@@ -40,7 +59,7 @@ def process_range_and_offset(start_range, end_range, length, encryption_options,
     return (start_range, end_range), (start_offset, end_offset)
 
 
-def process_content(data, start_offset, end_offset, encryption):
+def process_content(data: Any, start_offset: int, end_offset: int, encryption: Dict[str, Any]) -> bytes:
     if data is None:
         raise ValueError("Response cannot be None.")
 
@@ -49,7 +68,7 @@ def process_content(data, start_offset, end_offset, encryption):
     if content and encryption.get("key") is not None or encryption.get("resolver") is not None:
         try:
             return decrypt_blob(
-                encryption.get("required"),
+                encryption.get("required") or False,
                 encryption.get("key"),
                 encryption.get("resolver"),
                 content,
@@ -65,21 +84,21 @@ def process_content(data, start_offset, end_offset, encryption):
 class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
-        client=None,
-        non_empty_ranges=None,
-        total_size=None,
-        chunk_size=None,
-        current_progress=None,
-        start_range=None,
-        end_range=None,
-        stream=None,
-        parallel=None,
-        validate_content=None,
-        encryption_options=None,
-        encryption_data=None,
-        progress_hook=None,
-        **kwargs
-    ):
+        client: "BlobOperations",
+        total_size: int,
+        chunk_size: int,
+        current_progress: int,
+        start_range: int,
+        end_range: int,
+        validate_content: bool,
+        encryption_options: Dict[str, Any],
+        encryption_data: Optional["_EncryptionData"] = None,
+        stream: Any = None,
+        parallel: Optional[int] = None,
+        non_empty_ranges: Optional[List[Dict[str, Any]]] = None,
+        progress_hook: Optional[Callable[[int, Optional[int]], None]] = None,
+        **kwargs: Any
+    ) -> None:
         self.client = client
         self.non_empty_ranges = non_empty_ranges
 
@@ -110,32 +129,32 @@ class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
         self.validate_content = validate_content
         self.request_options = kwargs
 
-    def _calculate_range(self, chunk_start):
+    def _calculate_range(self, chunk_start: int) -> Tuple[int, int]:
         if chunk_start + self.chunk_size > self.end_index:
             chunk_end = self.end_index
         else:
             chunk_end = chunk_start + self.chunk_size
         return chunk_start, chunk_end
 
-    def get_chunk_offsets(self):
+    def get_chunk_offsets(self) -> Generator[int, None, None]:
         index = self.start_index
         while index < self.end_index:
             yield index
             index += self.chunk_size
 
-    def process_chunk(self, chunk_start):
+    def process_chunk(self, chunk_start: int) -> None:
         chunk_start, chunk_end = self._calculate_range(chunk_start)
-        chunk_data = self._download_chunk(chunk_start, chunk_end - 1)
+        chunk_data, _ = self._download_chunk(chunk_start, chunk_end - 1)
         length = chunk_end - chunk_start
         if length > 0:
             self._write_to_stream(chunk_data, chunk_start)
             self._update_progress(length)
 
-    def yield_chunk(self, chunk_start):
+    def yield_chunk(self, chunk_start: int) -> Tuple[bytes, int]:
         chunk_start, chunk_end = self._calculate_range(chunk_start)
         return self._download_chunk(chunk_start, chunk_end - 1)
 
-    def _update_progress(self, length):
+    def _update_progress(self, length: int) -> None:
         if self.progress_lock:
             with self.progress_lock:  # pylint: disable=not-context-manager
                 self.progress_total += length
@@ -145,7 +164,7 @@ class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
         if self.progress_hook:
             self.progress_hook(self.progress_total, self.total_size)
 
-    def _write_to_stream(self, chunk_data, chunk_start):
+    def _write_to_stream(self, chunk_data: bytes, chunk_start: int) -> None:
         if self.stream_lock:
             with self.stream_lock:  # pylint: disable=not-context-manager
                 self.stream.seek(self.stream_start + (chunk_start - self.start_index))
@@ -153,7 +172,7 @@ class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
         else:
             self.stream.write(chunk_data)
 
-    def _do_optimize(self, given_range_start, given_range_end):
+    def _do_optimize(self, given_range_start: int, given_range_end: int) -> bool:
         # If we have no page range list stored, then assume there's data everywhere for that page blob
         # or it's a block blob or append blob
         if self.non_empty_ranges is None:
@@ -178,7 +197,9 @@ class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
         # Went through all src_ranges, but nothing overlapped. Optimization will be applied.
         return True
 
-    def _download_chunk(self, chunk_start, chunk_end):
+    def _download_chunk(self, chunk_start: int, chunk_end: int) -> Tuple[bytes, int]:
+        if self.encryption_options is None:
+            raise ValueError("Required argument is missing: encryption_options")
         download_range, offset = process_range_and_offset(
             chunk_start, chunk_end, chunk_end, self.encryption_options, self.encryption_data
         )
@@ -186,8 +207,8 @@ class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
         # No need to download the empty chunk from server if there's no data in the chunk to be downloaded.
         # Do optimize and create empty chunk locally if condition is met.
         if self._do_optimize(download_range[0], download_range[1]):
-            data_size = download_range[1] - download_range[0] + 1
-            chunk_data = b"\x00" * data_size
+            content_length = download_range[1] - download_range[0] + 1
+            chunk_data = b"\x00" * content_length
         else:
             range_header, range_validation = validate_and_format_range_headers(
                 download_range[0],
@@ -198,6 +219,7 @@ class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
             retry_active = True
             retry_total = 3
             while retry_active:
+                response: Any = None
                 try:
                     _, response = self.client.download(
                         range=range_header,
@@ -218,34 +240,35 @@ class _ChunkDownloader(object):  # pylint: disable=too-many-instance-attributes
                     if retry_total <= 0:
                         raise HttpResponseError(error, error=error) from error
                     time.sleep(1)
+            content_length = response.content_length
 
         # This makes sure that if_match is set so that we can validate
         # that subsequent downloads are to an unmodified blob
         if self.request_options.get("modified_access_conditions"):
             self.request_options["modified_access_conditions"].if_match = response.properties.etag
 
-        return chunk_data
+        return chunk_data, content_length
 
 
 class _ChunkIterator(object):
-    """Async iterator for chunks in blob download stream."""
+    """Iterator for chunks in blob download stream."""
 
-    def __init__(self, size, content, downloader, chunk_size):
+    def __init__(self, size: int, content: bytes, downloader: Optional[_ChunkDownloader], chunk_size: int) -> None:
         self.size = size
         self._chunk_size = chunk_size
         self._current_content = content
         self._iter_downloader = downloader
-        self._iter_chunks = None
+        self._iter_chunks: Optional[Generator[int, None, None]] = None
         self._complete = size == 0
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.size
 
-    def __iter__(self):
+    def __iter__(self) -> Iterator[bytes]:
         return self
 
     # Iterate through responses.
-    def __next__(self):
+    def __next__(self) -> bytes:
         if self._complete:
             raise StopIteration("Download complete")
         if not self._iter_downloader:
@@ -263,8 +286,8 @@ class _ChunkIterator(object):
             return self._get_chunk_data()
 
         try:
-            chunk = next(self._iter_chunks)
-            self._current_content += self._iter_downloader.yield_chunk(chunk)
+            next_chunk = next(self._iter_chunks)
+            self._current_content += self._iter_downloader.yield_chunk(next_chunk)[0]
         except StopIteration as e:
             self._complete = True
             if self._current_content:
@@ -277,46 +300,46 @@ class _ChunkIterator(object):
 
     next = __next__  # Python 2 compatibility.
 
-    def _get_chunk_data(self):
+    def _get_chunk_data(self) -> bytes:
         chunk_data = self._current_content[: self._chunk_size]
         self._current_content = self._current_content[self._chunk_size:]
         return chunk_data
 
 
 class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
-    """A streaming object to download from Azure Storage.
-
-    :ivar str name:
-        The name of the blob being downloaded.
-    :ivar str container:
-        The name of the container where the blob is.
-    :ivar ~azure.storage.blob.BlobProperties properties:
-        The properties of the blob being downloaded. If only a range of the data is being
-        downloaded, this will be reflected in the properties.
-    :ivar int size:
-        The size of the total data in the stream. This will be the byte range if specified,
-        otherwise the total size of the blob.
+    """
+    A streaming object to download from Azure Storage.
     """
 
+    name: str
+    """The name of the blob being downloaded."""
+    container: str
+    """The name of the container where the blob is."""
+    properties: "BlobProperties"
+    """The properties of the blob being downloaded. If only a range of the data is being
+    downloaded, this will be reflected in the properties."""
+    size: int
+    """The size of the total data in the stream. This will be the byte range if specified,
+    otherwise the total size of the blob."""
+
     def __init__(
         self,
-        clients=None,
-        config=None,
-        start_range=None,
-        end_range=None,
-        validate_content=None,
-        encryption_options=None,
-        max_concurrency=1,
-        name=None,
-        container=None,
-        encoding=None,
-        download_cls=None,
-        **kwargs
-    ):
+        clients: "AzureBlobStorage" = None,  # type: ignore [assignment]
+        config: "StorageConfiguration" = None,  # type: ignore [assignment]
+        start_range: Optional[int] = None,
+        end_range: Optional[int] = None,
+        validate_content: bool = None,  # type: ignore [assignment]
+        encryption_options: Dict[str, Any] = None,  # type: ignore [assignment]
+        max_concurrency: int = 1,
+        name: str = None,  # type: ignore [assignment]
+        container: str = None,  # type: ignore [assignment]
+        encoding: Optional[str] = None,
+        download_cls: Optional[Callable] = None,
+        **kwargs: Any
+    ) -> None:
         self.name = name
         self.container = container
-        self.properties = None
-        self.size = None
+        self.size = 0
 
         self._clients = clients
         self._config = config
@@ -328,14 +351,27 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
         self._encryption_options = encryption_options or {}
         self._progress_hook = kwargs.pop('progress_hook', None)
         self._request_options = kwargs
+        self._response = None
         self._location_mode = None
-        self._download_complete = False
-        self._current_content = None
-        self._file_size = None
+        self._current_content: Union[str, bytes] = b''
+        self._file_size = 0
         self._non_empty_ranges = None
-        self._response = None
-        self._encryption_data = None
-        self._offset = 0
+        self._encryption_data: Optional["_EncryptionData"] = None
+
+        # The content download offset, after any processing (decryption), in bytes
+        self._download_offset = 0
+        # The raw download offset, before processing (decryption), in bytes
+        self._raw_download_offset = 0
+        # The offset the stream has been read to in bytes or chars depending on mode
+        self._read_offset = 0
+        # The offset into current_content that has been consumed in bytes or chars depending on mode
+        self._current_content_offset = 0
+
+        self._text_mode: Optional[bool] = None
+        self._decoder: Optional["IncrementalDecoder"] = None
+        # Whether the current content is the first chunk of download content or not
+        self._first_chunk = True
+        self._download_start = self._start_range or 0
 
         # The cls is passed in via download_cls to avoid conflicting arg name with Generic.__new__
         # but needs to be changed to cls in the request options.
@@ -347,14 +383,14 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
        # The service only provides transactional MD5s for chunks under 4MB.
        # If validate_content is on, get only self.MAX_CHUNK_GET_SIZE for the first
        # chunk so a transactional MD5 can be retrieved.
-        self._first_get_size = (
+        first_get_size = (
            self._config.max_single_get_size if not self._validate_content else self._config.max_chunk_get_size
        )
-        initial_request_start = self._start_range if self._start_range is not None else 0
-        if self._end_range is not None and self._end_range - self._start_range < self._first_get_size:
+        initial_request_start = self._download_start
+        if self._end_range is not None and self._end_range - initial_request_start < first_get_size:
            initial_request_end = self._end_range
        else:
-            initial_request_end = initial_request_start + self._first_get_size - 1
+            initial_request_end = initial_request_start + first_get_size - 1
 
        self._initial_range, self._initial_offset = process_range_and_offset(
            initial_request_start,
@@ -365,32 +401,31 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
         )
 
         self._response = self._initial_request()
-        self.properties = self._response.properties
+        self.properties = cast("BlobProperties", self._response.properties)
         self.properties.name = self.name
         self.properties.container = self.container
 
-        # Set the content length to the download size instead of the size of
-        # the last range
+        # Set the content length to the download size instead of the size of the last range
         self.properties.size = self.size
-
-        # Overwrite the content range to the user requested range
-        self.properties.content_range = f"bytes {self._start_range}-{self._end_range}/{self._file_size}"
+        self.properties.content_range = (f"bytes {self._download_start}-"
+                                         f"{self._end_range if self._end_range is not None else self._file_size - 1}/"
+                                         f"{self._file_size}")
 
         # Overwrite the content MD5 as it is the MD5 for the last range instead
         # of the stored MD5
         # TODO: Set to the stored MD5 when the service returns this
-        self.properties.content_md5 = None
+        self.properties.content_md5 = None  # type: ignore [attr-defined]
 
     def __len__(self):
         return self.size
 
-    def _get_encryption_data_request(self):
+    def _get_encryption_data_request(self) -> None:
         # Save current request cls
         download_cls = self._request_options.pop('cls', None)
         # Adjust cls for get_properties
         self._request_options['cls'] = deserialize_blob_properties
 
-        properties = self._clients.blob.get_properties(**self._request_options)
+        properties = cast("BlobProperties", self._clients.blob.get_properties(**self._request_options))
         # This will return None if there is no encryption metadata or there are parsing errors.
         # That is acceptable here, the proper error will be caught and surfaced when attempting
         # to decrypt the blob.
@@ -399,6 +434,12 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
         # Restore cls for download
         self._request_options['cls'] = download_cls
 
+    @property
+    def _download_complete(self):
+        if is_encryption_v2(self._encryption_data):
+            return self._download_offset >= self.size
+        return self._raw_download_offset >= self.size
+
     def _initial_request(self):
         range_header, range_validation = validate_and_format_range_headers(
             self._initial_range[0],
@@ -412,14 +453,14 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
         retry_total = 3
         while retry_active:
             try:
-                location_mode, response = self._clients.blob.download(
+                location_mode, response = cast(Tuple[Optional[str], Any], self._clients.blob.download(
                     range=range_header,
                     range_get_content_md5=range_validation,
                     validate_content=self._validate_content,
                     data_stream_total=None,
                     download_stream_current=0,
                     **self._request_options
-                )
+                ))
 
                 # Check the location we read from to ensure we use the same one
                 # for subsequent requests.
@@ -433,9 +474,9 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
                 # Remove any extra encryption data size from blob size
                 self._file_size = adjust_blob_size_for_encryption(self._file_size, self._encryption_data)
 
-                if self._end_range is not None:
+                if self._end_range is not None and self._start_range is not None:
                     # Use the end range index unless it is over the end of the file
-                    self.size = min(self._file_size, self._end_range - self._start_range + 1)
+                    self.size = min(self._file_size - self._start_range, self._end_range - self._start_range + 1)
                 elif self._start_range is not None:
                     self.size = self._file_size - self._start_range
                 else:
@@ -478,6 +519,8 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
                 if retry_total <= 0:
                     raise HttpResponseError(error, error=error) from error
                 time.sleep(1)
+        self._download_offset += len(self._current_content)
+        self._raw_download_offset += response.content_length
 
         # get page ranges to optimize downloading sparse page blob
         if response.properties.blob_type == 'PageBlob':
@@ -491,33 +534,18 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
             except HttpResponseError:
                 pass
 
-        # If the file is small, the download is complete at this point.
-        # If file size is large, download the rest of the file in chunks.
-        # For encryption V2, calculate based on size of decrypted content, not download size.
-        if is_encryption_v2(self._encryption_data):
-            self._download_complete = len(self._current_content) >= self.size
-        else:
-            self._download_complete = response.properties.size >= self.size
-
         if not self._download_complete and self._request_options.get("modified_access_conditions"):
             self._request_options["modified_access_conditions"].if_match = response.properties.etag
 
         return response
 
-    def _get_downloader_start_with_offset(self):
-        # Start where the initial request download ended
-        start = self._initial_range[1] + 1
-        # For encryption V2 only, adjust start to the end of the fetched data rather than download size
-        if self._encryption_options.get("key") is not None or self._encryption_options.get("resolver") is not None:
-            start = (self._start_range or 0) + len(self._current_content)
-
-        # Adjust the start based on any data read past the current content
-        start += (self._offset - len(self._current_content))
-        return start
+    def chunks(self) -> Iterator[bytes]:
+        """
+        Iterate over chunks in the download stream. Note, the iterator returned will
+        iterate over the entire download content, regardless of any data that was
+        previously read.
 
-    def chunks(self):
-        # type: () -> Iterator[bytes]
-        """Iterate over chunks in the download stream.
+        NOTE: If the stream has been partially read, some data may be re-downloaded by the iterator.
 
         :returns: An iterator of the chunks in the download stream.
         :rtype: Iterator[bytes]
@@ -531,81 +559,125 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
                 :dedent: 12
                 :caption: Download a blob using chunks().
         """
-        if self.size == 0 or self._download_complete:
-            iter_downloader = None
-        else:
-            data_end = self._file_size
-            if self._end_range is not None:
-                # Use the end range index unless it is over the end of the file
-                data_end = min(self._file_size, self._end_range + 1)
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. chunks is not supported in text mode.")
+        if self._encoding:
+            warnings.warn("Encoding is ignored with chunks as only bytes are supported.")
+
+        iter_downloader = None
+        # If we still have the first chunk buffered, use it. Otherwise, download all content again
+        if not self._first_chunk or not self._download_complete:
+            if self._first_chunk:
+                start = self._download_start + len(self._current_content)
+                current_progress = len(self._current_content)
+            else:
+                start = self._download_start
+                current_progress = 0
 
-            data_start = self._initial_range[1] + 1  # Start where the first download ended
-            # For encryption, adjust start to the end of the fetched data rather than download size
-            if self._encryption_options.get("key") is not None or self._encryption_options.get("resolver") is not None:
-                data_start = (self._start_range or 0) + len(self._current_content)
+            end = self._download_start + self.size
 
             iter_downloader = _ChunkDownloader(
                 client=self._clients.blob,
                 non_empty_ranges=self._non_empty_ranges,
                 total_size=self.size,
                 chunk_size=self._config.max_chunk_get_size,
-                current_progress=self._first_get_size,
-                start_range=data_start,
-                end_range=data_end,
-                stream=None,
-                parallel=False,
+                current_progress=current_progress,
+                start_range=start,
+                end_range=end,
                 validate_content=self._validate_content,
                 encryption_options=self._encryption_options,
                 encryption_data=self._encryption_data,
                 use_location=self._location_mode,
                 **self._request_options
             )
+
+        initial_content = self._current_content if self._first_chunk else b''
         return _ChunkIterator(
             size=self.size,
-            content=self._current_content,
+            content=cast(bytes, initial_content),
             downloader=iter_downloader,
             chunk_size=self._config.max_chunk_get_size)
 
-    def read(self, size: Optional[int] = -1) -> T:
+    @overload
+    def read(self, size: int = -1) -> T:
+        ...
+
+    @overload
+    def read(self, *, chars: Optional[int] = None) -> T:
+        ...
+
+    # pylint: disable-next=too-many-statements,too-many-branches
+    def read(self, size: int = -1, *, chars: Optional[int] = None) -> T:
         """
-        Read up to size bytes from the stream and return them. If size
-        is unspecified or is -1, all bytes will be read.
+        Read the specified bytes or chars from the stream. If `encoding`
+        was specified on `download_blob`, it is recommended to use the
+        chars parameter to read a specific number of chars to avoid decoding
+        errors. If size/chars is unspecified or negative all bytes will be read.
 
-        :param Optional[int] size:
+        :param int size:
             The number of bytes to download from the stream. Leave unspecified
-            or set to -1 to download all bytes.
+            or set negative to download all bytes.
+        :keyword Optional[int] chars:
+            The number of chars to download from the stream. Leave unspecified
+            or set negative to download all chars. Note, this can only be used
+            when encoding is specified on `download_blob`.
         :returns:
             The requested data as bytes or a string if encoding was specified. If
             the return value is empty, there is no more data to read.
         :rtype: T
         """
-        if size == -1:
-            return self.readall()
-        # Empty blob or already read to the end
-        if size == 0 or self._offset >= self.size:
-            return b'' if not self._encoding else ''
-
-        stream = BytesIO()
-        remaining_size = size
-
-        # Start by reading from current_content if there is data left
-        if self._offset < len(self._current_content):
-            start = self._offset
-            length = min(remaining_size, len(self._current_content) - self._offset)
-            read = stream.write(self._current_content[start:start + length])
-
-            remaining_size -= read
-            self._offset += read
-            if self._progress_hook:
-                self._progress_hook(self._offset, self.size)
-
-        if remaining_size > 0:
-            start_range = self._get_downloader_start_with_offset()
+        if size > -1 and self._encoding:
+            warnings.warn(
+                "Size parameter specified with text encoding enabled. It is recommended to use chars "
+                "to read a specific number of characters instead."
+            )
+        if size > -1 and chars is not None:
+            raise ValueError("Cannot specify both size and chars.")
+        if not self._encoding and chars is not None:
+            raise ValueError("Must specify encoding to read chars.")
+        if self._text_mode and size > -1:
+            raise ValueError("Stream has been partially read in text mode. Please use chars.")
+        if self._text_mode is False and chars is not None:
+            raise ValueError("Stream has been partially read in bytes mode. Please use size.")
 
-            # End is the min between the remaining size, the file size, and the end of the specified range
-            end_range = min(start_range + remaining_size, self._file_size)
-            if self._end_range is not None:
-                end_range = min(end_range, self._end_range + 1)
+        # Empty blob or already read to the end
+        if (size == 0 or chars == 0 or
+                (self._download_complete and self._current_content_offset >= len(self._current_content))):
+            return b'' if not self._encoding else ''  # type: ignore [return-value]
+
+        if not self._text_mode and chars is not None and self._encoding is not None:
+            self._text_mode = True
+            self._decoder = codecs.getincrementaldecoder(self._encoding)('strict')
+            self._current_content = self._decoder.decode(
+                cast(bytes, self._current_content), final=self._download_complete)
+        elif self._text_mode is None:
+            self._text_mode = False
+
+        output_stream: Union[BytesIO, StringIO]
+        if self._text_mode:
+            output_stream = StringIO()
+            size = chars if chars else sys.maxsize
+        else:
+            output_stream = BytesIO()
+            size = size if size > 0 else sys.maxsize
+        readall = size == sys.maxsize
+        count = 0
+
+        # Start by reading from current_content
+        start = self._current_content_offset
+        length = min(len(self._current_content) - self._current_content_offset, size - count)
+        read = output_stream.write(self._current_content[start:start + length])  # type: ignore [arg-type]
+
+        count += read
+        self._current_content_offset += read
+        self._read_offset += read
+        self._check_and_report_progress()
+
+        remaining = size - count
+        if remaining > 0 and not self._download_complete:
+            # Create a downloader than can download the rest of the file
+            start = self._download_start + self._download_offset
+            end = self._download_start + self.size
 
             parallel = self._max_concurrency > 1
             downloader = _ChunkDownloader(
613
685
  non_empty_ranges=self._non_empty_ranges,
614
686
  total_size=self.size,
615
687
  chunk_size=self._config.max_chunk_get_size,
616
- current_progress=self._offset,
617
- start_range=start_range,
618
- end_range=end_range,
619
- stream=stream,
688
+ current_progress=self._read_offset,
689
+ start_range=start,
690
+ end_range=end,
691
+ stream=output_stream,
620
692
  parallel=parallel,
621
693
  validate_content=self._validate_content,
622
694
  encryption_options=self._encryption_options,
@@ -625,24 +697,60 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
625
697
  progress_hook=self._progress_hook,
626
698
  **self._request_options
627
699
  )
628
-
629
- if parallel and remaining_size > self._config.max_chunk_get_size:
630
- import concurrent.futures
631
- with concurrent.futures.ThreadPoolExecutor(self._max_concurrency) as executor:
632
- list(executor.map(
700
+ self._first_chunk = False
701
+
702
+ # When reading all data, have the downloader read everything into the stream.
703
+ # Else, read one chunk at a time (using the downloader as an iterator) until
704
+ # the requested size is reached.
705
+ chunks_iter = downloader.get_chunk_offsets()
706
+ if readall and not self._text_mode:
707
+ # Only do parallel if there is more than one chunk left to download
708
+ if parallel and (self.size - self._download_offset) > self._config.max_chunk_get_size:
709
+ import concurrent.futures
710
+ with concurrent.futures.ThreadPoolExecutor(self._max_concurrency) as executor:
711
+ list(executor.map(
633
712
  with_current_context(downloader.process_chunk),
634
713
  downloader.get_chunk_offsets()
635
714
  ))
636
- else:
637
- for chunk in downloader.get_chunk_offsets():
638
- downloader.process_chunk(chunk)
715
+ else:
716
+ for next_chunk in chunks_iter:
717
+ downloader.process_chunk(next_chunk)
639
718
 
640
- self._offset += remaining_size
719
+ self._complete_read()
641
720
 
642
- data = stream.getvalue()
643
- if self._encoding:
644
- return data.decode(self._encoding)
645
- return data
721
+ else:
722
+ while (chunk := next(chunks_iter, None)) is not None and remaining > 0:
723
+ chunk_data, content_length = downloader.yield_chunk(chunk)
724
+ self._download_offset += len(chunk_data)
725
+ self._raw_download_offset += content_length
726
+ if self._text_mode and self._decoder is not None:
727
+ self._current_content = self._decoder.decode(chunk_data, final=self._download_complete)
728
+ else:
729
+ self._current_content = chunk_data
730
+
731
+ if remaining < len(self._current_content):
732
+ read = output_stream.write(self._current_content[:remaining]) # type: ignore [arg-type]
733
+ else:
734
+ read = output_stream.write(self._current_content) # type: ignore [arg-type]
735
+
736
+ self._current_content_offset = read
737
+ self._read_offset += read
738
+ remaining -= read
739
+ self._check_and_report_progress()
740
+
741
+ data = output_stream.getvalue()
742
+ if not self._text_mode and self._encoding:
743
+ try:
744
+ # This is technically incorrect to do, but we have it for backwards compatibility.
745
+ data = cast(bytes, data).decode(self._encoding)
746
+ except UnicodeDecodeError:
747
+ warnings.warn(
748
+ "Encountered a decoding error while decoding blob data from a partial read. "
749
+ "Try using the `chars` keyword instead to read in text mode."
750
+ )
751
+ raise
752
+
753
+ return data # type: ignore [return-value]
646
754
 
647
755
  def readall(self) -> T:
648
756
  """
@@ -652,53 +760,7 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
         :returns: The requested data as bytes or a string if encoding was specified.
         :rtype: T
         """
-        stream = BytesIO()
-        self.readinto(stream)
-        data = stream.getvalue()
-        if self._encoding:
-            return data.decode(self._encoding)
-        return data
-
-    def content_as_bytes(self, max_concurrency=1):
-        """DEPRECATED: Download the contents of this file.
-
-        This operation is blocking until all data is downloaded.
-
-        This method is deprecated, use func:`readall` instead.
-
-        :param int max_concurrency:
-            The number of parallel connections with which to download.
-        :returns: The contents of the file as bytes.
-        :rtype: bytes
-        """
-        warnings.warn(
-            "content_as_bytes is deprecated, use readall instead",
-            DeprecationWarning
-        )
-        self._max_concurrency = max_concurrency
-        return self.readall()
-
-    def content_as_text(self, max_concurrency=1, encoding="UTF-8"):
-        """DEPRECATED: Download the contents of this blob, and decode as text.
-
-        This operation is blocking until all data is downloaded.
-
-        This method is deprecated, use func:`readall` instead.
-
-        :param int max_concurrency:
-            The number of parallel connections with which to download.
-        :param str encoding:
-            Test encoding to decode the downloaded bytes. Default is UTF-8.
-        :returns: The content of the file as a str.
-        :rtype: str
-        """
-        warnings.warn(
-            "content_as_text is deprecated, use readall instead",
-            DeprecationWarning
-        )
-        self._max_concurrency = max_concurrency
-        self._encoding = encoding
-        return self.readall()
+        return self.read()
 
     def readinto(self, stream: IO[bytes]) -> int:
         """Download the contents of this file to a stream.
@@ -710,6 +772,11 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
         :returns: The number of bytes read.
         :rtype: int
         """
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. readinto is not supported in text mode.")
+        if self._encoding:
+            warnings.warn("Encoding is ignored with readinto as only byte streams are supported.")
+
         # The stream must be seekable if parallel download is required
         parallel = self._max_concurrency > 1
         if parallel:
@@ -723,35 +790,34 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
             raise ValueError(error_message) from exc
 
         # If some data has been streamed using `read`, only stream the remaining data
-        remaining_size = self.size - self._offset
+        remaining_size = self.size - self._read_offset
         # Already read to the end
         if remaining_size <= 0:
             return 0
 
-        # Write the content to the user stream if there is data left
-        if self._offset < len(self._current_content):
-            content = self._current_content[self._offset:]
-            stream.write(content)
-            self._offset += len(content)
-            if self._progress_hook:
-                self._progress_hook(len(content), self.size)
+        # Write the current content to the user stream
+        current_remaining = len(self._current_content) - self._current_content_offset
+        start = self._current_content_offset
+        count = stream.write(cast(bytes, self._current_content[start:start + current_remaining]))
+
+        self._current_content_offset += count
+        self._read_offset += count
+        if self._progress_hook:
+            self._progress_hook(self._read_offset, self.size)
 
+        # If all the data was already downloaded/buffered
         if self._download_complete:
             return remaining_size
 
-        data_end = self._file_size
-        if self._end_range is not None:
-            # Use the length unless it is over the end of the file
-            data_end = min(self._file_size, self._end_range + 1)
-
-        data_start = self._get_downloader_start_with_offset()
+        data_start = self._download_start + self._read_offset
+        data_end = self._download_start + self.size
 
         downloader = _ChunkDownloader(
             client=self._clients.blob,
             non_empty_ranges=self._non_empty_ranges,
             total_size=self.size,
             chunk_size=self._config.max_chunk_get_size,
-            current_progress=self._offset,
+            current_progress=self._read_offset,
             start_range=data_start,
             end_range=data_end,
             stream=stream,
@@ -774,8 +840,72 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
             for chunk in downloader.get_chunk_offsets():
                 downloader.process_chunk(chunk)
 
+        self._complete_read()
         return remaining_size
 
+    def _complete_read(self):
+        """Adjusts all offsets to the end of the download."""
+        self._download_offset = self.size
+        self._raw_download_offset = self.size
+        self._read_offset = self.size
+        self._current_content_offset = len(self._current_content)
+
+    def _check_and_report_progress(self):
+        """Reports progress if necessary."""
+        # Only report progress at the end of each chunk and use download_offset to always report
+        # progress in terms of (approximate) byte count.
+        if self._progress_hook and self._current_content_offset == len(self._current_content):
+            self._progress_hook(self._download_offset, self.size)
+
+    def content_as_bytes(self, max_concurrency=1):
+        """DEPRECATED: Download the contents of this file.
+
+        This operation is blocking until all data is downloaded.
+
+        This method is deprecated, use func:`readall` instead.
+
+        :param int max_concurrency:
+            The number of parallel connections with which to download.
+        :returns: The contents of the file as bytes.
+        :rtype: bytes
+        """
+        warnings.warn(
+            "content_as_bytes is deprecated, use readall instead",
+            DeprecationWarning
+        )
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. "
+                             "content_as_bytes is not supported in text mode.")
+
+        self._max_concurrency = max_concurrency
+        return self.readall()
+
+    def content_as_text(self, max_concurrency=1, encoding="UTF-8"):
+        """DEPRECATED: Download the contents of this blob, and decode as text.
+
+        This operation is blocking until all data is downloaded.
+
+        This method is deprecated, use func:`readall` instead.
+
+        :param int max_concurrency:
+            The number of parallel connections with which to download.
+        :param str encoding:
+            Test encoding to decode the downloaded bytes. Default is UTF-8.
+        :returns: The content of the file as a str.
+        :rtype: str
+        """
+        warnings.warn(
+            "content_as_text is deprecated, use readall instead",
+            DeprecationWarning
+        )
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. "
+                             "content_as_text is not supported in text mode.")
+
+        self._max_concurrency = max_concurrency
+        self._encoding = encoding
+        return self.readall()
+
     def download_to_stream(self, stream, max_concurrency=1):
         """DEPRECATED: Download the contents of this blob to a stream.
@@ -794,6 +924,10 @@ class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
             "download_to_stream is deprecated, use readinto instead",
             DeprecationWarning
         )
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. "
+                             "download_to_stream is not supported in text mode.")
+
         self._max_concurrency = max_concurrency
         self.readinto(stream)
         return self.properties
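
The headline change in _download.py above is text-mode reading: `read` gains a keyword-only `chars` argument and decodes incrementally via `codecs.getincrementaldecoder`, rather than decoding the whole buffer at the end. A minimal sketch of the new parameter in use; the connection-string environment variable, container name, and blob name below are placeholders, not part of the diff:

    import os

    from azure.storage.blob import BlobClient

    blob = BlobClient.from_connection_string(
        os.environ["AZURE_STORAGE_CONNECTION_STRING"],  # assumed to be set
        container_name="mycontainer",                   # placeholder container
        blob_name="example.txt",                        # placeholder blob
    )

    # Passing encoding makes the downloader yield str instead of bytes.
    downloader = blob.download_blob(encoding="utf-8")

    first = downloader.read(chars=100)  # first 100 characters, decoded incrementally
    rest = downloader.read()            # remainder of the blob as str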
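The reworked offset bookkeeping (`_read_offset`, `_current_content_offset`) also lets a stream be partially read in bytes mode and then drained with `readinto`, which now resumes from the read offset instead of restarting. A sketch under the same placeholder names as above:

    import io

    downloader = blob.download_blob(max_concurrency=4)
    header = downloader.read(512)  # first 512 bytes, bytes mode

    rest = io.BytesIO()
    downloader.readinto(rest)      # streams only the bytes after the first 512

Note the guards added in the diff: once a stream has been read in text mode (`chars=...`), `readinto`, `chunks`, and the deprecated `content_as_*` methods raise ValueError, and `chunks()` may re-download data already consumed via `read`.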