azure-storage-blob 12.21.0b1__py3-none-any.whl → 12.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. azure/storage/blob/__init__.py +19 -18
  2. azure/storage/blob/_blob_client.py +470 -1555
  3. azure/storage/blob/_blob_client_helpers.py +1242 -0
  4. azure/storage/blob/_blob_service_client.py +93 -112
  5. azure/storage/blob/_blob_service_client_helpers.py +27 -0
  6. azure/storage/blob/_container_client.py +169 -376
  7. azure/storage/blob/_container_client_helpers.py +261 -0
  8. azure/storage/blob/_deserialize.py +68 -44
  9. azure/storage/blob/_download.py +375 -241
  10. azure/storage/blob/_encryption.py +14 -7
  11. azure/storage/blob/_generated/py.typed +1 -0
  12. azure/storage/blob/_lease.py +52 -63
  13. azure/storage/blob/_list_blobs_helper.py +129 -135
  14. azure/storage/blob/_models.py +480 -277
  15. azure/storage/blob/_quick_query_helper.py +30 -31
  16. azure/storage/blob/_serialize.py +38 -56
  17. azure/storage/blob/_shared/avro/datafile.py +1 -1
  18. azure/storage/blob/_shared/avro/datafile_async.py +1 -1
  19. azure/storage/blob/_shared/base_client.py +1 -1
  20. azure/storage/blob/_shared/base_client_async.py +1 -1
  21. azure/storage/blob/_shared/policies.py +8 -6
  22. azure/storage/blob/_shared/policies_async.py +3 -1
  23. azure/storage/blob/_shared/response_handlers.py +6 -2
  24. azure/storage/blob/_shared/shared_access_signature.py +2 -2
  25. azure/storage/blob/_shared/uploads.py +1 -1
  26. azure/storage/blob/_shared/uploads_async.py +1 -1
  27. azure/storage/blob/_shared_access_signature.py +70 -53
  28. azure/storage/blob/_upload_helpers.py +75 -68
  29. azure/storage/blob/_version.py +1 -1
  30. azure/storage/blob/aio/__init__.py +19 -11
  31. azure/storage/blob/aio/_blob_client_async.py +554 -301
  32. azure/storage/blob/aio/_blob_service_client_async.py +148 -97
  33. azure/storage/blob/aio/_container_client_async.py +282 -139
  34. azure/storage/blob/aio/_download_async.py +408 -283
  35. azure/storage/blob/aio/_lease_async.py +61 -60
  36. azure/storage/blob/aio/_list_blobs_helper.py +94 -96
  37. azure/storage/blob/aio/_models.py +60 -38
  38. azure/storage/blob/aio/_upload_helpers.py +75 -66
  39. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.22.0.dist-info}/METADATA +7 -7
  40. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.22.0.dist-info}/RECORD +43 -39
  41. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.22.0.dist-info}/WHEEL +1 -1
  42. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.22.0.dist-info}/LICENSE +0 -0
  43. {azure_storage_blob-12.21.0b1.dist-info → azure_storage_blob-12.22.0.dist-info}/top_level.txt +0 -0
--- a/azure/storage/blob/aio/_download_async.py
+++ b/azure/storage/blob/aio/_download_async.py
@@ -4,19 +4,25 @@
 # license information.
 # --------------------------------------------------------------------------
 # pylint: disable=invalid-overridden-method
+# mypy: disable-error-code=override

+import asyncio
+import codecs
 import sys
 import warnings
-from io import BytesIO
+from io import BytesIO, StringIO
 from itertools import islice
-from typing import AsyncIterator, Generic, IO, Optional, TypeVar
-
-import asyncio
+from typing import (
+    Any, AsyncIterator, Awaitable,
+    Generator, Callable, cast, Dict,
+    Generic, IO, Optional, overload,
+    Tuple, TypeVar, Union, TYPE_CHECKING
+)

 from azure.core.exceptions import HttpResponseError

 from .._shared.request_handlers import validate_and_format_range_headers
-from .._shared.response_handlers import process_storage_error, parse_length_from_content_range
+from .._shared.response_handlers import parse_length_from_content_range, process_storage_error
 from .._deserialize import deserialize_blob_properties, get_page_ranges_result
 from .._download import process_range_and_offset, _ChunkDownloader
 from .._encryption import (
@@ -26,17 +32,25 @@ from .._encryption import (
     parse_encryption_data
 )

+if TYPE_CHECKING:
+    from codecs import IncrementalDecoder
+    from .._encryption import _EncryptionData
+    from .._generated.aio import AzureBlobStorage
+    from .._models import BlobProperties
+    from .._shared.models import StorageConfiguration
+
+
 T = TypeVar('T', bytes, str)


-async def process_content(data, start_offset, end_offset, encryption):
+async def process_content(data: Any, start_offset: int, end_offset: int, encryption: Dict[str, Any]) -> bytes:
     if data is None:
         raise ValueError("Response cannot be None.")
-    content = data.response.body()
+    content = cast(bytes, data.response.body())
     if encryption.get('key') is not None or encryption.get('resolver') is not None:
         try:
             return decrypt_blob(
-                encryption.get('required'),
+                encryption.get('required') or False,
                 encryption.get('key'),
                 encryption.get('resolver'),
                 content,
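The `encryption` dict that `process_content` receives is only visible here through its `.get()` calls; a sketch of the expected shape, inferred from those calls (not an SDK-documented structure):

    # Keys inferred from the encryption.get(...) calls above; values illustrative.
    encryption = {
        'required': False,   # treated as False when unset, per the change above
        'key': None,         # optional key-encryption-key object
        'resolver': None,    # optional callable resolving a key by its id
    }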
@@ -52,42 +66,45 @@ async def process_content(data, start_offset, end_offset, encryption):


 class _AsyncChunkDownloader(_ChunkDownloader):
-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs: Any) -> None:
         super(_AsyncChunkDownloader, self).__init__(**kwargs)
-        self.stream_lock = asyncio.Lock() if kwargs.get('parallel') else None
-        self.progress_lock = asyncio.Lock() if kwargs.get('parallel') else None
+        self.stream_lock_async = asyncio.Lock() if kwargs.get('parallel') else None
+        self.progress_lock_async = asyncio.Lock() if kwargs.get('parallel') else None

-    async def process_chunk(self, chunk_start):
+    async def process_chunk(self, chunk_start: int) -> None:
         chunk_start, chunk_end = self._calculate_range(chunk_start)
-        chunk_data = await self._download_chunk(chunk_start, chunk_end - 1)
+        chunk_data, _ = await self._download_chunk(chunk_start, chunk_end - 1)
         length = chunk_end - chunk_start
         if length > 0:
             await self._write_to_stream(chunk_data, chunk_start)
             await self._update_progress(length)

-    async def yield_chunk(self, chunk_start):
+    async def yield_chunk(self, chunk_start: int) -> Tuple[bytes, int]:
         chunk_start, chunk_end = self._calculate_range(chunk_start)
         return await self._download_chunk(chunk_start, chunk_end - 1)

-    async def _update_progress(self, length):
-        if self.progress_lock:
-            async with self.progress_lock:  # pylint: disable=not-async-context-manager
+    async def _update_progress(self, length: int) -> None:
+        if self.progress_lock_async:
+            async with self.progress_lock_async:
                 self.progress_total += length
         else:
             self.progress_total += length

         if self.progress_hook:
-            await self.progress_hook(self.progress_total, self.total_size)
+            await cast(Callable[[int, Optional[int]], Awaitable[Any]], self.progress_hook)(
+                self.progress_total, self.total_size)

-    async def _write_to_stream(self, chunk_data, chunk_start):
-        if self.stream_lock:
-            async with self.stream_lock:  # pylint: disable=not-async-context-manager
+    async def _write_to_stream(self, chunk_data: bytes, chunk_start: int) -> None:
+        if self.stream_lock_async:
+            async with self.stream_lock_async:
                 self.stream.seek(self.stream_start + (chunk_start - self.start_index))
                 self.stream.write(chunk_data)
         else:
             self.stream.write(chunk_data)

-    async def _download_chunk(self, chunk_start, chunk_end):
+    async def _download_chunk(self, chunk_start: int, chunk_end: int) -> Tuple[bytes, int]:
+        if self.encryption_options is None:
+            raise ValueError("Required argument is missing: encryption_options")
         download_range, offset = process_range_and_offset(
             chunk_start, chunk_end, chunk_end, self.encryption_options, self.encryption_data
         )
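The renamed `stream_lock_async`/`progress_lock_async` attributes keep the existing pattern of paying for an `asyncio.Lock` only when chunks run in parallel. A minimal standalone sketch of that pattern (illustrative, not SDK code):

    import asyncio

    class ProgressTracker:
        def __init__(self, parallel: bool) -> None:
            # Only create a lock when concurrent updates are possible.
            self._lock = asyncio.Lock() if parallel else None
            self.total = 0

        async def add(self, length: int) -> None:
            if self._lock:
                async with self._lock:
                    self.total += length
            else:
                self.total += length

    async def demo() -> None:
        tracker = ProgressTracker(parallel=True)
        await asyncio.gather(*(tracker.add(10) for _ in range(5)))
        print(tracker.total)  # 50

    asyncio.run(demo())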
@@ -95,8 +112,8 @@ class _AsyncChunkDownloader(_ChunkDownloader):
         # No need to download the empty chunk from server if there's no data in the chunk to be downloaded.
         # Do optimize and create empty chunk locally if condition is met.
         if self._do_optimize(download_range[0], download_range[1]):
-            data_size = download_range[1] - download_range[0] + 1
-            chunk_data = b"\x00" * data_size
+            content_length = download_range[1] - download_range[0] + 1
+            chunk_data = b"\x00" * content_length
         else:
             range_header, range_validation = validate_and_format_range_headers(
                 download_range[0],
@@ -104,51 +121,51 @@ class _AsyncChunkDownloader(_ChunkDownloader):
                 check_content_md5=self.validate_content
             )
             try:
-                _, response = await self.client.download(
+                _, response = await cast(Awaitable[Any], self.client.download(
                     range=range_header,
                     range_get_content_md5=range_validation,
                     validate_content=self.validate_content,
                     data_stream_total=self.total_size,
                     download_stream_current=self.progress_total,
                     **self.request_options
-                )
+                ))

             except HttpResponseError as error:
                 process_storage_error(error)

             chunk_data = await process_content(response, offset[0], offset[1], self.encryption_options)
-
+            content_length = response.content_length

         # This makes sure that if_match is set so that we can validate
         # that subsequent downloads are to an unmodified blob
         if self.request_options.get('modified_access_conditions'):
             self.request_options['modified_access_conditions'].if_match = response.properties.etag

-        return chunk_data
+        return chunk_data, content_length


 class _AsyncChunkIterator(object):
     """Async iterator for chunks in blob download stream."""

-    def __init__(self, size, content, downloader, chunk_size):
+    def __init__(self, size: int, content: bytes, downloader: Optional[_AsyncChunkDownloader], chunk_size: int) -> None:
         self.size = size
         self._chunk_size = chunk_size
         self._current_content = content
         self._iter_downloader = downloader
-        self._iter_chunks = None
+        self._iter_chunks: Optional[Generator[int, None, None]] = None
         self._complete = size == 0

-    def __len__(self):
+    def __len__(self) -> int:
         return self.size

-    def __iter__(self):
+    def __iter__(self) -> None:
         raise TypeError("Async stream must be iterated asynchronously.")

-    def __aiter__(self):
+    def __aiter__(self) -> AsyncIterator[bytes]:
         return self

     # Iterate through responses.
-    async def __anext__(self):
+    async def __anext__(self) -> bytes:
         if self._complete:
             raise StopAsyncIteration("Download complete")
         if not self._iter_downloader:
@@ -167,7 +184,7 @@ class _AsyncChunkIterator(object):

         try:
             chunk = next(self._iter_chunks)
-            self._current_content += await self._iter_downloader.yield_chunk(chunk)
+            self._current_content += (await self._iter_downloader.yield_chunk(chunk))[0]
         except StopIteration as exc:
             self._complete = True
             # it's likely that there some data left in self._current_content
@@ -177,46 +194,46 @@ class _AsyncChunkIterator(object):

         return self._get_chunk_data()

-    def _get_chunk_data(self):
+    def _get_chunk_data(self) -> bytes:
         chunk_data = self._current_content[: self._chunk_size]
         self._current_content = self._current_content[self._chunk_size:]
         return chunk_data


 class StorageStreamDownloader(Generic[T]):  # pylint: disable=too-many-instance-attributes
-    """A streaming object to download from Azure Storage.
-
-    :ivar str name:
-        The name of the blob being downloaded.
-    :ivar str container:
-        The name of the container where the blob is.
-    :ivar ~azure.storage.blob.BlobProperties properties:
-        The properties of the blob being downloaded. If only a range of the data is being
-        downloaded, this will be reflected in the properties.
-    :ivar int size:
-        The size of the total data in the stream. This will be the byte range if specified,
-        otherwise the total size of the blob.
+    """
+    A streaming object to download from Azure Storage.
     """

+    name: str
+    """The name of the blob being downloaded."""
+    container: str
+    """The name of the container where the blob is."""
+    properties: "BlobProperties"
+    """The properties of the blob being downloaded. If only a range of the data is being
+    downloaded, this will be reflected in the properties."""
+    size: int
+    """The size of the total data in the stream. This will be the byte range if specified,
+    otherwise the total size of the blob."""
+
     def __init__(
         self,
-        clients=None,
-        config=None,
-        start_range=None,
-        end_range=None,
-        validate_content=None,
-        encryption_options=None,
-        max_concurrency=1,
-        name=None,
-        container=None,
-        encoding=None,
-        download_cls=None,
-        **kwargs
-    ):
+        clients: "AzureBlobStorage" = None,  # type: ignore [assignment]
+        config: "StorageConfiguration" = None,  # type: ignore [assignment]
+        start_range: Optional[int] = None,
+        end_range: Optional[int] = None,
+        validate_content: bool = None,  # type: ignore [assignment]
+        encryption_options: Dict[str, Any] = None,  # type: ignore [assignment]
+        max_concurrency: int = 1,
+        name: str = None,  # type: ignore [assignment]
+        container: str = None,  # type: ignore [assignment]
+        encoding: Optional[str] = None,
+        download_cls: Optional[Callable] = None,
+        **kwargs: Any
+    ) -> None:
         self.name = name
         self.container = container
-        self.properties = None
-        self.size = None
+        self.size = 0

         self._clients = clients
         self._config = config
@@ -228,38 +245,42 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
         self._encryption_options = encryption_options or {}
         self._progress_hook = kwargs.pop('progress_hook', None)
         self._request_options = kwargs
+        self._response = None
         self._location_mode = None
-        self._download_complete = False
-        self._current_content = None
-        self._file_size = None
+        self._current_content: Union[str, bytes] = b''
+        self._file_size = 0
         self._non_empty_ranges = None
-        self._response = None
-        self._encryption_data = None
-        self._offset = 0
-
-        self._initial_range = None
-        self._initial_offset = None
+        self._encryption_data: Optional["_EncryptionData"] = None
+
+        # The content download offset, after any processing (decryption), in bytes
+        self._download_offset = 0
+        # The raw download offset, before processing (decryption), in bytes
+        self._raw_download_offset = 0
+        # The offset the stream has been read to in bytes or chars depending on mode
+        self._read_offset = 0
+        # The offset into current_content that has been consumed in bytes or chars depending on mode
+        self._current_content_offset = 0
+
+        self._text_mode: Optional[bool] = None
+        self._decoder: Optional["IncrementalDecoder"] = None
+        # Whether the current content is the first chunk of download content or not
+        self._first_chunk = True
+        self._download_start = self._start_range or 0

         # The cls is passed in via download_cls to avoid conflicting arg name with Generic.__new__
         # but needs to be changed to cls in the request options.
         self._request_options['cls'] = download_cls

-        # The service only provides transactional MD5s for chunks under 4MB.
-        # If validate_content is on, get only self.MAX_CHUNK_GET_SIZE for the first
-        # chunk so a transactional MD5 can be retrieved.
-        self._first_get_size = self._config.max_single_get_size if not self._validate_content \
-            else self._config.max_chunk_get_size
-
     def __len__(self):
         return self.size

-    async def _get_encryption_data_request(self):
+    async def _get_encryption_data_request(self) -> None:
         # Save current request cls
         download_cls = self._request_options.pop('cls', None)
         # Adjust cls for get_properties
         self._request_options['cls'] = deserialize_blob_properties

-        properties = await self._clients.blob.get_properties(**self._request_options)
+        properties = cast("BlobProperties", await self._clients.blob.get_properties(**self._request_options))
         # This will return None if there is no encryption metadata or there are parsing errors.
         # That is acceptable here, the proper error will be caught and surfaced when attempting
         # to decrypt the blob.
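How the four offsets that replace the old single `_offset` relate, per the inline comments above (a sketch, not SDK documentation):

    # _raw_download_offset     bytes fetched from the service (pre-decryption)
    # _download_offset         bytes buffered after processing (decryption)
    # _read_offset             bytes/chars already handed back via read()
    # _current_content_offset  position consumed within the buffered chunk
    #
    # Per the _download_complete property added below, the download is done
    # once _download_offset (encryption v2) or _raw_download_offset
    # (otherwise) reaches size.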
@@ -268,16 +289,23 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
         # Restore cls for download
         self._request_options['cls'] = download_cls

-    async def _setup(self):
+    async def _setup(self) -> None:
         if self._encryption_options.get("key") is not None or self._encryption_options.get("resolver") is not None:
             await self._get_encryption_data_request()

+        # The service only provides transactional MD5s for chunks under 4MB.
+        # If validate_content is on, get only self.MAX_CHUNK_GET_SIZE for the first
+        # chunk so a transactional MD5 can be retrieved.
+        first_get_size = (
+            self._config.max_single_get_size if not self._validate_content else self._config.max_chunk_get_size
+        )
         initial_request_start = self._start_range if self._start_range is not None else 0
-        if self._end_range is not None and self._end_range - self._start_range < self._first_get_size:
+        if self._end_range is not None and self._end_range - initial_request_start < first_get_size:
             initial_request_end = self._end_range
         else:
-            initial_request_end = initial_request_start + self._first_get_size - 1
+            initial_request_end = initial_request_start + first_get_size - 1

+        # pylint: disable-next=attribute-defined-outside-init
         self._initial_range, self._initial_offset = process_range_and_offset(
             initial_request_start,
             initial_request_end,
@@ -287,44 +315,26 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
         )

         self._response = await self._initial_request()
-
-        self.properties = self._response.properties
+        self.properties = cast("BlobProperties", self._response.properties)  # type: ignore [attr-defined]
         self.properties.name = self.name
         self.properties.container = self.container

-        # Set the content length to the download size instead of the size of
-        # the last range
-        initial_size = self._response.properties.size
+        # Set the content length to the download size instead of the size of the last range
         self.properties.size = self.size
-
-        # Overwrite the content range to the user requested range
-        self.properties.content_range = f'bytes {self._start_range}-{self._end_range}/{self._file_size}'
+        self.properties.content_range = (f"bytes {self._download_start}-"
+                                         f"{self._end_range if self._end_range is not None else self._file_size - 1}/"
+                                         f"{self._file_size}")

         # Overwrite the content MD5 as it is the MD5 for the last range instead
         # of the stored MD5
         # TODO: Set to the stored MD5 when the service returns this
-        self.properties.content_md5 = None
+        self.properties.content_md5 = None  # type: ignore [attr-defined]

-        if self.size == 0:
-            self._current_content = b""
-        else:
-            self._current_content = await process_content(
-                self._response,
-                self._initial_offset[0],
-                self._initial_offset[1],
-                self._encryption_options
-            )
-
-        # If the file is small, the download is complete at this point.
-        # If file size is large, download the rest of the file in chunks.
-        # For encryption V2, calculate based on size of decrypted content, not download size.
+    @property
+    def _download_complete(self):
         if is_encryption_v2(self._encryption_data):
-            self._download_complete = len(self._current_content) >= self.size
-        else:
-            self._download_complete = initial_size >= self.size
-
-        if not self._download_complete and self._request_options.get("modified_access_conditions"):
-            self._request_options["modified_access_conditions"].if_match = self._response.properties.etag
+            return self._download_offset >= self.size
+        return self._raw_download_offset >= self.size

     async def _initial_request(self):
         range_header, range_validation = validate_and_format_range_headers(
@@ -335,13 +345,13 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
             check_content_md5=self._validate_content)

         try:
-            location_mode, response = await self._clients.blob.download(
+            location_mode, response = cast(Tuple[Optional[str], Any], await self._clients.blob.download(
                 range=range_header,
                 range_get_content_md5=range_validation,
                 validate_content=self._validate_content,
                 data_stream_total=None,
                 download_stream_current=0,
-                **self._request_options)
+                **self._request_options))

             # Check the location we read from to ensure we use the same one
             # for subsequent requests.
@@ -355,9 +365,9 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
             # Remove any extra encryption data size from blob size
             self._file_size = adjust_blob_size_for_encryption(self._file_size, self._encryption_data)

-            if self._end_range is not None:
+            if self._end_range is not None and self._start_range is not None:
                 # Use the length unless it is over the end of the file
-                self.size = min(self._file_size, self._end_range - self._start_range + 1)
+                self.size = min(self._file_size - self._start_range, self._end_range - self._start_range + 1)
             elif self._start_range is not None:
                 self.size = self._file_size - self._start_range
             else:
@@ -369,11 +379,11 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
                 # request a range, do a regular get request in order to get
                 # any properties.
                 try:
-                    _, response = await self._clients.blob.download(
+                    _, response = cast(Tuple[Optional[Any], Any], await self._clients.blob.download(
                         validate_content=self._validate_content,
                         data_stream_total=0,
                         download_stream_current=0,
-                        **self._request_options)
+                        **self._request_options))
                 except HttpResponseError as e:
                     process_storage_error(e)

@@ -383,6 +393,18 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
             else:
                 process_storage_error(error)

+        if self.size == 0:
+            self._current_content = b""
+        else:
+            self._current_content = await process_content(
+                response,
+                self._initial_offset[0],
+                self._initial_offset[1],
+                self._encryption_options
+            )
+        self._download_offset += len(self._current_content)
+        self._raw_download_offset += response.content_length
+
         # get page ranges to optimize downloading sparse page blob
         if response.properties.blob_type == 'PageBlob':
             try:
@@ -391,22 +413,18 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
             except HttpResponseError:
                 pass

-        return response
+        if not self._download_complete and self._request_options.get("modified_access_conditions"):
+            self._request_options["modified_access_conditions"].if_match = response.properties.etag

-    def _get_downloader_start_with_offset(self):
-        # Start where the initial request download ended
-        start = self._initial_range[1] + 1
-        # For encryption V2 only, adjust start to the end of the fetched data rather than download size
-        if self._encryption_options.get("key") is not None or self._encryption_options.get("resolver") is not None:
-            start = (self._start_range or 0) + len(self._current_content)
+        return response

-        # Adjust the start based on any data read past the current content
-        start += (self._offset - len(self._current_content))
-        return start
+    def chunks(self) -> AsyncIterator[bytes]:
+        """
+        Iterate over chunks in the download stream. Note, the iterator returned will
+        iterate over the entire download content, regardless of any data that was
+        previously read.

-    def chunks(self):
-        # type: () -> AsyncIterator[bytes]
-        """Iterate over chunks in the download stream.
+        NOTE: If the stream has been partially read, some data may be re-downloaded by the iterator.

         :returns: An async iterator of the chunks in the download stream.
         :rtype: AsyncIterator[bytes]
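A minimal usage sketch of `chunks()` against the async client (connection string and names are placeholders, not part of the diff):

    import asyncio
    from azure.storage.blob.aio import BlobClient

    async def main():
        blob = BlobClient.from_connection_string(
            "<connection-string>", container_name="<container>", blob_name="<blob>")
        async with blob:
            downloader = await blob.download_blob()
            async for chunk in downloader.chunks():  # yields bytes, chunk by chunk
                print(len(chunk))

    asyncio.run(main())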
@@ -420,79 +438,125 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
                 :dedent: 16
                 :caption: Download a blob using chunks().
         """
-        if self.size == 0 or self._download_complete:
-            iter_downloader = None
-        else:
-            data_end = self._file_size
-            data_start = self._initial_range[1] + 1  # Start where the first download ended
-            # For encryption, adjust start to the end of the fetched data rather than download size
-            if self._encryption_options.get("key") is not None or self._encryption_options.get("resolver") is not None:
-                data_start = (self._start_range or 0) + len(self._current_content)
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. chunks is not supported in text mode.")
+        if self._encoding:
+            warnings.warn("Encoding is ignored with chunks as only bytes are supported.")
+
+        iter_downloader = None
+        # If we still have the first chunk buffered, use it. Otherwise, download all content again
+        if not self._first_chunk or not self._download_complete:
+            if self._first_chunk:
+                start = self._download_start + len(self._current_content)
+                current_progress = len(self._current_content)
+            else:
+                start = self._download_start
+                current_progress = 0
+
+            end = self._download_start + self.size

-            if self._end_range is not None:
-                # Use the length unless it is over the end of the file
-                data_end = min(self._file_size, self._end_range + 1)
             iter_downloader = _AsyncChunkDownloader(
                 client=self._clients.blob,
                 non_empty_ranges=self._non_empty_ranges,
                 total_size=self.size,
                 chunk_size=self._config.max_chunk_get_size,
-                current_progress=self._first_get_size,
-                start_range=data_start,
-                end_range=data_end,
-                stream=None,
-                parallel=False,
+                current_progress=current_progress,
+                start_range=start,
+                end_range=end,
                 validate_content=self._validate_content,
                 encryption_options=self._encryption_options,
                 encryption_data=self._encryption_data,
                 use_location=self._location_mode,
-                **self._request_options)
+                **self._request_options
+            )
+
+        initial_content = self._current_content if self._first_chunk else b''
         return _AsyncChunkIterator(
             size=self.size,
-            content=self._current_content,
+            content=cast(bytes, initial_content),
             downloader=iter_downloader,
             chunk_size=self._config.max_chunk_get_size)

-    async def read(self, size: Optional[int] = -1) -> T:
+    @overload
+    async def read(self, size: int = -1) -> T:
+        ...
+
+    @overload
+    async def read(self, *, chars: Optional[int] = None) -> T:
+        ...
+
+    # pylint: disable-next=too-many-statements,too-many-branches
+    async def read(self, size: int = -1, *, chars: Optional[int] = None) -> T:
         """
-        Read up to size bytes from the stream and return them. If size
-        is unspecified or is -1, all bytes will be read.
+        Read the specified bytes or chars from the stream. If `encoding`
+        was specified on `download_blob`, it is recommended to use the
+        chars parameter to read a specific number of chars to avoid decoding
+        errors. If size/chars is unspecified or negative all bytes will be read.

-        :param Optional[int] size:
+        :param int size:
             The number of bytes to download from the stream. Leave unspecified
-            or set to -1 to download all bytes.
+            or set negative to download all bytes.
+        :keyword Optional[int] chars:
+            The number of chars to download from the stream. Leave unspecified
+            or set negative to download all chars. Note, this can only be used
+            when encoding is specified on `download_blob`.
         :returns:
             The requested data as bytes or a string if encoding was specified. If
             the return value is empty, there is no more data to read.
         :rtype: T
         """
-        if size == -1:
-            return await self.readall()
-        # Empty blob or already read to the end
-        if size == 0 or self._offset >= self.size:
-            return b'' if not self._encoding else ''
-
-        stream = BytesIO()
-        remaining_size = size
-
-        # Start by reading from current_content if there is data left
-        if self._offset < len(self._current_content):
-            start = self._offset
-            length = min(remaining_size, len(self._current_content) - self._offset)
-            read = stream.write(self._current_content[start:start + length])
-
-            remaining_size -= read
-            self._offset += read
-            if self._progress_hook:
-                await self._progress_hook(self._offset, self.size)
-
-        if remaining_size > 0:
-            start_range = self._get_downloader_start_with_offset()
+        if size > -1 and self._encoding:
+            warnings.warn(
+                "Size parameter specified with text encoding enabled. It is recommended to use chars "
+                "to read a specific number of characters instead."
+            )
+        if size > -1 and chars is not None:
+            raise ValueError("Cannot specify both size and chars.")
+        if not self._encoding and chars is not None:
+            raise ValueError("Must specify encoding to read chars.")
+        if self._text_mode and size > -1:
+            raise ValueError("Stream has been partially read in text mode. Please use chars.")
+        if self._text_mode is False and chars is not None:
+            raise ValueError("Stream has been partially read in bytes mode. Please use size.")

-            # End is the min between the remaining size, the file size, and the end of the specified range
-            end_range = min(start_range + remaining_size, self._file_size)
-            if self._end_range is not None:
-                end_range = min(end_range, self._end_range + 1)
+        # Empty blob or already read to the end
+        if (size == 0 or chars == 0 or
+                (self._download_complete and self._current_content_offset >= len(self._current_content))):
+            return b'' if not self._encoding else ''  # type: ignore [return-value]
+
+        if not self._text_mode and chars is not None and self._encoding is not None:
+            self._text_mode = True
+            self._decoder = codecs.getincrementaldecoder(self._encoding)('strict')
+            self._current_content = self._decoder.decode(
+                cast(bytes, self._current_content), final=self._download_complete)
+        elif self._text_mode is None:
+            self._text_mode = False
+
+        output_stream: Union[BytesIO, StringIO]
+        if self._text_mode:
+            output_stream = StringIO()
+            size = chars if chars else sys.maxsize
+        else:
+            output_stream = BytesIO()
+            size = size if size > 0 else sys.maxsize
+        readall = size == sys.maxsize
+        count = 0
+
+        # Start by reading from current_content
+        start = self._current_content_offset
+        length = min(len(self._current_content) - self._current_content_offset, size - count)
+        read = output_stream.write(self._current_content[start:start + length])  # type: ignore [arg-type]
+
+        count += read
+        self._current_content_offset += read
+        self._read_offset += read
+        await self._check_and_report_progress()
+
+        remaining = size - count
+        if remaining > 0 and not self._download_complete:
+            # Create a downloader than can download the rest of the file
+            start = self._download_start + self._download_offset
+            end = self._download_start + self.size

             parallel = self._max_concurrency > 1
             downloader = _AsyncChunkDownloader(
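A sketch of the two `read()` modes introduced above (`blob` is an assumed `azure.storage.blob.aio.BlobClient`; a stream stays in whichever mode it is first read in):

    # Bytes mode: positional size reads a byte count.
    downloader = await blob.download_blob()
    head = await downloader.read(512)         # first 512 bytes
    rest = await downloader.read()            # everything remaining

    # Text mode: with an encoding, prefer chars to avoid decode errors
    # at chunk boundaries.
    downloader = await blob.download_blob(encoding="utf-8")
    first = await downloader.read(chars=100)  # first 100 characters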
@@ -500,10 +564,10 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
                 non_empty_ranges=self._non_empty_ranges,
                 total_size=self.size,
                 chunk_size=self._config.max_chunk_get_size,
-                current_progress=self._offset,
-                start_range=start_range,
-                end_range=end_range,
-                stream=stream,
+                current_progress=self._read_offset,
+                start_range=start,
+                end_range=end,
+                stream=output_stream,
                 parallel=parallel,
                 validate_content=self._validate_content,
                 encryption_options=self._encryption_options,
@@ -512,43 +576,77 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
                 progress_hook=self._progress_hook,
                 **self._request_options
             )
+            self._first_chunk = False
+
+            # When reading all data, have the downloader read everything into the stream.
+            # Else, read one chunk at a time (using the downloader as an iterator) until
+            # the requested size is reached.
+            chunks_iter = downloader.get_chunk_offsets()
+            if readall and not self._text_mode:
+                running_futures: Any = [
+                    asyncio.ensure_future(downloader.process_chunk(d))
+                    for d in islice(chunks_iter, 0, self._max_concurrency)
+                ]
+                while running_futures:
+                    # Wait for some download to finish before adding a new one
+                    done, running_futures = await asyncio.wait(
+                        running_futures, return_when=asyncio.FIRST_COMPLETED)
+                    try:
+                        for task in done:
+                            task.result()
+                    except HttpResponseError as error:
+                        process_storage_error(error)
+                    try:
+                        for _ in range(0, len(done)):
+                            next_chunk = next(chunks_iter)
+                            running_futures.add(asyncio.ensure_future(downloader.process_chunk(next_chunk)))
+                    except StopIteration:
+                        break
+
+                if running_futures:
+                    # Wait for the remaining downloads to finish
+                    done, _running_futures = await asyncio.wait(running_futures)
+                    try:
+                        for task in done:
+                            task.result()
+                    except HttpResponseError as error:
+                        process_storage_error(error)
+
+                self._complete_read()

-            dl_tasks = downloader.get_chunk_offsets()
-            running_futures = [
-                asyncio.ensure_future(downloader.process_chunk(d))
-                for d in islice(dl_tasks, 0, self._max_concurrency)
-            ]
-            while running_futures:
-                # Wait for some download to finish before adding a new one
-                done, running_futures = await asyncio.wait(
-                    running_futures, return_when=asyncio.FIRST_COMPLETED)
-                try:
-                    for task in done:
-                        task.result()
-                except HttpResponseError as error:
-                    process_storage_error(error)
-                try:
-                    for _ in range(0, len(done)):
-                        next_chunk = next(dl_tasks)
-                        running_futures.add(asyncio.ensure_future(downloader.process_chunk(next_chunk)))
-                except StopIteration:
-                    break
-
-            if running_futures:
-                # Wait for the remaining downloads to finish
-                done, _running_futures = await asyncio.wait(running_futures)
-                try:
-                    for task in done:
-                        task.result()
-                except HttpResponseError as error:
-                    process_storage_error(error)
+            else:
+                while (chunk := next(chunks_iter, None)) is not None and remaining > 0:
+                    chunk_data, content_length = await downloader.yield_chunk(chunk)
+                    self._download_offset += len(chunk_data)
+                    self._raw_download_offset += content_length
+                    if self._text_mode and self._decoder is not None:
+                        self._current_content = self._decoder.decode(chunk_data, final=self._download_complete)
+                    else:
+                        self._current_content = chunk_data
+
+                    if remaining < len(self._current_content):
+                        read = output_stream.write(self._current_content[:remaining])  # type: ignore [arg-type]
+                    else:
+                        read = output_stream.write(self._current_content)  # type: ignore [arg-type]
+
+                    self._current_content_offset = read
+                    self._read_offset += read
+                    remaining -= read
+                    await self._check_and_report_progress()
+
+        data = output_stream.getvalue()
+        if not self._text_mode and self._encoding:
+            try:
+                # This is technically incorrect to do, but we have it for backwards compatibility.
+                data = cast(bytes, data).decode(self._encoding)
+            except UnicodeDecodeError:
+                warnings.warn(
+                    "Encountered a decoding error while decoding blob data from a partial read. "
+                    "Try using the `chars` keyword instead to read in text mode."
+                )
+                raise

-            self._offset += remaining_size
-
-        data = stream.getvalue()
-        if self._encoding:
-            return data.decode(self._encoding)
-        return data
+        return data  # type: ignore [return-value]

     async def readall(self) -> T:
         """
@@ -558,53 +656,7 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
         :returns: The requested data as bytes or a string if encoding was specified.
         :rtype: T
         """
-        stream = BytesIO()
-        await self.readinto(stream)
-        data = stream.getvalue()
-        if self._encoding:
-            return data.decode(self._encoding)
-        return data
-
-    async def content_as_bytes(self, max_concurrency=1):
-        """DEPRECATED: Download the contents of this file.
-
-        This operation is blocking until all data is downloaded.
-
-        This method is deprecated, use func:`readall` instead.
-
-        :param int max_concurrency:
-            The number of parallel connections with which to download.
-        :returns: The contents of the file as bytes.
-        :rtype: bytes
-        """
-        warnings.warn(
-            "content_as_bytes is deprecated, use readall instead",
-            DeprecationWarning
-        )
-        self._max_concurrency = max_concurrency
-        return await self.readall()
-
-    async def content_as_text(self, max_concurrency=1, encoding="UTF-8"):
-        """DEPRECATED: Download the contents of this blob, and decode as text.
-
-        This operation is blocking until all data is downloaded.
-
-        This method is deprecated, use func:`readall` instead.
-
-        :param int max_concurrency:
-            The number of parallel connections with which to download.
-        :param str encoding:
-            Test encoding to decode the downloaded bytes. Default is UTF-8.
-        :returns: The content of the file as a str.
-        :rtype: str
-        """
-        warnings.warn(
-            "content_as_text is deprecated, use readall instead",
-            DeprecationWarning
-        )
-        self._max_concurrency = max_concurrency
-        self._encoding = encoding
-        return await self.readall()
+        return await self.read()

     async def readinto(self, stream: IO[bytes]) -> int:
         """Download the contents of this blob to a stream.
@@ -616,6 +668,11 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
         :returns: The number of bytes read.
         :rtype: int
         """
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. readinto is not supported in text mode.")
+        if self._encoding:
+            warnings.warn("Encoding is ignored with readinto as only byte streams are supported.")
+
         # the stream must be seekable if parallel download is required
         parallel = self._max_concurrency > 1
         if parallel:
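A usage sketch for `readinto()` (assumed names; a file object is seekable, so parallel download is permitted):

    downloader = await blob.download_blob(max_concurrency=4)
    with open("blob_copy.bin", "wb") as fh:
        bytes_read = await downloader.readinto(fh)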
@@ -629,35 +686,34 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
                 raise ValueError(error_message) from exc

         # If some data has been streamed using `read`, only stream the remaining data
-        remaining_size = self.size - self._offset
+        remaining_size = self.size - self._read_offset
         # Already read to the end
         if remaining_size <= 0:
             return 0

-        # Write the content to the user stream if there is data left
-        if self._offset < len(self._current_content):
-            content = self._current_content[self._offset:]
-            stream.write(content)
-            self._offset += len(content)
-            if self._progress_hook:
-                await self._progress_hook(len(content), self.size)
+        # Write the current content to the user stream
+        current_remaining = len(self._current_content) - self._current_content_offset
+        start = self._current_content_offset
+        count = stream.write(cast(bytes, self._current_content[start:start + current_remaining]))

+        self._current_content_offset += count
+        self._read_offset += count
+        if self._progress_hook:
+            await self._progress_hook(self._read_offset, self.size)
+
+        # If all the data was already downloaded/buffered
         if self._download_complete:
             return remaining_size

-        data_end = self._file_size
-        if self._end_range is not None:
-            # Use the length unless it is over the end of the file
-            data_end = min(self._file_size, self._end_range + 1)
-
-        data_start = self._get_downloader_start_with_offset()
+        data_start = self._download_start + self._read_offset
+        data_end = self._download_start + self.size

         downloader = _AsyncChunkDownloader(
             client=self._clients.blob,
             non_empty_ranges=self._non_empty_ranges,
             total_size=self.size,
             chunk_size=self._config.max_chunk_get_size,
-            current_progress=self._offset,
+            current_progress=self._read_offset,
             start_range=data_start,
             end_range=data_end,
             stream=stream,
@@ -667,13 +723,14 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
             encryption_data=self._encryption_data,
             use_location=self._location_mode,
             progress_hook=self._progress_hook,
-            **self._request_options)
+            **self._request_options
+        )

         dl_tasks = downloader.get_chunk_offsets()
-        running_futures = [
+        running_futures = {
             asyncio.ensure_future(downloader.process_chunk(d))
             for d in islice(dl_tasks, 0, self._max_concurrency)
-        ]
+        }
         while running_futures:
             # Wait for some download to finish before adding a new one
             done, running_futures = await asyncio.wait(
@@ -699,8 +756,72 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
             except HttpResponseError as error:
                 process_storage_error(error)

+        self._complete_read()
         return remaining_size

+    def _complete_read(self):
+        """Adjusts all offsets to the end of the download."""
+        self._download_offset = self.size
+        self._raw_download_offset = self.size
+        self._read_offset = self.size
+        self._current_content_offset = len(self._current_content)
+
+    async def _check_and_report_progress(self):
+        """Reports progress if necessary."""
+        # Only report progress at the end of each chunk and use download_offset to always report
+        # progress in terms of (approximate) byte count.
+        if self._progress_hook and self._current_content_offset == len(self._current_content):
+            await self._progress_hook(self._download_offset, self.size)
+
+    async def content_as_bytes(self, max_concurrency=1):
+        """DEPRECATED: Download the contents of this file.
+
+        This operation is blocking until all data is downloaded.
+
+        This method is deprecated, use func:`readall` instead.
+
+        :param int max_concurrency:
+            The number of parallel connections with which to download.
+        :returns: The contents of the file as bytes.
+        :rtype: bytes
+        """
+        warnings.warn(
+            "content_as_bytes is deprecated, use readall instead",
+            DeprecationWarning
+        )
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. "
+                             "content_as_bytes is not supported in text mode.")
+
+        self._max_concurrency = max_concurrency
+        return await self.readall()
+
+    async def content_as_text(self, max_concurrency=1, encoding="UTF-8"):
+        """DEPRECATED: Download the contents of this blob, and decode as text.
+
+        This operation is blocking until all data is downloaded.
+
+        This method is deprecated, use func:`readall` instead.
+
+        :param int max_concurrency:
+            The number of parallel connections with which to download.
+        :param str encoding:
+            Test encoding to decode the downloaded bytes. Default is UTF-8.
+        :returns: The content of the file as a str.
+        :rtype: str
+        """
+        warnings.warn(
+            "content_as_text is deprecated, use readall instead",
+            DeprecationWarning
+        )
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. "
+                             "content_as_text is not supported in text mode.")
+
+        self._max_concurrency = max_concurrency
+        self._encoding = encoding
+        return await self.readall()
+
     async def download_to_stream(self, stream, max_concurrency=1):
         """DEPRECATED: Download the contents of this blob to a stream.

@@ -719,6 +840,10 @@ class StorageStreamDownloader(Generic[T]): # pylint: disable=too-many-instance-
             "download_to_stream is deprecated, use readinto instead",
             DeprecationWarning
         )
+        if self._text_mode:
+            raise ValueError("Stream has been partially read in text mode. "
+                             "download_to_stream is not supported in text mode.")
+
         self._max_concurrency = max_concurrency
         await self.readinto(stream)
         return self.properties
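For reference, the non-deprecated equivalent of `download_to_stream()` is `readinto()` plus the `properties` attribute (a sketch with assumed names):

    downloader = await blob.download_blob()
    with open("out.bin", "wb") as fh:
        await downloader.readinto(fh)
    properties = downloader.properties  # what download_to_stream returned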