deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/aws/s3u.py CHANGED
@@ -1,51 +1,58 @@
- import ray
- import deltacat.aws.clients as aws_utils
  import logging
  import multiprocessing
- import s3fs
- import pyarrow as pa
-
  from functools import partial
+ from typing import Any, Callable, Dict, Generator, List, Optional, Union
  from uuid import uuid4

- from ray.types import ObjectRef
- from ray.data.datasource import BlockWritePathProvider
+ import pyarrow as pa
+ import ray
+ import s3fs
+ from boto3.resources.base import ServiceResource
+ from botocore.client import BaseClient
+ from botocore.exceptions import ClientError
  from ray.data.block import Block, BlockAccessor, BlockMetadata
+ from ray.data.datasource import BlockWritePathProvider
+ from ray.types import ObjectRef
+ from tenacity import (
+     Retrying,
+     retry_if_exception_type,
+     retry_if_not_exception_type,
+     stop_after_delay,
+     wait_random_exponential,
+ )

+ import deltacat.aws.clients as aws_utils
  from deltacat import logs
- from deltacat.storage import LocalTable, LocalDataset, DistributedDataset, \
-     Manifest, ManifestEntry, ManifestEntryList
  from deltacat.aws.constants import TIMEOUT_ERROR_CODES
- from deltacat.exceptions import RetryableError, NonRetryableError
- from deltacat.types.media import ContentType, ContentEncoding
- from deltacat.types.tables import TABLE_TYPE_TO_READER_FUNC, \
-     TABLE_CLASS_TO_SIZE_FUNC, get_table_length
- from deltacat.types.media import TableType
+ from deltacat.exceptions import NonRetryableError, RetryableError
+ from deltacat.storage import (
+     DistributedDataset,
+     LocalDataset,
+     LocalTable,
+     Manifest,
+     ManifestEntry,
+     ManifestEntryList,
+ )
+ from deltacat.types.media import ContentEncoding, ContentType, TableType
+ from deltacat.types.tables import (
+     TABLE_CLASS_TO_SIZE_FUNC,
+     TABLE_TYPE_TO_READER_FUNC,
+     get_table_length,
+ )
  from deltacat.utils.common import ReadKwargsProvider

- from boto3.resources.base import ServiceResource
- from botocore.client import BaseClient
- from botocore.exceptions import ClientError
- from tenacity import Retrying
- from tenacity import wait_random_exponential
- from tenacity import stop_after_delay
- from tenacity import retry_if_exception_type, retry_if_not_exception_type
-
- from typing import Any, Callable, Dict, List, Optional, Generator, Union
-
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

- # TODO(raghumdani): refactor redshift datasource to reuse the
+ # TODO(raghumdani): refactor redshift datasource to reuse the
  # same module for writing output files.
+
+
  class CapturedBlockWritePaths:
      def __init__(self):
          self._write_paths: List[str] = []
          self._block_refs: List[ObjectRef[Block]] = []

-     def extend(
-             self,
-             write_paths: List[str],
-             block_refs: List[ObjectRef[Block]]):
+     def extend(self, write_paths: List[str], block_refs: List[ObjectRef[Block]]):
          try:
              iter(write_paths)
          except TypeError:
@@ -70,6 +77,7 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
      """Block write path provider implementation that writes each
      dataset block out to a file of the form: {base_path}/{uuid}
      """
+
      def __init__(self, capture_object: CapturedBlockWritePaths):
          self.write_paths: List[str] = []
          self.block_refs: List[ObjectRef[Block]] = []
@@ -83,14 +91,15 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
              )

      def _get_write_path_for_block(
-             self,
-             base_path: str,
-             *,
-             filesystem: Optional[pa.filesystem.FileSystem] = None,
-             dataset_uuid: Optional[str] = None,
-             block: Optional[ObjectRef[Block]] = None,
-             block_index: Optional[int] = None,
-             file_format: Optional[str] = None) -> str:
+         self,
+         base_path: str,
+         *,
+         filesystem: Optional[pa.filesystem.FileSystem] = None,
+         dataset_uuid: Optional[str] = None,
+         block: Optional[ObjectRef[Block]] = None,
+         block_index: Optional[int] = None,
+         file_format: Optional[str] = None,
+     ) -> str:
          write_path = f"{base_path}/{str(uuid4())}"
          self.write_paths.append(write_path)
          if block:
@@ -99,24 +108,18 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):


  class S3Url:
-     def __init__(
-             self,
-             url: str):
+     def __init__(self, url: str):

          from urllib.parse import urlparse

-         self._parsed = urlparse(
-             url,
-             allow_fragments=False  # support '#' in path
-         )
+         self._parsed = urlparse(url, allow_fragments=False)  # support '#' in path
          if not self._parsed.scheme:  # support paths w/o 's3://' scheme
              url = f"s3://{url}"
              self._parsed = urlparse(url, allow_fragments=False)
          if self._parsed.query:  # support '?' in path
-             self.key = \
-                 f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
+             self.key = f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
          else:
-             self.key = self._parsed.path.lstrip('/')
+             self.key = self._parsed.path.lstrip("/")
          self.bucket = self._parsed.netloc
          self.url = self._parsed.geturl()

@@ -125,9 +128,7 @@ def parse_s3_url(url: str) -> S3Url:
      return S3Url(url)


- def s3_resource_cache(
-         region: Optional[str],
-         **kwargs) -> ServiceResource:
+ def s3_resource_cache(region: Optional[str], **kwargs) -> ServiceResource:

      return aws_utils.resource_cache(
          "s3",
@@ -136,36 +137,20 @@ def s3_resource_cache(
      )


- def s3_client_cache(
-         region: Optional[str],
-         **kwargs) -> BaseClient:
+ def s3_client_cache(region: Optional[str], **kwargs) -> BaseClient:

-     return aws_utils.client_cache(
-         "s3",
-         region,
-         **kwargs
-     )
+     return aws_utils.client_cache("s3", region, **kwargs)


- def get_object_at_url(
-         url: str,
-         **s3_client_kwargs) -> Dict[str, Any]:
+ def get_object_at_url(url: str, **s3_client_kwargs) -> Dict[str, Any]:

-     s3 = s3_client_cache(
-         None,
-         **s3_client_kwargs)
+     s3 = s3_client_cache(None, **s3_client_kwargs)

      parsed_s3_url = parse_s3_url(url)
-     return s3.get_object(
-         Bucket=parsed_s3_url.bucket,
-         Key=parsed_s3_url.key
-     )
+     return s3.get_object(Bucket=parsed_s3_url.bucket, Key=parsed_s3_url.key)


- def delete_files_by_prefix(
-         bucket: str,
-         prefix: str,
-         **s3_client_kwargs) -> None:
+ def delete_files_by_prefix(bucket: str, prefix: str, **s3_client_kwargs) -> None:

      s3 = s3_resource_cache(None, **s3_client_kwargs)
      bucket = s3.Bucket(bucket)
@@ -189,14 +174,10 @@ def get_path_from_object(bucket, obj):


  def filter_objects_by_prefix(
-         bucket: str,
-         prefix: str,
-         **s3_client_kwargs) -> Generator[Dict[str, Any], None, None]:
+     bucket: str, prefix: str, **s3_client_kwargs
+ ) -> Generator[Dict[str, Any], None, None]:

-     s3 = s3_client_cache(
-         None,
-         **s3_client_kwargs
-     )
+     s3 = s3_client_cache(None, **s3_client_kwargs)
      params = {"Bucket": bucket, "Prefix": prefix}
      more_objects_to_list = True
      while more_objects_to_list:
@@ -209,14 +190,15 @@ def filter_objects_by_prefix(


  def read_file(
-         s3_url: str,
-         content_type: ContentType,
-         content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
-         table_type: TableType = TableType.PYARROW,
-         column_names: Optional[List[str]] = None,
-         include_columns: Optional[List[str]] = None,
-         file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-         **s3_client_kwargs) -> LocalTable:
+     s3_url: str,
+     content_type: ContentType,
+     content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
+     table_type: TableType = TableType.PYARROW,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     **s3_client_kwargs,
+ ) -> LocalTable:

      reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
      try:
@@ -227,34 +209,33 @@ def read_file(
              column_names,
              include_columns,
              file_reader_kwargs_provider,
-             **s3_client_kwargs
+             **s3_client_kwargs,
          )
          return table
      except ClientError as e:
          if e.response["Error"]["Code"] in TIMEOUT_ERROR_CODES:
              # Timeout error not caught by botocore
-             raise RetryableError(f"Retry table download from: {s3_url}") \
-                 from e
-         raise NonRetryableError(f"Failed table download from: {s3_url}") \
-             from e
+             raise RetryableError(f"Retry table download from: {s3_url}") from e
+         raise NonRetryableError(f"Failed table download from: {s3_url}") from e


  def upload_sliced_table(
-         table: Union[LocalTable, DistributedDataset],
-         s3_url_prefix: str,
-         s3_file_system: s3fs.S3FileSystem,
-         max_records_per_entry: Optional[int],
-         s3_table_writer_func: Callable,
-         table_slicer_func: Callable,
-         s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
-         content_type: ContentType = ContentType.PARQUET,
-         **s3_client_kwargs) -> ManifestEntryList:
+     table: Union[LocalTable, DistributedDataset],
+     s3_url_prefix: str,
+     s3_file_system: s3fs.S3FileSystem,
+     max_records_per_entry: Optional[int],
+     s3_table_writer_func: Callable,
+     table_slicer_func: Callable,
+     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+     content_type: ContentType = ContentType.PARQUET,
+     **s3_client_kwargs,
+ ) -> ManifestEntryList:

      # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
      retrying = Retrying(
          wait=wait_random_exponential(multiplier=1, max=60),
          stop=stop_after_delay(30 * 60),
-         retry=retry_if_exception_type(RetryableError)
+         retry=retry_if_exception_type(RetryableError),
      )

      manifest_entries = ManifestEntryList()
@@ -270,14 +251,11 @@ def upload_sliced_table(
              s3_table_writer_func,
              s3_table_writer_kwargs,
              content_type,
-             **s3_client_kwargs
+             **s3_client_kwargs,
          )
      else:
          # iteratively write table slices
-         table_slices = table_slicer_func(
-             table,
-             max_records_per_entry
-         )
+         table_slices = table_slicer_func(table, max_records_per_entry)
          for table_slice in table_slices:
              slice_entries = retrying(
                  upload_table,
@@ -287,7 +265,7 @@ def upload_sliced_table(
                  s3_table_writer_func,
                  s3_table_writer_kwargs,
                  content_type,
-                 **s3_client_kwargs
+                 **s3_client_kwargs,
              )
              manifest_entries.extend(slice_entries)

@@ -303,15 +281,17 @@ def _block_metadata(block: Block) -> BlockMetadata:


  def _get_metadata(
-         table: Union[LocalTable, DistributedDataset],
-         write_paths: List[str],
-         block_refs: List[ObjectRef[Block]])-> List[BlockMetadata]:
+     table: Union[LocalTable, DistributedDataset],
+     write_paths: List[str],
+     block_refs: List[ObjectRef[Block]],
+ ) -> List[BlockMetadata]:
      metadata: List[BlockMetadata] = []
      if not block_refs:
          # this must be a local table - ensure it was written to only 1 file
-         assert len(write_paths) == 1, \
-             f"Expected table of type '{type(table)}' to be written to 1 " \
+         assert len(write_paths) == 1, (
+             f"Expected table of type '{type(table)}' to be written to 1 "
              f"file, but found {len(write_paths)} files."
+         )
          table_size = None
          table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
          if table_size_func:
@@ -333,23 +313,27 @@ def _get_metadata(
          # metadata = dataset._blocks.get_metadata()
          # ray 2.0.0dev
          metadata = table._plan.execute().get_metadata()
-         if not metadata or metadata[0].size_bytes is None or \
-                 metadata[0].num_rows is None:
-             metadata_futures = [_block_metadata.remote(block_ref)
-                                 for block_ref
-                                 in block_refs]
+         if (
+             not metadata
+             or metadata[0].size_bytes is None
+             or metadata[0].num_rows is None
+         ):
+             metadata_futures = [
+                 _block_metadata.remote(block_ref) for block_ref in block_refs
+             ]
              metadata = ray.get(metadata_futures)
      return metadata


  def upload_table(
-         table: Union[LocalTable, DistributedDataset],
-         s3_base_url: str,
-         s3_file_system: s3fs.S3FileSystem,
-         s3_table_writer_func: Callable,
-         s3_table_writer_kwargs: Optional[Dict[str, Any]],
-         content_type: ContentType = ContentType.PARQUET,
-         **s3_client_kwargs) -> ManifestEntryList:
+     table: Union[LocalTable, DistributedDataset],
+     s3_base_url: str,
+     s3_file_system: s3fs.S3FileSystem,
+     s3_table_writer_func: Callable,
+     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+     content_type: ContentType = ContentType.PARQUET,
+     **s3_client_kwargs,
+ ) -> ManifestEntryList:
      """
      Writes the given table to 1 or more S3 files and return Redshift
      manifest entries describing the uploaded files.
@@ -365,7 +349,7 @@ def upload_table(
          s3_file_system,
          block_write_path_provider,
          content_type.value,
-         **s3_table_writer_kwargs
+         **s3_table_writer_kwargs,
      )
      # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
      del block_write_path_provider
@@ -385,37 +369,42 @@ def upload_table(
      except ClientError as e:
          if e.response["Error"]["Code"] == "NoSuchKey":
              # s3fs may swallow S3 errors - we were probably throttled
-             raise RetryableError(f"Retry table upload to: {s3_url}") \
-                 from e
-         raise NonRetryableError(f"Failed table upload to: {s3_url}") \
-             from e
+             raise RetryableError(f"Retry table upload to: {s3_url}") from e
+         raise NonRetryableError(f"Failed table upload to: {s3_url}") from e
      return manifest_entries


  def download_manifest_entry(
-         manifest_entry: ManifestEntry,
-         token_holder: Optional[Dict[str, Any]] = None,
-         table_type: TableType = TableType.PYARROW,
-         column_names: Optional[List[str]] = None,
-         include_columns: Optional[List[str]] = None,
-         file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-         content_type: Optional[ContentType] = None,
-         content_encoding: Optional[ContentEncoding] = None) -> LocalTable:
-
-     s3_client_kwargs = {
-         "aws_access_key_id": token_holder["accessKeyId"],
-         "aws_secret_access_key": token_holder["secretAccessKey"],
-         "aws_session_token": token_holder["sessionToken"]
-     } if token_holder else {}
+     manifest_entry: ManifestEntry,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     content_type: Optional[ContentType] = None,
+     content_encoding: Optional[ContentEncoding] = None,
+ ) -> LocalTable:
+
+     s3_client_kwargs = (
+         {
+             "aws_access_key_id": token_holder["accessKeyId"],
+             "aws_secret_access_key": token_holder["secretAccessKey"],
+             "aws_session_token": token_holder["sessionToken"],
+         }
+         if token_holder
+         else {}
+     )
      if not content_type:
          content_type = manifest_entry.meta.content_type
-         assert content_type, \
-             f"Unknown content type for manifest entry: {manifest_entry}"
+         assert (
+             content_type
+         ), f"Unknown content type for manifest entry: {manifest_entry}"
      content_type = ContentType(content_type)
      if not content_encoding:
          content_encoding = manifest_entry.meta.content_encoding
-         assert content_encoding, \
-             f"Unknown content encoding for manifest entry: {manifest_entry}"
+         assert (
+             content_encoding
+         ), f"Unknown content encoding for manifest entry: {manifest_entry}"
      content_encoding = ContentEncoding(content_encoding)
      s3_url = manifest_entry.uri
      if s3_url is None:
@@ -424,7 +413,7 @@ def download_manifest_entry(
      retrying = Retrying(
          wait=wait_random_exponential(multiplier=1, max=60),
          stop=stop_after_delay(30 * 60),
-         retry=retry_if_not_exception_type(NonRetryableError)
+         retry=retry_if_not_exception_type(NonRetryableError),
      )
      table = retrying(
          read_file,
@@ -441,30 +430,36 @@ def download_manifest_entry(


  def _download_manifest_entries(
-         manifest: Manifest,
-         token_holder: Optional[Dict[str, Any]] = None,
-         table_type: TableType = TableType.PYARROW,
-         column_names: Optional[List[str]] = None,
-         include_columns: Optional[List[str]] = None,
-         file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-         -> LocalDataset:
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ ) -> LocalDataset:

      return [
-         download_manifest_entry(e, token_holder, table_type, column_names,
-                                 include_columns, file_reader_kwargs_provider)
+         download_manifest_entry(
+             e,
+             token_holder,
+             table_type,
+             column_names,
+             include_columns,
+             file_reader_kwargs_provider,
+         )
          for e in manifest.entries
      ]


  def _download_manifest_entries_parallel(
-         manifest: Manifest,
-         token_holder: Optional[Dict[str, Any]] = None,
-         table_type: TableType = TableType.PYARROW,
-         max_parallelism: Optional[int] = None,
-         column_names: Optional[List[str]] = None,
-         include_columns: Optional[List[str]] = None,
-         file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-         -> LocalDataset:
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     max_parallelism: Optional[int] = None,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ ) -> LocalDataset:

      tables = []
      pool = multiprocessing.Pool(max_parallelism)
@@ -482,14 +477,14 @@ def _download_manifest_entries_parallel(


  def download_manifest_entries(
-         manifest: Manifest,
-         token_holder: Optional[Dict[str, Any]] = None,
-         table_type: TableType = TableType.PYARROW,
-         max_parallelism: Optional[int] = 1,
-         column_names: Optional[List[str]] = None,
-         include_columns: Optional[List[str]] = None,
-         file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None) \
-         -> LocalDataset:
+     manifest: Manifest,
+     token_holder: Optional[Dict[str, Any]] = None,
+     table_type: TableType = TableType.PYARROW,
+     max_parallelism: Optional[int] = 1,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ ) -> LocalDataset:

      if max_parallelism and max_parallelism <= 1:
          return _download_manifest_entries(
@@ -512,10 +507,7 @@ def download_manifest_entries(
      )


- def upload(
-         s3_url: str,
-         body,
-         **s3_client_kwargs) -> Dict[str, Any]:
+ def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:

      # TODO (pdames): add tenacity retrying
      parsed_s3_url = parse_s3_url(s3_url)
@@ -528,9 +520,8 @@ def upload(


  def download(
-         s3_url: str,
-         fail_if_not_found: bool = True,
-         **s3_client_kwargs) -> Optional[Dict[str, Any]]:
+     s3_url: str, fail_if_not_found: bool = True, **s3_client_kwargs
+ ) -> Optional[Dict[str, Any]]:

      # TODO (pdames): add tenacity retrying
      parsed_s3_url = parse_s3_url(s3_url)
@@ -544,15 +535,13 @@ def download(
          if fail_if_not_found:
              raise
          else:
-             if e.response['Error']['Code'] != "404":
-                 if e.response['Error']['Code'] != 'NoSuchKey':
+             if e.response["Error"]["Code"] != "404":
+                 if e.response["Error"]["Code"] != "NoSuchKey":
                      raise
-             logger.info(
-                 f"file not found: {s3_url}")
+             logger.info(f"file not found: {s3_url}")
      except s3.exceptions.NoSuchKey:
          if fail_if_not_found:
              raise
          else:
-             logger.info(
-                 f"file not found: {s3_url}")
+             logger.info(f"file not found: {s3_url}")
      return None