deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/aws/s3u.py
CHANGED
@@ -1,51 +1,58 @@
-import ray
-import deltacat.aws.clients as aws_utils
 import logging
 import multiprocessing
-import s3fs
-import pyarrow as pa
-
 from functools import partial
+from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4

-
-
+import pyarrow as pa
+import ray
+import s3fs
+from boto3.resources.base import ServiceResource
+from botocore.client import BaseClient
+from botocore.exceptions import ClientError
 from ray.data.block import Block, BlockAccessor, BlockMetadata
+from ray.data.datasource import BlockWritePathProvider
+from ray.types import ObjectRef
+from tenacity import (
+    Retrying,
+    retry_if_exception_type,
+    retry_if_not_exception_type,
+    stop_after_delay,
+    wait_random_exponential,
+)

+import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.storage import LocalTable, LocalDataset, DistributedDataset, \
-    Manifest, ManifestEntry, ManifestEntryList
 from deltacat.aws.constants import TIMEOUT_ERROR_CODES
-from deltacat.exceptions import
-from deltacat.
-
-
-
+from deltacat.exceptions import NonRetryableError, RetryableError
+from deltacat.storage import (
+    DistributedDataset,
+    LocalDataset,
+    LocalTable,
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+)
+from deltacat.types.media import ContentEncoding, ContentType, TableType
+from deltacat.types.tables import (
+    TABLE_CLASS_TO_SIZE_FUNC,
+    TABLE_TYPE_TO_READER_FUNC,
+    get_table_length,
+)
 from deltacat.utils.common import ReadKwargsProvider

-from boto3.resources.base import ServiceResource
-from botocore.client import BaseClient
-from botocore.exceptions import ClientError
-from tenacity import Retrying
-from tenacity import wait_random_exponential
-from tenacity import stop_after_delay
-from tenacity import retry_if_exception_type, retry_if_not_exception_type
-
-from typing import Any, Callable, Dict, List, Optional, Generator, Union
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

-# TODO(raghumdani): refactor redshift datasource to reuse the
+# TODO(raghumdani): refactor redshift datasource to reuse the
 # same module for writing output files.
+
+
 class CapturedBlockWritePaths:
     def __init__(self):
         self._write_paths: List[str] = []
         self._block_refs: List[ObjectRef[Block]] = []

-    def extend(
-            self,
-            write_paths: List[str],
-            block_refs: List[ObjectRef[Block]]):
+    def extend(self, write_paths: List[str], block_refs: List[ObjectRef[Block]]):
         try:
             iter(write_paths)
         except TypeError:
@@ -70,6 +77,7 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
     """Block write path provider implementation that writes each
     dataset block out to a file of the form: {base_path}/{uuid}
     """
+
     def __init__(self, capture_object: CapturedBlockWritePaths):
         self.write_paths: List[str] = []
         self.block_refs: List[ObjectRef[Block]] = []
@@ -83,14 +91,15 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):
         )

     def _get_write_path_for_block(
-
-
-
-
-
-
-
-
+        self,
+        base_path: str,
+        *,
+        filesystem: Optional[pa.filesystem.FileSystem] = None,
+        dataset_uuid: Optional[str] = None,
+        block: Optional[ObjectRef[Block]] = None,
+        block_index: Optional[int] = None,
+        file_format: Optional[str] = None,
+    ) -> str:
         write_path = f"{base_path}/{str(uuid4())}"
         self.write_paths.append(write_path)
         if block:
@@ -99,24 +108,18 @@ class UuidBlockWritePathProvider(BlockWritePathProvider):


 class S3Url:
-    def __init__(
-            self,
-            url: str):
+    def __init__(self, url: str):

         from urllib.parse import urlparse

-        self._parsed = urlparse(
-            url,
-            allow_fragments=False  # support '#' in path
-        )
+        self._parsed = urlparse(url, allow_fragments=False)  # support '#' in path
         if not self._parsed.scheme:  # support paths w/o 's3://' scheme
             url = f"s3://{url}"
             self._parsed = urlparse(url, allow_fragments=False)
         if self._parsed.query:  # support '?' in path
-            self.key =
-                f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
+            self.key = f"{self._parsed.path.lstrip('/')}?{self._parsed.query}"
         else:
-            self.key = self._parsed.path.lstrip(
+            self.key = self._parsed.path.lstrip("/")
         self.bucket = self._parsed.netloc
         self.url = self._parsed.geturl()

@@ -125,9 +128,7 @@ def parse_s3_url(url: str) -> S3Url:
     return S3Url(url)


-def s3_resource_cache(
-        region: Optional[str],
-        **kwargs) -> ServiceResource:
+def s3_resource_cache(region: Optional[str], **kwargs) -> ServiceResource:

     return aws_utils.resource_cache(
         "s3",
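As a side note on the consolidated S3Url parsing above, here is a minimal usage sketch; the bucket and key values are hypothetical:

```python
from deltacat.aws.s3u import parse_s3_url

# Hypothetical path; S3Url accepts URLs with or without the "s3://" scheme
# and preserves '#' and '?' characters in the key.
url = parse_s3_url("my-bucket/some/prefix/data.parquet")
assert url.bucket == "my-bucket"
assert url.key == "some/prefix/data.parquet"
assert url.url == "s3://my-bucket/some/prefix/data.parquet"
```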
@@ -136,36 +137,20 @@
     )


-def s3_client_cache(
-        region: Optional[str],
-        **kwargs) -> BaseClient:
+def s3_client_cache(region: Optional[str], **kwargs) -> BaseClient:

-    return aws_utils.client_cache(
-        "s3",
-        region,
-        **kwargs
-    )
+    return aws_utils.client_cache("s3", region, **kwargs)


-def get_object_at_url(
-        url: str,
-        **s3_client_kwargs) -> Dict[str, Any]:
+def get_object_at_url(url: str, **s3_client_kwargs) -> Dict[str, Any]:

-    s3 = s3_client_cache(
-        None,
-        **s3_client_kwargs)
+    s3 = s3_client_cache(None, **s3_client_kwargs)

     parsed_s3_url = parse_s3_url(url)
-    return s3.get_object(
-        Bucket=parsed_s3_url.bucket,
-        Key=parsed_s3_url.key
-    )
+    return s3.get_object(Bucket=parsed_s3_url.bucket, Key=parsed_s3_url.key)


-def delete_files_by_prefix(
-        bucket: str,
-        prefix: str,
-        **s3_client_kwargs) -> None:
+def delete_files_by_prefix(bucket: str, prefix: str, **s3_client_kwargs) -> None:

     s3 = s3_resource_cache(None, **s3_client_kwargs)
     bucket = s3.Bucket(bucket)
@@ -189,14 +174,10 @@ def get_path_from_object(bucket, obj):


 def filter_objects_by_prefix(
-
-
-        **s3_client_kwargs) -> Generator[Dict[str, Any], None, None]:
+    bucket: str, prefix: str, **s3_client_kwargs
+) -> Generator[Dict[str, Any], None, None]:

-    s3 = s3_client_cache(
-        None,
-        **s3_client_kwargs
-    )
+    s3 = s3_client_cache(None, **s3_client_kwargs)
     params = {"Bucket": bucket, "Prefix": prefix}
     more_objects_to_list = True
     while more_objects_to_list:
@@ -209,14 +190,15 @@ def filter_objects_by_prefix(


 def read_file(
-
-
-
-
-
-
-
-
+    s3_url: str,
+    content_type: ContentType,
+    content_encoding: ContentEncoding = ContentEncoding.IDENTITY,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    **s3_client_kwargs,
+) -> LocalTable:

     reader = TABLE_TYPE_TO_READER_FUNC[table_type.value]
     try:
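To illustrate the reformatted read_file signature above, a hedged usage sketch; the S3 URL is hypothetical and credentials are assumed to be configured:

```python
from deltacat.aws.s3u import read_file
from deltacat.types.media import ContentType, TableType

# Hypothetical object; by default the file is decoded into a pyarrow table.
table = read_file(
    "s3://my-bucket/some/prefix/data.parquet",
    content_type=ContentType.PARQUET,
    table_type=TableType.PYARROW,
)
print(table.num_rows)
```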
@@ -227,34 +209,33 @@
             column_names,
             include_columns,
             file_reader_kwargs_provider,
-            **s3_client_kwargs
+            **s3_client_kwargs,
         )
         return table
     except ClientError as e:
         if e.response["Error"]["Code"] in TIMEOUT_ERROR_CODES:
             # Timeout error not caught by botocore
-            raise RetryableError(f"Retry table download from: {s3_url}")
-
-        raise NonRetryableError(f"Failed table download from: {s3_url}") \
-            from e
+            raise RetryableError(f"Retry table download from: {s3_url}") from e
+        raise NonRetryableError(f"Failed table download from: {s3_url}") from e


 def upload_sliced_table(
-
-
-
-
-
-
-
-
-
+    table: Union[LocalTable, DistributedDataset],
+    s3_url_prefix: str,
+    s3_file_system: s3fs.S3FileSystem,
+    max_records_per_entry: Optional[int],
+    s3_table_writer_func: Callable,
+    table_slicer_func: Callable,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    content_type: ContentType = ContentType.PARQUET,
+    **s3_client_kwargs,
+) -> ManifestEntryList:

     # @retry decorator can't be pickled by Ray, so wrap upload in Retrying
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(30 * 60),
-        retry=retry_if_exception_type(RetryableError)
+        retry=retry_if_exception_type(RetryableError),
     )

     manifest_entries = ManifestEntryList()
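For context on the Retrying configuration above, a minimal sketch of the tenacity pattern used here; flaky_upload is a hypothetical stand-in for upload_table:

```python
from tenacity import (
    Retrying,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)

from deltacat.exceptions import RetryableError


def flaky_upload() -> str:
    # Hypothetical stand-in for upload_table; raise RetryableError to trigger a retry.
    return "ok"


# A Retrying instance is callable: pass the wrapped function and its arguments.
retrying = Retrying(
    wait=wait_random_exponential(multiplier=1, max=60),
    stop=stop_after_delay(30 * 60),
    retry=retry_if_exception_type(RetryableError),
)
result = retrying(flaky_upload)
```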
@@ -270,14 +251,11 @@ def upload_sliced_table(
             s3_table_writer_func,
             s3_table_writer_kwargs,
             content_type,
-            **s3_client_kwargs
+            **s3_client_kwargs,
         )
     else:
         # iteratively write table slices
-        table_slices = table_slicer_func(
-            table,
-            max_records_per_entry
-        )
+        table_slices = table_slicer_func(table, max_records_per_entry)
         for table_slice in table_slices:
             slice_entries = retrying(
                 upload_table,
@@ -287,7 +265,7 @@
                 s3_table_writer_func,
                 s3_table_writer_kwargs,
                 content_type,
-                **s3_client_kwargs
+                **s3_client_kwargs,
             )
             manifest_entries.extend(slice_entries)

@@ -303,15 +281,17 @@ def _block_metadata(block: Block) -> BlockMetadata:


 def _get_metadata(
-
-
-
+    table: Union[LocalTable, DistributedDataset],
+    write_paths: List[str],
+    block_refs: List[ObjectRef[Block]],
+) -> List[BlockMetadata]:
     metadata: List[BlockMetadata] = []
     if not block_refs:
         # this must be a local table - ensure it was written to only 1 file
-        assert len(write_paths) == 1,
-            f"Expected table of type '{type(table)}' to be written to 1 "
+        assert len(write_paths) == 1, (
+            f"Expected table of type '{type(table)}' to be written to 1 "
             f"file, but found {len(write_paths)} files."
+        )
         table_size = None
         table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
         if table_size_func:
@@ -333,23 +313,27 @@ def _get_metadata(
         # metadata = dataset._blocks.get_metadata()
         # ray 2.0.0dev
         metadata = table._plan.execute().get_metadata()
-        if
-
-
-
-
+        if (
+            not metadata
+            or metadata[0].size_bytes is None
+            or metadata[0].num_rows is None
+        ):
+            metadata_futures = [
+                _block_metadata.remote(block_ref) for block_ref in block_refs
+            ]
             metadata = ray.get(metadata_futures)
     return metadata


 def upload_table(
-
-
-
-
-
-
-
+    table: Union[LocalTable, DistributedDataset],
+    s3_base_url: str,
+    s3_file_system: s3fs.S3FileSystem,
+    s3_table_writer_func: Callable,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    content_type: ContentType = ContentType.PARQUET,
+    **s3_client_kwargs,
+) -> ManifestEntryList:
     """
     Writes the given table to 1 or more S3 files and return Redshift
     manifest entries describing the uploaded files.
@@ -365,7 +349,7 @@ def upload_table(
         s3_file_system,
         block_write_path_provider,
         content_type.value,
-        **s3_table_writer_kwargs
+        **s3_table_writer_kwargs,
     )
     # TODO: Add a proper fix for block_refs and write_paths not persisting in Ray actors
     del block_write_path_provider
@@ -385,37 +369,42 @@ def upload_table(
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchKey":
             # s3fs may swallow S3 errors - we were probably throttled
-            raise RetryableError(f"Retry table upload to: {s3_url}")
-
-        raise NonRetryableError(f"Failed table upload to: {s3_url}") \
-            from e
+            raise RetryableError(f"Retry table upload to: {s3_url}") from e
+        raise NonRetryableError(f"Failed table upload to: {s3_url}") from e
     return manifest_entries


 def download_manifest_entry(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    manifest_entry: ManifestEntry,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    content_type: Optional[ContentType] = None,
+    content_encoding: Optional[ContentEncoding] = None,
+) -> LocalTable:
+
+    s3_client_kwargs = (
+        {
+            "aws_access_key_id": token_holder["accessKeyId"],
+            "aws_secret_access_key": token_holder["secretAccessKey"],
+            "aws_session_token": token_holder["sessionToken"],
+        }
+        if token_holder
+        else {}
+    )
     if not content_type:
         content_type = manifest_entry.meta.content_type
-        assert
-
+        assert (
+            content_type
+        ), f"Unknown content type for manifest entry: {manifest_entry}"
     content_type = ContentType(content_type)
     if not content_encoding:
         content_encoding = manifest_entry.meta.content_encoding
-        assert
-
+        assert (
+            content_encoding
+        ), f"Unknown content encoding for manifest entry: {manifest_entry}"
     content_encoding = ContentEncoding(content_encoding)
     s3_url = manifest_entry.uri
     if s3_url is None:
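Related to the credential mapping introduced above in download_manifest_entry, a sketch of the expected token_holder shape; the values are placeholders:

```python
# Placeholder credentials; when a token_holder is provided, download_manifest_entry
# maps these keys onto the aws_access_key_id, aws_secret_access_key, and
# aws_session_token client kwargs.
token_holder = {
    "accessKeyId": "AKIA...",
    "secretAccessKey": "<secret>",
    "sessionToken": "<token>",
}
```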
@@ -424,7 +413,7 @@ def download_manifest_entry(
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(30 * 60),
-        retry=retry_if_not_exception_type(NonRetryableError)
+        retry=retry_if_not_exception_type(NonRetryableError),
     )
     table = retrying(
         read_file,
@@ -441,30 +430,36 @@ def download_manifest_entry(


 def _download_manifest_entries(
-
-
-
-
-
-
-
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:

     return [
-        download_manifest_entry(
-
+        download_manifest_entry(
+            e,
+            token_holder,
+            table_type,
+            column_names,
+            include_columns,
+            file_reader_kwargs_provider,
+        )
         for e in manifest.entries
     ]


 def _download_manifest_entries_parallel(
-
-
-
-
-
-
-
-
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:

     tables = []
     pool = multiprocessing.Pool(max_parallelism)
@@ -482,14 +477,14 @@ def _download_manifest_entries_parallel(


 def download_manifest_entries(
-
-
-
-
-
-
-
-
+    manifest: Manifest,
+    token_holder: Optional[Dict[str, Any]] = None,
+    table_type: TableType = TableType.PYARROW,
+    max_parallelism: Optional[int] = 1,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> LocalDataset:

     if max_parallelism and max_parallelism <= 1:
         return _download_manifest_entries(
@@ -512,10 +507,7 @@ def download_manifest_entries(
         )


-def upload(
-        s3_url: str,
-        body,
-        **s3_client_kwargs) -> Dict[str, Any]:
+def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:

     # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
@@ -528,9 +520,8 @@ def upload(


 def download(
-
-
-        **s3_client_kwargs) -> Optional[Dict[str, Any]]:
+    s3_url: str, fail_if_not_found: bool = True, **s3_client_kwargs
+) -> Optional[Dict[str, Any]]:

     # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
@@ -544,15 +535,13 @@ def download(
         if fail_if_not_found:
             raise
         else:
-            if e.response[
-                if e.response[
+            if e.response["Error"]["Code"] != "404":
+                if e.response["Error"]["Code"] != "NoSuchKey":
                     raise
-            logger.info(
-                f"file not found: {s3_url}")
+            logger.info(f"file not found: {s3_url}")
     except s3.exceptions.NoSuchKey:
         if fail_if_not_found:
             raise
         else:
-            logger.info(
-                f"file not found: {s3_url}")
+            logger.info(f"file not found: {s3_url}")
     return None
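Finally, a small usage sketch of the download() error handling shown above; the S3 URL is hypothetical:

```python
from deltacat.aws import s3u

# With fail_if_not_found=False, a missing key is logged and None is returned
# instead of raising.
response = s3u.download("s3://my-bucket/missing-key.json", fail_if_not_found=False)
if response is None:
    print("object not found")
```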